Example #1
import shelve
from hashlib import md5

# Document, Retriever, and the parser module are project-local
# dependencies assumed to be importable alongside this class.


class LinkAnalyzer(object):
    """Creates abstract documents and feeds their attributes."""

    def __init__(self):
        # create the database file if it does not exist yet
        shelve.open('database1', 'c').close()
        self.term_extractor = parser.ExtractTerms()
        self.retriever = Retriever()

    def analyze(self, url, links):
        """Creates a document for url and sets its outgoing links."""
        self.db = shelve.open('database1', 'w')
        key = md5(url).hexdigest()
        if key in self.db:
            # the document is already in the database: update it in place
            doc = self.db[key]
        else:
            # there is no document for this url yet: create one
            doc = Document(url)
        doc.insertOL(links)
        doc.url = url
        document = open(self.retriever.filename(url)).read()
        doc.all_terms = self.term_extractor.get_terms(document)
        unique_terms = self.term_extractor.get_unique_terms(document)
        doc.unique_terms_freq = self.term_extractor.count_term_frequencies(
            unique_terms, document)
        # reassign so the change is persisted; mutating a value fetched
        # from a shelve does not write it back automatically
        self.db[key] = doc
        # self.extractLinksfromResponse(url, links)
        self.db.close()

    def extractLinksfromResponse(self, url, links):
        """Analyses the incoming links from the response."""
        for link in links:
            key = md5(link).hexdigest()
            if key in self.db:
                doc = self.db[key]
                doc.insertIL(url)
                self.db[key] = doc
            else:
                newDoc = Document(link)
                newDoc.insertIL(url)
                self.db[key] = newDoc
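
A minimal usage sketch, assuming the project-local Document, Retriever, and parser modules are on the path; the URL and link values below are hypothetical:

# hypothetical values for illustration only
analyzer = LinkAnalyzer()
analyzer.analyze('http://example.com/',
                 ['http://example.com/a', 'http://example.com/b'])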
Example #3
import os
import logging
import urllib2

# Retriever is a project-local dependency assumed to be importable
# alongside this class.


class Downloader(object):
    """There are two downloaders:
    download() uses the urllib2 module.
    CDownload() uses curl to fetch the pages, which results in
    faster page downloads.
    """

    def __init__(self):
        self.retriever = Retriever()  # to use the filename function
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Accept': 'text/xml,application/xml,application/xhtml+xml,'
                      'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'fr-fr,en-us;q=0.7,en;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        }

    def download(self, url):
        """Downloads the web page indicated by url and saves it in a
        file whose path is derived from the url.
        """
        reqObj = urllib2.Request(url, None, self.headers)
        try:
            urlObj = urllib2.urlopen(reqObj)
            response = urlObj.readlines()
        except Exception:
            return
        # write the content of the response to the file
        f = open(self.retriever.filename(url), 'w')
        f.writelines(response)
        print url + " **** crawled"
        logging.info("* crawled %s \n" % url)
        f.close()
        return 1

    def CDownload(self, url):
        try:
            file_name = self.retriever.filename(url)
            # curl downloads the page and writes it into the file
            os.system("curl %s -o %s" % (url, file_name))
            print url + " **** crawled"
            logging.info("* crawled %s \n" % url)
            response = open(file_name, "r").read()
        except IOError:
            return 0
        return response
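
A minimal usage sketch; the URL is hypothetical. download() returns 1 on success (None on failure), while CDownload() returns the page text, or 0 on an IOError:

# hypothetical URL for illustration only
d = Downloader()
d.download('http://example.com/')          # fetch via urllib2
page = d.CDownload('http://example.com/')  # fetch via curl, returns text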