Example #1
import shelve
from hashlib import md5

# Document, Retriever, and the parser module are project-local
# dependencies assumed to be importable alongside this class.


class LinkAnalyzer(object):
    """Creates abstract documents and feeds their attributes."""

    def __init__(self):
        # create the database file if it does not exist yet
        shelve.open('database1', 'c').close()
        self.term_extractor = parser.ExtractTerms()
        self.retriever = Retriever()

    def analyze(self, url, links):
        """Creates a document for url and sets its outgoing links."""
        self.db = shelve.open('database1', 'w')
        key = md5(url).hexdigest()
        if key in self.db:
            # the document is already in the database: update it in place
            doc = self.db[key]
        else:
            # there is no document for this url yet: create one
            doc = Document(url)
        doc.insertOL(links)
        doc.url = url
        document = open(self.retriever.filename(url)).read()
        doc.all_terms = self.term_extractor.get_terms(document)
        unique_terms = self.term_extractor.get_unique_terms(document)
        doc.unique_terms_freq = self.term_extractor.count_term_frequencies(
            unique_terms, document)
        # reassign so the change is persisted; mutating a value fetched
        # from a shelve does not write it back automatically
        self.db[key] = doc
        # self.extractLinksfromResponse(url, links)
        self.db.close()

    def extractLinksfromResponse(self, url, links):
        """Analyses the incoming links from the response."""
        for link in links:
            key = md5(link).hexdigest()
            if key in self.db:
                doc = self.db[key]
                doc.insertIL(url)
                self.db[key] = doc
            else:
                newDoc = Document(link)
                newDoc.insertIL(url)
                self.db[key] = newDoc
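
A minimal usage sketch, assuming the project-local Document, Retriever, and parser modules are on the path; the URL and link values below are hypothetical:

# hypothetical values for illustration only
analyzer = LinkAnalyzer()
analyzer.analyze('http://example.com/',
                 ['http://example.com/a', 'http://example.com/b'])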
Example #3
import os
import logging
import urllib2

# Retriever is a project-local dependency assumed to be importable
# alongside this class.


class Downloader(object):
    """There are two downloaders:
    download() uses the urllib2 module.
    CDownload() uses curl to fetch the pages, which results in
    faster page downloads.
    """

    def __init__(self):
        self.retriever = Retriever()  # to use the filename function
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Accept': 'text/xml,application/xml,application/xhtml+xml,'
                      'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'fr-fr,en-us;q=0.7,en;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        }

    def download(self, url):
        """Downloads the web page indicated by url and saves it in a
        file whose path is derived from the url.
        """
        reqObj = urllib2.Request(url, None, self.headers)
        try:
            urlObj = urllib2.urlopen(reqObj)
            response = urlObj.readlines()
        except Exception:
            return
        # write the content of the response to the file
        f = open(self.retriever.filename(url), 'w')
        f.writelines(response)
        print url + " **** crawled"
        logging.info("* crawled %s \n" % url)
        f.close()
        return 1

    def CDownload(self, url):
        try:
            file_name = self.retriever.filename(url)
            # curl downloads the page and writes it into the file
            os.system("curl %s -o %s" % (url, file_name))
            print url + " **** crawled"
            logging.info("* crawled %s \n" % url)
            response = open(file_name, "r").read()
        except IOError:
            return 0
        return response
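
A minimal usage sketch; the URL is hypothetical. download() returns 1 on success (None on failure), while CDownload() returns the page text, or 0 on an IOError:

# hypothetical URL for illustration only
d = Downloader()
d.download('http://example.com/')          # fetch via urllib2
page = d.CDownload('http://example.com/')  # fetch via curl, returns text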