import logging
import os
import shelve
import urllib2
from hashlib import md5

# Document, Retriever and the parser module (ExtractTerms) are defined
# elsewhere in this project.


class LinkAnalyzer(object):
    """Creates abstract documents and populates their attributes."""

    def __init__(self):
        # make sure the shelve database exists before analyze() opens it for writing
        shelve.open('database1', 'c').close()
        self.term_extractor = parser.ExtractTerms()
        self.retriever = Retriever()

    def analyze(self, url, links):
        """Creates a document for url (if one does not exist yet) and sets
        its outgoing links and term statistics.
        """
        self.db = shelve.open('database1', 'w')
        key = md5(url).hexdigest()
        if key in self.db:
            # the document is already in the database: just update it
            doc = self.db[key]
        else:
            # there is no document for the url yet: create one
            doc = Document(url)
        doc.insertOL(links)
        doc.url = url
        document = open(self.retriever.filename(url)).read()
        doc.all_terms = self.term_extractor.get_terms(document)
        unique_terms = self.term_extractor.get_unique_terms(document)
        doc.unique_terms_freq = self.term_extractor.count_term_frequencies(
            unique_terms, document)
        # assign back to the shelf: a plain shelve does not persist
        # in-place mutations of stored objects
        self.db[key] = doc
        self.db.close()

    def extractLinksfromResponse(self, url, links):
        """Analyses the incoming links from the response.

        Expects self.db to be open, i.e. to be called from analyze()
        before the database is closed.
        """
        for link in links:
            key = md5(link).hexdigest()
            if key in self.db:
                doc = self.db[key]
            else:
                doc = Document(link)
            doc.insertIL(url)
            # assign back so the change is persisted
            self.db[key] = doc
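

# A minimal usage sketch, not part of the original project: the URL and the
# link list below are hypothetical, and the page for the URL is assumed to
# have been downloaded already so that Retriever.filename(url) names a file.
def _demo_analyze():
    analyzer = LinkAnalyzer()
    analyzer.analyze('http://example.com/',
                     ['http://example.com/a', 'http://example.com/b'])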


class Downloader(object):
    """There are two downloaders:
    download() uses the urllib2 module.
    CDownload() uses curl to download the pages, which results in
    faster page downloads.
    """

    def __init__(self):
        self.retriever = Retriever()  # to use the filename function
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Accept': 'text/xml,application/xml,application/xhtml+xml,'
                      'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'fr-fr,en-us;q=0.7,en;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
        }

    def download(self, url):
        """Downloads the web page indicated by url and saves it in a file
        whose absolute path is derived from the url.
        """
        reqObj = urllib2.Request(url, None, self.headers)
        try:
            urlObj = urllib2.urlopen(reqObj)
            response = urlObj.readlines()
        except Exception:
            return
        # write the content of the response to the file
        outfile = open(self.retriever.filename(url), 'w')
        for line in response:
            outfile.write(line)
        print url + " **** crawled"
        logging.info("* crawled %s \n" % url)
        outfile.close()
        return 1

    def CDownload(self, url):
        try:
            file_name = self.retriever.filename(url)
            # curl downloads the page and writes it into file_name
            os.system("curl %s -o %s" % (url, file_name))
            print url + " **** crawled"
            logging.info("* crawled %s \n" % url)
            response = open(file_name, "r").read()
        except IOError:
            return 0
        return response
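

# A minimal usage sketch, not part of the original project: fetches one
# hypothetical URL with each downloader and reports which attempt succeeded.
def _demo_download():
    downloader = Downloader()
    if downloader.download('http://example.com/'):
        print 'urllib2 download succeeded'
    if downloader.CDownload('http://example.com/'):
        print 'curl download succeeded'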