def storeNewLinkInMERGEandHTML(self, file_id, rssResource, page, title, firstPage_link, link, page_num, date):
    # Use an MD5 hash of the title as part of the HTML file name.
    try:
        file_name = hashlib.md5(title.encode('utf-8')).hexdigest()
    except UnicodeError:
        logging.warning("cannot hash the title to build a file name")
        return 'unicode error'
    # print "file name = "+file_name
    myFile = None
    myFile2 = None
    try:
        # Store the whole webpage in an HTML file.
        myFile = open(self.sub_ppath + str(file_id) + '.' + file_name + '.html', 'w')
        myFile.write(page)
        language = 'English'
        sourcename = self.RSSName
        if self.RSSName in self.config['multisource']:
            sourcename, language = SpecialSites.getNameAndLanguageFromResource(rssResource, sourcename, language)
        # Serialize the metadata for this link as JSON.
        data = {'source': rssResource,
                'language': language,
                'sourcename': sourcename,
                'firstPage_link': firstPage_link,
                'page': page_num,
                'title': title,
                'timestamp-sec': date}
        content = json.dumps(data)
        # Append one tab-separated record for the new link to the storage file (e.g. MERGE.TXT).
        new_line_webpage = (str(file_id) + '.' + file_name + '.html' + '\t' + 'html' + '\t' +
                            str(file_id) + '\t' + link + '\t' + link + '\t' +
                            content + '\t' + '0' + '\t' + '-1')
        myFile2 = open(self.sub_ppath + self.config['storagefile'], 'a')
        myFile2.write(new_line_webpage)
        myFile2.write('\n')
    except UnicodeEncodeError:
        logging.warning("there is a UnicodeEncodeError")
        return 'unicode error'
    finally:
        # Close only the files that were actually opened.
        if myFile is not None:
            myFile.close()
        if myFile2 is not None:
            myFile2.close()
    return None
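# A minimal reader sketch (not part of the original module): it shows how one
# record appended by storeNewLinkInMERGEandHTML could be parsed back. The helper
# name and its placement here are assumptions for illustration only; the field
# order simply mirrors new_line_webpage above, and json is the standard library
# module already used in this file.
def parseStorageLine(line):
    # fields: html file name, type ('html'), file_id, link, link, JSON metadata, '0', '-1'
    fields = line.rstrip('\n').split('\t')
    html_file, doc_type, file_id, link = fields[0], fields[1], fields[2], fields[3]
    metadata = json.loads(fields[5])
    return html_file, doc_type, file_id, link, metadata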
def fetchNews(self):
    for rssResource in self.sources:
        # print "rssResource = "+rssResource
        new_links = []
        fileName = self.replaceAll4FileName(rssResource)
        self.createAllFetchedLinks(fileName)
        d = self.fetchXML(rssResource)
        if d == "wrong url":
            continue
        # fetch all items from an RSS source
        for dd in d.entries:
            ex = NewsBlogExtractor()
            page_num = 1
            page, link, file_id = self.fetchWebpage(dd.link)
            firstPage_link = link
            while link is not None:
                time.sleep(0.1)
                # special processing for some RSS sources
                if self.RSSName in self.config['specialsites']:
                    # page, link = globals()["SpecialSites." + self.RSSName](page, link)
                    if self.RSSName == 'newyorktimes':
                        page, link = SpecialSites.newyorktimes(page, link)
                    elif self.RSSName == 'straitstimes':
                        page, link = SpecialSites.straitstimes(page, link)
                if (page is None) or (len(page) == 0):
                    break  # next item
                # resolve the real source name and language for multi-source feeds
                language = 'English'
                sourcename = self.RSSName
                if self.RSSName in self.config['multisource']:
                    sourcename, language = SpecialSites.getNameAndLanguageFromResource(rssResource, sourcename, language)
                if self.determineDuplication(fileName, link) == 'False':
                    # insert the new link into the database
                    self.updateNewLinks(fileName, link)
                    # store the new link in a special file, such as MERGE.TXT, and store its whole webpage in an HTML file
                    self.storeNewLinkInMERGEandHTML(file_id, rssResource, page, dd.title, firstPage_link, link, page_num,
                                                    str(time.mktime(dd.published_parsed) + self.config['timezonedifference'] * 3600))
                    # store all images in the webpage
                    if self.config['imagestorage'] == "True":
                        o = urlparse(link)
                        images = ex.findAllImages(page, o.netloc, sourcename)
                        # print images
                        self.fetchAllImages(images, link)
                    # calculate word frequency in the title
                    if self.config['wordsFrequency'] == "True":
                        self.calWordsFrequency(dd.title, str(dd.published_parsed[2]))
                    new_links.append(link)
                # process multiple pages
                link = ex.findNextPage(page, sourcename)
                if link is not None:
                    page_num += 1
                    page, link, file_id = self.fetchWebpage(link)
                else:
                    break
        if len(new_links) != 0:
            # update the words frequency record file
            if self.config['wordsFrequency'] == "True":
                self.updateWordsFrequency()
        time.sleep(1)
    return None