# Shared imports for the functions below (Python 2 codebase: urllib2 and
# print statements). Clarity, Bm25, util, and Constants are project-local.
import json
import os
import re
import traceback
import urllib2
from collections import Counter

import numpy as np
from bs4 import BeautifulSoup
from gensim import corpora, models, similarities
from numpy import zeros


def RecommendationMetric():
    todayDateFolder = util.getTodayDateFolder()
    lastRecommended = util.loadSettings(Constants.LAST_RECOMMENDATION_DONE)
    if todayDateFolder == lastRecommended:
        return True
    try:
        # Get Relevance json
        relevance_json = {}
        readRelevanceDir = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                        Constants.ENGINE_DIR, todayDateFolder,
                                        Constants.RELEVANCE_FILE)
        if os.path.isfile(readRelevanceDir):
            with open(readRelevanceDir) as json_data:
                relevance_json = json.load(json_data)
        # Get Smoothness json
        smoothness_json = {}
        readSmoothnessDir = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                         Constants.ENGINE_DIR, todayDateFolder,
                                         Constants.SMOOTHNESS_FILE)
        if os.path.isfile(readSmoothnessDir):
            with open(readSmoothnessDir) as json_data:
                smoothness_json = json.load(json_data)
        # Get Clarity json
        clarity_json = {}
        readClarityDir = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                      Constants.ENGINE_DIR, todayDateFolder,
                                      Constants.CLARITY_FILE)
        if os.path.isfile(readClarityDir):
            with open(readClarityDir) as json_data:
                clarity_json = json.load(json_data)
        # No linear weighting formula for now: Counter.update() simply adds
        # the scores of documents that appear in more than one signal.
        cou = Counter()
        cou.update(relevance_json)
        cou.update(smoothness_json)
        cou.update(clarity_json)
        # Convert back to a plain dictionary
        final_json = dict(cou)
        result = printRecommendedDocs(final_json, todayDateFolder)
        if result == True:
            util.saveSettings(Constants.LAST_RECOMMENDATION_DONE, todayDateFolder)
            util.logger.info("Recommended links done for =" + todayDateFolder)
    except Exception:
        util.logger.error("Exception at recommending links for : %s Exception = %s"
                          % (todayDateFolder, traceback.format_exc()))
# Variant of RecommendationMetric that reads the per-signal scores from the
# GOOGLENEWS subdirectory of the engine folder.
def RecommendationMetric():
    todayDateFolder = util.getTodayDateFolder()
    lastRecommended = util.loadSettings(Constants.LAST_RECOMMENDATION_DONE)
    if todayDateFolder == lastRecommended:
        return True
    try:
        # Get Relevance json
        relevance_json = {}
        readRelevanceDir = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                        Constants.ENGINE_DIR, todayDateFolder,
                                        Constants.GOOGLENEWS, Constants.RELEVANCE_FILE)
        if os.path.isfile(readRelevanceDir):
            with open(readRelevanceDir) as json_data:
                relevance_json = json.load(json_data)
        # Get Smoothness json
        smoothness_json = {}
        readSmoothnessDir = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                         Constants.ENGINE_DIR, todayDateFolder,
                                         Constants.GOOGLENEWS, Constants.SMOOTHNESS_FILE)
        if os.path.isfile(readSmoothnessDir):
            with open(readSmoothnessDir) as json_data:
                smoothness_json = json.load(json_data)
        # Get Clarity json
        clarity_json = {}
        readClarityDir = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                      Constants.ENGINE_DIR, todayDateFolder,
                                      Constants.GOOGLENEWS, Constants.CLARITY_FILE)
        if os.path.isfile(readClarityDir):
            with open(readClarityDir) as json_data:
                clarity_json = json.load(json_data)
        # No linear weighting formula for now: Counter.update() simply adds
        # the scores of documents that appear in more than one signal.
        cou = Counter()
        cou.update(relevance_json)
        cou.update(smoothness_json)
        cou.update(clarity_json)
        # Convert back to a plain dictionary
        final_json = dict(cou)
        result = printRecommendedDocs(final_json, todayDateFolder)
        if result == True:
            util.saveSettings(Constants.LAST_RECOMMENDATION_DONE, todayDateFolder)
            util.logger.info("Recommended Google links done for =" + todayDateFolder)
    except Exception:
        util.logger.error("Exception at recommending google links for : %s Exception = %s"
                          % (todayDateFolder, traceback.format_exc()))
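# Counter.update() with a mapping *adds* values for shared keys (unlike
# dict.update(), which overwrites), so documents scored by several signals
# accumulate. A toy illustration of the merge above (scores are invented):
from collections import Counter

relevance = {'doc_a': 0.50, 'doc_b': 0.25}
smoothness = {'doc_a': 0.25, 'doc_c': 0.75}
clarity = {'doc_b': 0.25}

cou = Counter()
cou.update(relevance)
cou.update(smoothness)
cou.update(clarity)
print(dict(cou))  # doc_a: 0.75, doc_b: 0.5, doc_c: 0.75 (key order may vary)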
def downloadGoogleNews(downloadDate):
    result = False
    read_directory = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                  Constants.GOOGLE_LINKS_DIR, downloadDate,
                                  Constants.GOOGLE_LINKS_FILE)
    write_directory = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                                   Constants.GOOGLE_NEWS_DIR, downloadDate)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)
    writeJson = {}
    try:
        with open(read_directory) as f:
            hyperlinks = [line.strip() for line in f]
        count = 0
        for link in hyperlinks:
            # Build a short, filesystem-safe file name from the URL.
            url = link.replace("http://", "")
            url = url.replace("www.", "")
            parsedUrl = re.sub(r'\W+', '', url)
            if len(parsedUrl) > 25:
                parsedUrl = parsedUrl[:25]
            try:
                html_filename = os.path.join(write_directory, parsedUrl)
                if not os.path.isfile(html_filename):
                    htmlfile = urllib2.urlopen(link)
                    html = htmlfile.read()
                    ret = util.writeToFile(html, html_filename)
                    if ret == True:
                        linkDict = {}
                        linkDict["url"] = link
                        linkDict["content"] = ""
                        soup = BeautifulSoup(html, 'html.parser')
                        # Guard the contents list itself: it can be empty,
                        # in which case contents[0] would raise IndexError.
                        if soup.title and soup.title.contents:
                            title = soup.title.contents[0]
                        else:
                            title = ""
                        linkDict["title"] = title
                        writeJson[parsedUrl] = linkDict
                        count = count + 1
                        print 'downloaded link =' + url
            except Exception:
                util.logger.error("Exception at downloading link : %s" % url)
        if count > Constants.MIN_GOOGLELINKS_DAILY:
            result = writeUrlJson(writeJson, downloadDate)
            if result == True:
                util.saveSettings(Constants.LAST_GOOGLENEWS_DOWNLOAD, downloadDate)
                util.logger.info("Google news downloaded for =" + downloadDate +
                                 " links=" + str(count))
            else:
                util.logger.error("Google news failed to download for =" + downloadDate +
                                  " links=" + str(count))
    except Exception:
        print "Exception at open Google news links for download: %s" % read_directory
    return result
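# The per-link file name is just the URL with the scheme and "www." stripped,
# all non-word characters removed, and the result truncated to 25 characters.
# A quick illustration (the URL is invented):
import re

link = 'http://www.example.com/news/2016/story-one.html'
url = link.replace('http://', '').replace('www.', '')
print(re.sub(r'\W+', '', url)[:25])  # examplecomnews2016storyon
# Note: distinct URLs can collide after truncation; later links that map to an
# existing file are skipped by the os.path.isfile() check above.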
# Variant that scores both the normal and the suggested-links test sets
# against yesterday's training corpus.
def ConnectionClarity():
    todayDate = util.getYesterdayDateFolder()
    lastClarityDate = util.loadSettings(Constants.LAST_CLARITY_DIR)
    lastSuggClarityDate = util.loadSettings(Constants.LAST_SUGG_CLARITY_DIR)
    if lastClarityDate:
        util.logger.info("Google Clarity done last for =" + lastClarityDate)
    else:
        util.logger.info("Google Clarity done last for none")
    if lastSuggClarityDate:
        util.logger.info("Sugg Clarity done last for =" + lastSuggClarityDate)
    else:
        util.logger.info("Sugg Clarity done last for none")
    if todayDate == lastClarityDate and todayDate == lastSuggClarityDate:
        util.logger.info("Clarity signal done for today =" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    trainFiles = util.random_select(trainFiles)
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    normalClarity = True
    if todayDate != lastClarityDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        normalClarity = printNormalRankedDocs(clarityScore, usedTestFiles)
        if normalClarity == True:
            util.saveSettings(Constants.LAST_CLARITY_DIR, todayDate)
            util.logger.info("Google Clarity info just completed for =" + todayDate)
    suggClarity = True
    if todayDate != lastSuggClarityDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        suggClarity = printSuggRankedDocs(clarityScore, usedTestFiles)
        if suggClarity == True:
            util.saveSettings(Constants.LAST_SUGG_CLARITY_DIR, todayDate)
            util.logger.info("Sugg Google Clarity info just completed for =" + todayDate)
    return normalClarity or suggClarity
# Simpler variant: scores only today's test set.
def ConnectionClarity():
    todayDate = util.getTodayDateFolder()
    lastClarityDate = util.loadSettings(Constants.LAST_CLARITY_DIR)
    if todayDate == lastClarityDate:
        util.logger.info("Clarity signal done for today =" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)
    clarityobj = Clarity(trainCorpus, testCorpus)
    clarityScore = clarityobj.ClarityScore()
    ret = printRankedDocs(clarityScore, usedTestFiles)
    if ret == True:
        util.saveSettings(Constants.LAST_CLARITY_DIR, todayDate)
        util.logger.info("Clarity info just completed for =" + todayDate)
    return ret
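# The Clarity class is defined elsewhere in the repo and isn't shown here. In
# the IR literature a "clarity score" is commonly the KL divergence between a
# document's language model and the collection's; a minimal sketch under that
# assumption (the real ClarityScore may differ):
import math
from collections import Counter

def clarity_score(test_doc, collection_docs, eps=1e-9):
    doc_counts = Counter(test_doc)
    doc_total = float(sum(doc_counts.values()))
    coll_counts = Counter(w for doc in collection_docs for w in doc)
    coll_total = float(sum(coll_counts.values()))
    score = 0.0
    for word, cnt in doc_counts.items():
        p_doc = cnt / doc_total
        p_coll = coll_counts[word] / coll_total + eps  # eps avoids log(0)
        score += p_doc * math.log(p_doc / p_coll, 2)
    return score  # higher = the document is more focused w.r.t. the collection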
def Relevance():
    todayDate = util.getTodayDateFolder()
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    if todayDate == lastRelevanceDate:
        util.logger.info("Relevance signal already done for today :" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)
    # Drop hapax legomena (words occurring only once in the training corpus).
    all_tokens = sum(trainCorpus, [])
    token_counts = Counter(all_tokens)
    tokens_once = set(word for word, cnt in token_counts.items() if cnt == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    count = 0
    testJson = {}
    for text in testCorpus:
        vec = dictionary.doc2bow(text)
        sims = index[tfidf[vec]]
        # Relevance of a test doc = sum of its similarities to all training docs.
        score = sum(sims)
        testJson[usedTestFiles[count]] = score
        count = count + 1
    ret = printRankedDocs(testJson)
    if ret == True:
        util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
        util.logger.info("Relevance info just completed for =" + todayDate)
    return ret
# Variant that scores both the normal and the suggested-links test sets
# against yesterday's training corpus.
def Relevance():
    todayDate = util.getYesterdayDateFolder()
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    lastSuggRelevanceDate = util.loadSettings(Constants.LAST_SUGG_RELEVANCE_DIR)
    if lastRelevanceDate:
        util.logger.info("Google Relevance done last for =" + lastRelevanceDate)
    else:
        util.logger.info("Google Relevance done last for None")
    if lastSuggRelevanceDate:
        util.logger.info("Sugg Relevance done last for =" + lastSuggRelevanceDate)
    else:
        util.logger.info("Sugg Relevance done last for None")
    if todayDate == lastRelevanceDate and todayDate == lastSuggRelevanceDate:
        util.logger.info("Relevance signal already done for today :" + todayDate)
        return True
    # Build the TF-IDF similarity index once from the training corpus;
    # both test sets are then scored against it.
    trainFiles = util.findTrainingFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    all_tokens = sum(trainCorpus, [])
    token_counts = Counter(all_tokens)
    tokens_once = set(word for word, cnt in token_counts.items() if cnt == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    normalRelevance = True
    if todayDate != lastRelevanceDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        count = 0
        testJson = {}
        for text in testCorpus:
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            score = sum(sims)
            testJson[usedTestFiles[count]] = score
            count = count + 1
        normalRelevance = printNormalRankedDocs(testJson)
        if normalRelevance == True:
            util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
            util.logger.info("Google Relevance info just completed for =" + todayDate)
    suggRelevance = True
    if todayDate != lastSuggRelevanceDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        count = 0
        testJson = {}
        for text in testCorpus:
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            score = sum(sims)
            testJson[usedTestFiles[count]] = score
            count = count + 1
        suggRelevance = printSuggRankedDocs(testJson)
        if suggRelevance == True:
            util.saveSettings(Constants.LAST_SUGG_RELEVANCE_DIR, todayDate)
            util.logger.info("Sugg Relevance info just completed for =" + todayDate)
    return normalRelevance or suggRelevance
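# For reference, a minimal self-contained run of the same gensim pipeline used
# by Relevance(): TF-IDF over a toy training corpus, then a sparse similarity
# index queried with one test document. The corpus contents are invented.
from gensim import corpora, models, similarities

toy_train = [['stock', 'market', 'rally'],
             ['market', 'crash', 'fear'],
             ['football', 'cup', 'final']]
toy_test = ['stock', 'market', 'fear']

toy_dict = corpora.Dictionary(toy_train)
toy_corpus = [toy_dict.doc2bow(text) for text in toy_train]
toy_tfidf = models.TfidfModel(corpus=toy_corpus, id2word=toy_dict, normalize=True)
toy_index = similarities.SparseMatrixSimilarity(toy_tfidf[toy_corpus],
                                                num_features=len(toy_dict))

sims = toy_index[toy_tfidf[toy_dict.doc2bow(toy_test)]]  # one cosine sim per train doc
print(sum(sims))  # the scalar "relevance" score used above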
def Smoothness():
    todayDate = util.getYesterdayDateFolder()
    lastSmoothnessDate = util.loadSettings(Constants.LAST_SMOOTHNESS_DIR)
    lastSuggSmoothnessDate = util.loadSettings(Constants.LAST_SUGG_SMOOTHNESS_DIR)
    if lastSmoothnessDate:
        util.logger.info("Google Smoothness done last for =" + lastSmoothnessDate)
    else:
        util.logger.info("Google Smoothness done last for none")
    if lastSuggSmoothnessDate:
        util.logger.info("Sugg Google Smoothness done last for =" + lastSuggSmoothnessDate)
    else:
        util.logger.info("Sugg Google Smoothness done last for none")
    if todayDate == lastSmoothnessDate and todayDate == lastSuggSmoothnessDate:
        util.logger.info("Smoothness signal done for today =" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    trainFiles = util.random_select(trainFiles)
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    bm25obj = Bm25(trainCorpus)
    trainUniqueWords = [set(trainText) for trainText in trainCorpus]
    normalSmoothness = True
    if todayDate != lastSmoothnessDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testUniqueWords = [set(testText) for testText in testCorpus]
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testDoc in range(len(testCorpus)):
            uniqueTest = testUniqueWords[testDoc]
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                # S\D: training words absent from the test doc;
                # D\S: test words absent from the training doc.
                SminusD = [word for word in trainCorpus[trainDoc] if word not in uniqueTest]
                DminusS = [word for word in testCorpus[testDoc] if word not in uniqueTrain]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext, DminusScontext)
        normalSmoothness = printNormalRankedDocs(smoothness, usedTestFiles)
        if normalSmoothness == True:
            util.saveSettings(Constants.LAST_SMOOTHNESS_DIR, todayDate)
            util.logger.info("Google Smoothness info just completed for =" + todayDate)
    suggSmoothness = True
    if todayDate != lastSuggSmoothnessDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testUniqueWords = [set(testText) for testText in testCorpus]
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testDoc in range(len(testCorpus)):
            uniqueTest = testUniqueWords[testDoc]
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [word for word in trainCorpus[trainDoc] if word not in uniqueTest]
                DminusS = [word for word in testCorpus[testDoc] if word not in uniqueTrain]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext, DminusScontext)
        suggSmoothness = printSuggRankedDocs(smoothness, usedTestFiles)
        if suggSmoothness == True:
            util.saveSettings(Constants.LAST_SUGG_SMOOTHNESS_DIR, todayDate)
            util.logger.info("Sugg Smoothness info just completed for =" + todayDate)
    return normalSmoothness or suggSmoothness
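# The repo's Bm25 class isn't shown in this section. Assuming BM25Score(words)
# returns one score per training document (a vector of length N), the
# smoothness of a test doc D against a training doc S is the dot product of
# the BM25 "context" vectors of S\D and D\S. A sketch using the rank-bm25
# package as a stand-in for the custom class (corpus contents invented):
import numpy as np
from rank_bm25 import BM25Okapi

toy_train = [['stock', 'market', 'rally'], ['market', 'crash'], ['football', 'final']]
test_doc = ['stock', 'crash', 'panic']

bm25 = BM25Okapi(toy_train)
S = toy_train[0]
SminusD = [w for w in S if w not in set(test_doc)]   # S \ D
DminusS = [w for w in test_doc if w not in set(S)]   # D \ S
# get_scores() returns one BM25 score per document in toy_train
print(np.dot(bm25.get_scores(SminusD), bm25.get_scores(DminusS)))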
                else:
                    result = True
                if result == True:
                    count = count + 1
                    util.logger.info('Boilered done for sugg_news =' + html_filename + str(count))
            except Exception:
                util.logger.error("Exception at boiler for google news : %s" % read_directory)
        else:
            pass
        finalJson['suggestGoogle'][Constants.GOOGLE].append(linkObj)
    result = writeBoilerJson(finalJson, downloadDate)
    if result == True:
        util.saveSettings(Constants.LAST_BOILER_SUGGGOOGLENEWS, downloadDate)
        util.logger.info("Sugg Google news boilered for =" + downloadDate +
                         " links=" + str(count) + " total =" + str(len(googleLinks)))
    else:
        util.logger.error("Sugg Google news failed to boiler for =" + downloadDate +
                          " links=" + str(count))
    return result


def BoilerNews(downloadDate):
    jsonData = readBoilerJson(downloadDate)
    if jsonData is None:
        return False
    result = False
    read_directory = os.path.join(Constants.ROOT_FOLDER,
            if htmlFile in jsonData:
                jsonData[htmlFile]["content"] = htmlText
            else:
                result = True
            if result == True:
                count = count + 1
            else:
                if htmlFile in jsonData:
                    del jsonData[htmlFile]
            print 'Boilered done for =' + html_filename + str(count)
        except Exception:
            util.logger.error("Exception at boiler for google news : %s" % read_directory)
    # percentage of files successfully boilered (integer division in Python 2)
    if ((count * 100) / len(onlyfiles)) > Constants.MIN_GOOGLELINKS_DAILY:
        result = writeBoilerJson(jsonData, downloadDate)
        if result == True:
            util.saveSettings(Constants.LAST_BOILER_GOOGLENEWS, downloadDate)
            util.logger.info("Google news boilered for =" + downloadDate + " links=" + str(count))
        else:
            util.logger.error("Google news failed to boiler for =" + downloadDate + " links=" + str(count))
    return result


def BoilerData(downloadDate):
    ret = False
    read_directory = os.path.join(Constants.ROOT_FOLDER, Constants.DATA_DIR, downloadDate)
    write_directory = os.path.join(Constants.ROOT_FOLDER, Constants.BOILER_DATA_DIR, downloadDate)
    if not os.path.exists(read_directory):
        util.logger.error("Boiler data can't be run because folder isn't present = " + downloadDate)
        return ret
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)
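# The extraction step that produces htmlText lies outside this fragment. A
# common choice for this kind of pipeline is boilerpipe's ArticleExtractor;
# this is a sketch of that assumption, not the repo's confirmed dependency.
from boilerpipe.extract import Extractor

def extract_main_text(html):
    # Strips navigation, ads, and other boilerplate, keeping the article body.
    extractor = Extractor(extractor='ArticleExtractor', html=html)
    return extractor.getText()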
# Get all the main page links.
# Initialize for it
browser.get('http://www.news.google.com')
links = browser.find_elements_by_xpath('//a')
getLinksPerCategory(links, 'HomePage')
if len(downloadedLinks) > Constants.MIN_GOOGLELINKS_DAILY:
    linksToBeWritten = "\n".join(downloadedLinks)
    directory = os.path.join(Constants.ROOT_FOLDER, Constants.RECOMMENDATION_DIR,
                             Constants.GOOGLE_LINKS_DIR, todayDate)
    if not os.path.exists(directory):
        os.makedirs(directory)
    result = util.writeToFile(linksToBeWritten,
                              os.path.join(directory, Constants.GOOGLE_LINKS_FILE))
    if result == True:
        util.saveSettings(Constants.LAST_GOOGLELINKS_DOWNLOAD, todayDate)
        util.logger.info("Google links downloaded for =" + todayDate)
        return result
util.logger.error("Google links not downloaded for =" + todayDate)
return result


def GoogleNews():
    downloadedLinks = []
    todayDate = util.getTodayDateFolder()
    lastNewsDownloaded = util.loadSettings(Constants.LAST_GOOGLENEWS_DOWNLOAD)
    lastLinksDownloaded = util.loadSettings(Constants.LAST_GOOGLELINKS_DOWNLOAD)
    googleLinksStatus = True
    googleNewsStatus = True
    # Check whether today's links have been extracted or not
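# Hypothetical sketch of getLinksPerCategory(), which is called above but not
# defined in this section: harvest unique absolute hrefs from the anchor
# elements into the downloadedLinks list used by the scraping code.
from selenium.common.exceptions import StaleElementReferenceException

def getLinksPerCategory(links, category):
    for anchor in links:
        try:
            href = anchor.get_attribute('href')
        except StaleElementReferenceException:
            # the page can re-render under Selenium, invalidating elements
            continue
        if href and href.startswith('http') and href not in downloadedLinks:
            downloadedLinks.append(href)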