# Imports for the signal functions below. `util`, `Constants`, `Clarity`, and
# `Bm25` are project-local; their module paths here are assumed.
import math
from collections import Counter

import nltk
import numpy as np
from numpy import zeros
from gensim import corpora, models, similarities

import util
import Constants
from clarity import Clarity  # project-local; module path assumed
from bm25 import Bm25        # project-local; module path assumed


def NER():
    # Demo of NLTK named-entity chunking on a fixed sample sentence.
    nerText = ("Chelsea began their Premier League title defence with a "
               "thrilling 2-2 draw against Swansea, as Thibaut Courtois was "
               "sent off and will now miss the crunch clash with Manchester City")
    for sent in nltk.sent_tokenize(nerText):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Named-entity subtrees carry a label (PERSON, GPE, ...);
            # plain (word, tag) tuples do not.
            if hasattr(chunk, 'label'):
                print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))
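# NER() assumes the standard NLTK models are already installed. If a fresh
# environment raises LookupError, a one-time download along these lines is
# typically needed; `setupNltkData` is a hypothetical helper (not part of the
# original module), but the resource ids are NLTK's standard package names.
def setupNltkData():
    for resource in ("punkt", "averaged_perceptron_tagger",
                     "maxent_ne_chunker", "words"):
        nltk.download(resource)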
def ConnectionClarity():
    todayDate = util.getYesterdayDateFolder()
    lastClarityDate = util.loadSettings(Constants.LAST_CLARITY_DIR)
    lastSuggClarityDate = util.loadSettings(Constants.LAST_SUGG_CLARITY_DIR)
    if lastClarityDate:
        util.logger.info("Google Clarity done last for =" + lastClarityDate)
    else:
        util.logger.info("Google Clarity done last for none")
    if lastSuggClarityDate:
        util.logger.info("Sugg Clarity done last for =" + lastSuggClarityDate)
    else:
        util.logger.info("Sugg Clarity done last for none")
    if todayDate == lastClarityDate and todayDate == lastSuggClarityDate:
        util.logger.info("Clarity signal done for today =" + todayDate)
        return True

    trainFiles = util.findTrainingFiles()
    trainFiles = util.random_select(trainFiles)
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)

    normalClarity = True
    if todayDate != lastClarityDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        normalClarity = printNormalRankedDocs(clarityScore, usedTestFiles)
        if normalClarity:
            util.saveSettings(Constants.LAST_CLARITY_DIR, todayDate)
            util.logger.info("Google Clarity info just completed for =" + todayDate)

    suggClarity = True
    if todayDate != lastSuggClarityDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        clarityobj = Clarity(trainCorpus, testCorpus)
        clarityScore = clarityobj.ClarityScore()
        suggClarity = printSuggRankedDocs(clarityScore, usedTestFiles)
        if suggClarity:
            util.saveSettings(Constants.LAST_SUGG_CLARITY_DIR, todayDate)
            util.logger.info("Sugg Google Clarity info just completed for =" + todayDate)

    return normalClarity or suggClarity
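# `Clarity` is imported from elsewhere in the project and is not shown here.
# For reference, the classic clarity score (Cronen-Townsend et al.) is the KL
# divergence between a document's language model and the collection language
# model. `claritySketch` is a minimal, hypothetical sketch of that idea under
# the assumption that ClarityScore() yields one score per test document; it
# is an illustration, not the project's implementation.
def claritySketch(testCorpus, trainCorpus):
    collectionCounts = Counter(word for text in trainCorpus for word in text)
    collectionTotal = float(sum(collectionCounts.values())) or 1.0
    vocabSize = len(collectionCounts) or 1
    scores = []
    for text in testCorpus:
        docCounts = Counter(text)
        docTotal = float(len(text)) or 1.0
        score = 0.0
        for word, count in docCounts.items():
            pDoc = count / docTotal
            # Add-one smoothing keeps unseen words off a zero denominator.
            pColl = (collectionCounts.get(word, 0) + 1.0) / (collectionTotal + vocabSize)
            score += pDoc * math.log(pDoc / pColl)
        scores.append(score)
    return scores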
# NOTE: this second ConnectionClarity() redefines the two-corpus version
# above; being later in the module, it is the definition Python keeps.
def ConnectionClarity():
    todayDate = util.getTodayDateFolder()
    lastClarityDate = util.loadSettings(Constants.LAST_CLARITY_DIR)
    if todayDate == lastClarityDate:
        util.logger.info("Clarity signal done for today =" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)
    clarityobj = Clarity(trainCorpus, testCorpus)
    clarityScore = clarityobj.ClarityScore()
    ret = printRankedDocs(clarityScore, usedTestFiles)
    if ret:
        util.saveSettings(Constants.LAST_CLARITY_DIR, todayDate)
        util.logger.info("Clarity info just completed for =" + todayDate)
    return ret
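# printRankedDocs (and the printNormalRankedDocs / printSuggRankedDocs
# variants) are project helpers that are not shown here. Judging by the call
# sites they rank test files by score, record the ordering, and return True
# on success. `printRankedDocsSketch` is a hypothetical sketch of the
# single-argument {fileName: score} form used by Relevance() below.
def printRankedDocsSketch(testJson):
    ranked = sorted(testJson.items(), key=lambda item: item[1], reverse=True)
    for fileName, score in ranked:
        util.logger.info("%s scored %.4f" % (fileName, score))
    return True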
def Relevance():
    todayDate = util.getTodayDateFolder()
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    if todayDate == lastRelevanceDate:
        util.logger.info("Relevance signal already done for today :" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)

    # Drop hapax legomena: words appearing exactly once across the training
    # corpus carry little signal for similarity scoring.
    all_tokens = sum(trainCorpus, [])
    token_counts = Counter(all_tokens)
    tokens_once = set(word for word, count in token_counts.items() if count == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]

    # Build a tf-idf similarity index over the training corpus.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))

    # Score each test document by its summed similarity to all training docs.
    testJson = {}
    for fileName, text in zip(usedTestFiles, testCorpus):
        vec = dictionary.doc2bow(text)
        sims = index[tfidf[vec]]
        testJson[fileName] = sum(sims)

    ret = printRankedDocs(testJson)
    if ret:
        util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
        util.logger.info("Relevance info just completed for =" + todayDate)
    return ret
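# A self-contained toy run of the pipeline Relevance() builds, using gensim's
# public API on made-up documents; `relevanceDemo` is illustration only.
def relevanceDemo():
    trainTexts = [["premier", "league", "chelsea"],
                  ["league", "title", "race"],
                  ["transfer", "window", "news"]]
    dictionary = corpora.Dictionary(trainTexts)
    corpus = [dictionary.doc2bow(text) for text in trainTexts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))
    # Score one "test" document against all three training documents.
    vec = dictionary.doc2bow(["chelsea", "league"])
    sims = index[tfidf[vec]]
    print(list(enumerate(sims)))  # cosine similarity per training document
    print(sum(sims))              # the summed score Relevance() keeps per file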
# NOTE: this second Relevance() redefines the single-corpus version above;
# being later in the module, it is the definition Python keeps.
def Relevance():
    todayDate = util.getYesterdayDateFolder()
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    lastSuggRelevanceDate = util.loadSettings(Constants.LAST_SUGG_RELEVANCE_DIR)
    if lastRelevanceDate:
        util.logger.info("Google Relevance done last for =" + lastRelevanceDate)
    else:
        util.logger.info("Google Relevance done last for None")
    if lastSuggRelevanceDate:
        util.logger.info("Sugg Relevance done last for =" + lastSuggRelevanceDate)
    else:
        util.logger.info("Sugg Relevance done last for None")
    if todayDate == lastRelevanceDate and todayDate == lastSuggRelevanceDate:
        util.logger.info("Relevance signal already done for today :" + todayDate)
        return True

    trainFiles = util.findTrainingFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)

    # Drop hapax legomena, then build one tf-idf index over the training
    # corpus; both test sets below are scored against the same index.
    all_tokens = sum(trainCorpus, [])
    token_counts = Counter(all_tokens)
    tokens_once = set(word for word, count in token_counts.items() if count == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))

    def scoreTestFiles(testCorpus, usedTestFiles):
        # Summed similarity of each test document to all training documents.
        testJson = {}
        for fileName, text in zip(usedTestFiles, testCorpus):
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            testJson[fileName] = sum(sims)
        return testJson

    normalRelevance = True
    if todayDate != lastRelevanceDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testJson = scoreTestFiles(testCorpus, usedTestFiles)
        normalRelevance = printNormalRankedDocs(testJson)
        if normalRelevance:
            util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
            util.logger.info("Google Relevance info just completed for =" + todayDate)

    suggRelevance = True
    if todayDate != lastSuggRelevanceDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        testJson = scoreTestFiles(testCorpus, usedTestFiles)
        suggRelevance = printSuggRankedDocs(testJson)
        if suggRelevance:
            util.saveSettings(Constants.LAST_SUGG_RELEVANCE_DIR, todayDate)
            util.logger.info("Sugg Relevance info just completed for =" + todayDate)

    return normalRelevance or suggRelevance
def Smoothness():
    todayDate = util.getYesterdayDateFolder()
    lastSmoothnessDate = util.loadSettings(Constants.LAST_SMOOTHNESS_DIR)
    lastSuggSmoothnessDate = util.loadSettings(Constants.LAST_SUGG_SMOOTHNESS_DIR)
    if lastSmoothnessDate:
        util.logger.info("Google Smoothness done last for =" + lastSmoothnessDate)
    else:
        util.logger.info("Google Smoothness done last for none")
    if lastSuggSmoothnessDate:
        util.logger.info("Sugg Google Smoothness done last for =" + lastSuggSmoothnessDate)
    else:
        util.logger.info("Sugg Google Smoothness done last for none")
    if todayDate == lastSmoothnessDate and todayDate == lastSuggSmoothnessDate:
        util.logger.info("Smoothness signal done for today =" + todayDate)
        return True

    trainFiles = util.findTrainingFiles()
    trainFiles = util.random_select(trainFiles)
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    bm25obj = Bm25(trainCorpus)
    trainUniqueWords = [set(trainText) for trainText in trainCorpus]

    def computeSmoothness(testCorpus):
        # smoothness[i][j] pairs test doc i with train doc j: score the words
        # unique to each side under BM25, then take the dot product of the
        # two context vectors.
        testUniqueWords = [set(testText) for testText in testCorpus]
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testDoc in range(len(testCorpus)):
            uniqueTest = testUniqueWords[testDoc]
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [word for word in trainCorpus[trainDoc]
                           if word not in uniqueTest]
                DminusS = [word for word in testCorpus[testDoc]
                           if word not in uniqueTrain]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext,
                                                       DminusScontext)
        return smoothness

    normalSmoothness = True
    if todayDate != lastSmoothnessDate:
        testFiles = util.findTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        smoothness = computeSmoothness(testCorpus)
        normalSmoothness = printNormalRankedDocs(smoothness, usedTestFiles)
        if normalSmoothness:
            util.saveSettings(Constants.LAST_SMOOTHNESS_DIR, todayDate)
            util.logger.info("Google Smoothness info just completed for =" + todayDate)

    suggSmoothness = True
    if todayDate != lastSuggSmoothnessDate:
        testFiles = util.findSuggTestFiles()
        testCorpus, usedTestFiles = util.findCorpus(testFiles)
        smoothness = computeSmoothness(testCorpus)
        suggSmoothness = printSuggRankedDocs(smoothness, usedTestFiles)
        if suggSmoothness:
            util.saveSettings(Constants.LAST_SUGG_SMOOTHNESS_DIR, todayDate)
            util.logger.info("Sugg Smoothness info just completed for =" + todayDate)

    return normalSmoothness or suggSmoothness
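# `Bm25` is imported from elsewhere in the project. Smoothness() relies on
# just two parts of its interface: `bm25obj.N`, the number of indexed
# training documents, and `bm25obj.BM25Score(query)`, which returns one score
# per training document. `Bm25Sketch` is a hypothetical sketch of that
# interface using the standard Okapi BM25 formula (k1, b at common defaults);
# the project's real class may differ.
class Bm25Sketch(object):
    def __init__(self, corpus, k1=1.5, b=0.75):
        self.corpus = corpus
        self.N = len(corpus)
        self.k1 = k1
        self.b = b
        self.avgdl = sum(len(doc) for doc in corpus) / float(self.N or 1)
        # Document frequency of each term across the training corpus.
        self.df = Counter(word for doc in corpus for word in set(doc))

    def BM25Score(self, query):
        scores = zeros(self.N)
        for i, doc in enumerate(self.corpus):
            tf = Counter(doc)
            dl = len(doc)
            for word in query:
                if word not in tf:
                    continue
                idf = math.log(1.0 + (self.N - self.df[word] + 0.5) /
                               (self.df[word] + 0.5))
                num = tf[word] * (self.k1 + 1.0)
                den = tf[word] + self.k1 * (1.0 - self.b + self.b * dl / self.avgdl)
                scores[i] += idf * num / den
        return scores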