def runReportSimilarity(fileName, threshold=0.9, reportType="lsi"):
    """Compare two reports sentence by sentence and print a verdict per sentence.

    The first two lines of ``fileName`` are read as report 1 and report 2.
    Report 1 is cut so that it starts right after the first of the markers
    "FINDINGS:" / "REPORT:" (tried in that order) that it contains; report 2
    is used whole. Both are split with ``rnn.splitIntoSentences`` and every
    sentence of report 2 is scored against every sentence of report 1.

    One tab-separated line is printed per verdict, starting with a tag:
      n - report 2 sentence with at least one similarity above ``threshold``
      e - report 2 sentence with no similarity above ``threshold``
      m - report 1 sentence whose best similarity is not above ``threshold``

    Args:
        fileName: path of a text file whose first two lines are the reports.
        threshold: similarity a sentence pair must exceed to count as a match.
        reportType: "lsi" scores with the gensim TF-IDF/LSI models stored
            under ./model_files; "rnn" delegates the scoring to
            ``rnn.compareReportSentences``. Any other value performs no
            scoring, so every report 1 sentence is reported as missing.

    Returns:
        dict with the counters "missing", "corrections", "extras" and
        "correct". "corrections" is never incremented here and stays 0.

    Raises:
        IndexError: if the file has fewer than two lines.
    """
    # Read inside a context manager so the file handle is always released.
    with open(fileName) as reportFile:
        fileText = [row.rstrip("\n") for row in reportFile]
    report1 = fileText[0]
    report2 = fileText[1]

    # Drop everything up to and including the first header marker present.
    # find() returns -1 when a marker is absent; slicing from that value
    # would keep only the last character, so without any marker the report
    # is left untouched.
    for word in ("FINDINGS:", "REPORT:"):
        markerPos = report1.find(word)
        if markerPos != -1:
            report1 = report1[markerPos + len(word):]
            break

    # Raw sentences are kept for printing; scoring works on derived forms.
    sent1 = rnn.splitIntoSentences(report1)
    sent2 = rnn.splitIntoSentences(report2)

    # sCom holds one row per report 2 sentence and, inside a row, one
    # similarity per report 1 sentence.
    sCom = []
    if reportType == "lsi":
        tokens1 = [
            search_engine.getDerivations(search_engine.textPreprocess(sentence))
            for sentence in sent1
        ]
        tokens2 = [
            search_engine.getDerivations(search_engine.textPreprocess(sentence))
            for sentence in sent2
        ]

        tfidf_model = gensim.models.TfidfModel.load("./model_files/reports.tfidf_model")
        lsi_model = gensim.models.LsiModel.load("./model_files/reports.lsi_model")
        dictionary = gensim.corpora.Dictionary.load("./model_files/reports.dict")

        vec_lsis1 = lsi_model[tfidf_model[[dictionary.doc2bow(t) for t in tokens1]]]
        vec_lsis2 = lsi_model[tfidf_model[[dictionary.doc2bow(t) for t in tokens2]]]

        # The index width must cover every LSI topic id, so it is taken from
        # the loaded model instead of being fixed at 10.
        ind = gensim.similarities.MatrixSimilarity(
            vec_lsis1, num_features=lsi_model.num_topics
        )
        sCom = [ind[vec] for vec in vec_lsis2]
    elif reportType == "rnn":
        # Transposed so that rows match the layout described above
        # (presumably the rnn result is indexed by report 1 sentence first).
        # zip(*...) also copes with an empty result.
        sCom = [list(row) for row in zip(*rnn.compareReportSentences(report1, report2))]

    # missing[j] tracks the best similarity seen so far for report 1 sentence j.
    missing = [0] * len(sent1)
    output = {"missing": 0, "corrections": 0, "extras": 0, "correct": 0}

    for i, col in enumerate(sCom):
        aboveTopThreshold = False
        for j, sim in enumerate(col):
            if sim > threshold:
                aboveTopThreshold = True
            if missing[j] < sim:
                missing[j] = sim

        if aboveTopThreshold:
            s = "n\t" + sent2[i] + "\t"
            output["correct"] += 1
        else:
            s = "e\t" + sent2[i] + "\t"
            output["extras"] += 1
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(s)

    for i, best in enumerate(missing):
        if best <= threshold:
            s = "m\t" + sent1[i] + "\t"
            output["missing"] += 1
            print(s)

    return output
def runReportSimilarity2(fileName, threshold=0.9):
    """Compare two reports sentence by sentence using the stored LSI model.

    The first two lines of ``fileName`` are read as report 1 and report 2.
    Report 1 is cut so that it starts right after the first of the markers
    "FINDINGS:" / "REPORT:" (tried in that order) that it contains; report 2
    is used whole. Both are split on "." and every report 2 sentence is
    scored against every report 1 sentence with the gensim TF-IDF/LSI models
    stored under ./model_files.

    One tab-separated line is printed per verdict, starting with a tag:
      n - report 2 sentence with at least one similarity above ``threshold``
      c - report 2 sentence with no such match but with a similarity above
          ``0.9 * threshold``; the matching report 1 sentence follows as the
          suggested correction
      w - report 2 sentence with neither kind of match
      m - report 1 sentence whose best similarity is not above ``threshold``

    Args:
        fileName: path of a text file whose first two lines are the reports.
        threshold: similarity a sentence pair must exceed to count as a match.

    Returns:
        dict with the counters "missing", "corrections", "wrong" and
        "correct". A report 1 sentence that was offered as a correction is
        still printed with the "m" tag but is not counted in "missing".

    Raises:
        IndexError: if the file has fewer than two lines.
    """
    # Read inside a context manager so the file handle is always released.
    with open(fileName) as reportFile:
        fileText = [row.rstrip("\n") for row in reportFile]
    report1 = fileText[0]
    report2 = fileText[1]

    # Drop everything up to and including the first header marker present.
    # find() returns -1 when a marker is absent; slicing from that value
    # would keep only the last character, so without any marker the report
    # is left untouched.
    for word in ("FINDINGS:", "REPORT:"):
        markerPos = report1.find(word)
        if markerPos != -1:
            report1 = report1[markerPos + len(word):]
            break

    # Raw sentences are kept for printing; scoring works on derived forms.
    # NOTE(review): split(".") yields an empty fragment after a final period
    # and that fragment is scored like any other sentence - confirm whether
    # blank fragments should be filtered out.
    sent1 = report1.split(".")
    sent2 = report2.split(".")

    tokens1 = [
        search_engine.getDerivations(search_engine.textPreprocess(sentence))
        for sentence in sent1
    ]
    tokens2 = [
        search_engine.getDerivations(search_engine.textPreprocess(sentence))
        for sentence in sent2
    ]

    tfidf_model = gensim.models.TfidfModel.load("./model_files/reports.tfidf_model")
    lsi_model = gensim.models.LsiModel.load("./model_files/reports.lsi_model")
    dictionary = gensim.corpora.Dictionary.load("./model_files/reports.dict")

    vec_lsis1 = lsi_model[tfidf_model[[dictionary.doc2bow(t) for t in tokens1]]]
    vec_lsis2 = lsi_model[tfidf_model[[dictionary.doc2bow(t) for t in tokens2]]]

    # The index width must cover every LSI topic id, so it is taken from the
    # loaded model instead of being fixed at 10.
    ind = gensim.similarities.MatrixSimilarity(
        vec_lsis1, num_features=lsi_model.num_topics
    )
    # One row per report 2 sentence; one similarity per report 1 sentence.
    sCom = [ind[vec] for vec in vec_lsis2]

    # missing[j] tracks the best similarity seen so far for report 1 sentence j.
    missing = [0] * len(sent1)
    # Report 1 sentences that were offered as a correction.
    correctedIdx = set()
    output = {"missing": 0, "corrections": 0, "wrong": 0, "correct": 0}

    for i, col in enumerate(sCom):
        # Both verdict inputs are reset for every report 2 sentence so that a
        # near match found for an earlier sentence cannot leak into this one.
        aboveTopThreshold = False
        correctionIdx = None
        for j, sim in enumerate(col):
            if sim > threshold:
                aboveTopThreshold = True
            elif sim > threshold * 0.9:
                # The last near match in the row wins.
                correctionIdx = j
            if missing[j] < sim:
                missing[j] = sim

        if aboveTopThreshold:
            s = "n\t" + sent2[i] + "\t"
            output["correct"] += 1
        elif correctionIdx is not None:
            # A wrong sentence that is close in meaning to a report 1
            # sentence: suggest that sentence as the correction.
            s = "c\t" + sent2[i] + "\t" + sent1[correctionIdx]
            output["corrections"] += 1
            correctedIdx.add(correctionIdx)
        else:
            s = "w\t" + sent2[i] + "\t"
            output["wrong"] += 1
        # Every verdict is printed exactly once. Single-argument print(...)
        # behaves the same on Python 2 and 3.
        print(s)

    for i, best in enumerate(missing):
        if best <= threshold:
            s = "m\t" + sent1[i] + "\t"
            print(s)
            # A sentence offered as a correction is not considered missing.
            # Excluding exactly those sentences keeps the counter from being
            # reduced twice for one sentence or dropping below zero.
            if i not in correctedIdx:
                output["missing"] += 1

    return output