import csv
import os
import sys


def run_vsm(model_path, results_path):
    csv.field_size_limit(sys.maxsize)
    with open(os.getcwd() + "/CORD-19/inverse_list.csv", encoding='utf8') as inv_list_file, \
            open(os.getcwd() + results_path, mode='w', encoding='utf8', newline='') as res_vsm, \
            open(os.getcwd() + '/CORD-19/preprocessed_queries.csv', encoding='utf8') as query_file:
        inv_list_reader = csv.DictReader(inv_list_file)
        query_reader = csv.DictReader(query_file)

        # Compute the average document length over the whole collection.
        with open(os.getcwd() + model_path, encoding='utf8') as bow_file:
            bow_reader = csv.DictReader(bow_file)
            docs = [bow['abs_model'] for bow in bow_reader]
            avdoclength = avdl(docs)
        print(avdoclength)

        for query_line in query_reader:
            scores = {}
            rank = 1
            # Re-open the model file so the reader starts from the top for each query.
            with open(os.getcwd() + model_path, encoding='utf8') as bow_file:
                bow_reader = csv.DictReader(bow_file)
                for bow in bow_reader:
                    score = bm25(eval(query_line['narrative']), bow['abs_model'],
                                 inv_list_file, avdoclength)
                    scores[bow['id']] = score
            sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for key, value in sorted_scores:
                # TREC run format: query-id Q0 document-id rank score STANDARD
                res_vsm.write('{} Q0 {} {} {} STANDARD \n'.format(
                    query_line['id'], key, rank, str(value)))
                rank += 1
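For reference, a minimal sketch of the Okapi BM25 scorer these callers assume. The signature bm25(query_terms, document, doc_freqs, avdl) mirrors the call site above, but the doc_freqs lookup, n_docs, and the k1/b defaults are assumptions, not the project's actual implementation.

import math


def bm25(query_terms, document, doc_freqs, avdl, n_docs=1000, k1=1.5, b=0.75):
    # doc_freqs: hypothetical dict mapping term -> number of documents containing it.
    terms = document.split()
    doc_len = len(terms)
    score = 0.0
    for term in query_terms:
        tf = terms.count(term)
        if tf == 0:
            continue
        df = doc_freqs.get(term, 0)
        # Standard BM25 idf with the +1 smoothing to keep scores non-negative.
        idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1)
        score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avdl))
    return score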
def search(query, metric):
    query = preprocess(query)
    if metric == 'tfidf':
        result = tfidf(query)
    elif metric == 'bm25':
        result = bm25(query)
    elif metric == 'fasttext':
        result = fasttext(query)
    elif metric == 'elmo':
        result = elmo(query)
    else:
        # Fail loudly instead of returning an unbound variable.
        raise ValueError('Unknown metric: {}'.format(metric))
    return result
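A possible usage sketch; the query string is illustrative, and it assumes each backend returns a ranked list of (doc_id, score) pairs.

# Hypothetical call; adjust unpacking to whatever the backends actually return.
results = search("coronavirus vaccine trials", metric='bm25')
for doc_id, score in results[:10]:
    print(doc_id, score)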
print("Done") f = open('Tweets and summary.txt', 'w') # for every tweet for i in range(0, len(tags_ANI)): complete_query, similar_articles = extract_articles(tags_ANI[i], webhose_tokens) if similar_articles == "@@NO_DOCS_AVAILABLE@@": f.write(tweets_ANI[i] + "\n") f.write(complete_query + "\n") f.write("ERROR: Could not extract articles for the given tweet"+ "\n") f.close() continue final_articles = bm25.bm25(complete_query, similar_articles) summary_breaking_news = summary.bm25(complete_query, final_articles) f.write(tweets_ANI[i] + "\n") f.write(complete_query + "\n") f.write(summary_breaking_news + "\n") f.write("----------------------------------------------------\n") print("----------------------------------------------------\n") print(tweets_ANI[i]) print(summary_breaking_news) # break print("DOne summarizing") f.close() # @PTI_News
import copy

import nltk
from geotext import GeoText
from joblib import Parallel, delayed

import bm25            # project-local BM25 ranking module
import RecomParallel   # project-local scraping module


def nouns(index):
    f = open("..\\sentimental_analysis\\docs\\" + str(index) + ".txt", "r", encoding='utf-8')
    doc = []
    # str.replace returns a new string, so the result must be reassigned.
    lines = [line.replace("\n", "") for line in f.readlines()]
    lines = "".join(lines)

    # Get title and content for single-doc summarization.
    f.seek(0, 0)
    title = f.readline()
    content = "".join(f.readlines())
    content = content.replace("\\n", "")
    content = [content]
    f.close()

    temp = []
    temp.append([title])
    temp.append(content)
    doc.append(temp)

    # POS-tag the text and seed the keyword list with place names.
    text = nltk.word_tokenize(lines)
    tags = nltk.pos_tag(text)
    places = GeoText(lines)
    relWords = places.cities + places.countries

    # Merge runs of consecutive NNP tokens into single multi-word entities.
    i = 0
    while i < len(tags):
        temp = ""
        flag = False
        index = i
        while index < len(tags) and tags[index][1] == 'NNP':
            if flag:
                temp += " " + tags[index][0]
            else:
                temp += tags[index][0]
            index = index + 1
            flag = True
        relWords.append(temp)
        if not flag:
            i = i + 1
        else:
            i = index
    relWords = list(set(relWords))

    stop_words = [
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December", "Rs", "Cr",
        "Lakh", "Thousand", "Crore", "Kg", "Gram", "Watch", "Sources",
        "Watch live", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "I", "We", "He", "She", "It", "You", "They",
        "We're"
    ]
    puncts = ["{", "}", "[", "]", "(", ")"]

    # Drop empty strings and stop words, then strip bracket characters.
    words = []
    for word in relWords:
        if word != '' and word not in stop_words:
            words.append(word)
    for index, word in enumerate(words):
        for p in puncts:
            if p in word:
                words[index] = words[index].replace(p, "")

    # Keep only words that are not substrings of another kept word.
    new_list = []
    flag = False
    for i in range(len(words)):
        for j in range(len(words)):
            if i == j:
                continue
            if words[i] in words[j]:
                flag = True
                break
        if flag:
            flag = False
            continue
        else:
            new_list.append(words[i])
    relWords = copy.copy(new_list)

    query = " ".join(relWords)

    def call_bm25():
        # Scrape candidate documents for each keyword in parallel.
        allDocs = Parallel(n_jobs=3)(
            delayed(RecomParallel.Scrape)(word) for word in relWords)
        return allDocs

    allDocs = call_bm25()
    # 4D list --- [ [ [[title],[text]], [[title],[text]] for query1 ], ... ]

    # Flatten the per-query lists into a single list of documents.
    temp = []
    for queryList in allDocs:
        for eachDoc in queryList:
            temp.append(eachDoc)
    allDocs = temp

    final = bm25.bm25(query, allDocs)
    f = open("Summary.txt", "w")
    f.write(final)
    f.close()
    print("STR:", final)
def call_bm25(self):
    if len(self.final_doc) == 0:
        return "Could not retrieve articles"
    summary = bm25.bm25(self.query, self.final_doc)
    return summary
import json
import sys
import time

import bm25  # project-local BM25 module

c_weight = int(sys.argv[3]) / 100

with open(queries, 'r') as fr:
    q_lines = fr.readlines()

# Map each query id to the list of its candidate document ids.
qid_to_documents = {}
for line in q_lines:
    jl = json.loads(line)
    just_doc_ids = []
    for doc_item in jl['documents']:
        just_doc_ids.append(doc_item['doc_id'])
    qid_to_documents[jl['qid']] = just_doc_ids

print("running bm25...")
start = time.time()
results = bm25.bm25(queries)
print("done")

"""
with open('data/query_rankings.csv', 'r') as fr:
    csv_lines = fr.readlines()
"""

total_docs = 0
print("reranking...")
curr_csv_ind = 0
"""
if curr_csv_ind == len(csv_lines):
    csv_lines.append('raw_bm25,relevance,gender_score,\n')
else:
    csv_lines[curr_csv_ind] = csv_lines[curr_csv_ind][:-1]
    csv_lines[curr_csv_ind] += 'disp_impact_balanced_per_query,relevance,gender_score,\n'
"""
def call_bm25(self):
    smry = bm25.bm25(self.query, self.final_doc)
    # print(len(self.final_doc))
    return smry
def call_bm25(self):
    if len(self.final_doc) == 0:
        return []
    similar_docs = bm25.bm25(self.query, self.final_doc)
    return similar_docs