Example #1
import csv
import os

# bm25() and avdl() are helpers defined elsewhere in this project; they are
# called below but not shown in this snippet.
def run_vsm(model_path, results_path):
    csv.field_size_limit(9223372036854775807)
    with open(os.getcwd() + "/CORD-19/inverse_list.csv", encoding='utf8') as inv_list_file, \
         open(os.getcwd() + results_path, mode='w', encoding='utf8', newline='') as res_vsm, \
         open(os.getcwd() + '/CORD-19/preprocessed_queries.csv', encoding='utf8') as query_file:
        inv_list_reader = csv.DictReader(inv_list_file)
        query_reader = csv.DictReader(query_file)
        with open(os.getcwd() + model_path, encoding='utf8') as bow_file:
            bow_reader = csv.DictReader(bow_file)
            docs = [bow['abs_model'] for bow in bow_reader]
            avdoclength = avdl(docs)
            print(avdoclength)

        for query_line in query_reader:
            scores = {}
            rank = 1
            with open(os.getcwd() + model_path, encoding='utf8') as bow_file:
                bow_reader = csv.DictReader(bow_file)
                for bow in bow_reader:
                    score = bm25(eval(query_line['narrative']),
                                 bow['abs_model'], inv_list_file, avdoclength)
                    scores[bow['id']] = score

            sorted_scores = sorted(scores.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
            for key, value in sorted_scores:
                # query-id Q0 document-id rank score STANDARD
                res_vsm.write('{} Q0 {} {} {} STANDARD \n'.format(
                    query_line['id'], key, rank, value))
                rank += 1
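The bm25() and avdl() helpers called above are project-specific and not shown on this page. As a point of reference, a minimal self-contained sketch of the Okapi BM25 scoring they presumably implement could look like the following; all names, signatures, and parameter defaults here are illustrative assumptions, not the original code.

import math

def avdl_sketch(tokenized_docs):
    # average document length over the whole collection
    return sum(len(doc) for doc in tokenized_docs) / len(tokenized_docs)

def bm25_sketch(query_terms, doc_terms, doc_freq, n_docs, avgdl, k1=1.5, b=0.75):
    # doc_freq maps a term to the number of documents containing it
    score = 0.0
    doc_len = len(doc_terms)
    for term in query_terms:
        tf = doc_terms.count(term)
        if tf == 0:
            continue
        df = doc_freq.get(term, 0)
        idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1)
        score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avgdl))
    return score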
Example #2
def search(query, metric):
    query = preprocess(query)

    if metric == 'tfidf':
        result = tfidf(query)
    elif metric == 'bm25':
        result = bm25(query)
    elif metric == 'fasttext':
        result = fasttext(query)
    elif metric == 'elmo':
        result = elmo(query)
    else:
        raise ValueError('unknown metric: ' + metric)

    return result
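A hypothetical call might look like the following; the query text and metric name are made up for illustration.

ranked = search("coronavirus transmission in schools", "bm25")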
    print("Done")
    f = open('Tweets and summary.txt', 'w')

    # for every tweet
    for i in range(len(tags_ANI)):

        complete_query, similar_articles = extract_articles(tags_ANI[i], webhose_tokens)

        if similar_articles == "@@NO_DOCS_AVAILABLE@@":
            f.write(tweets_ANI[i] + "\n")
            f.write(complete_query + "\n")
            f.write("ERROR: Could not extract articles for the given tweet"+ "\n")
            f.close()
            continue

        final_articles = bm25.bm25(complete_query, similar_articles)
        summary_breaking_news = summary.bm25(complete_query, final_articles)
        f.write(tweets_ANI[i] + "\n")
        f.write(complete_query + "\n")
        f.write(summary_breaking_news + "\n")
        f.write("----------------------------------------------------\n")
        print("----------------------------------------------------\n")
        print(tweets_ANI[i])
        print(summary_breaking_news)
        # break
    print("DOne summarizing")
    f.close()

# @PTI_News
Example #4
import copy

import nltk
from geotext import GeoText
from joblib import Parallel, delayed

# RecomParallel and bm25 are project-local modules used further below.
def nouns(index):
    f = open("..\\sentimental_analysis\\docs\\" + str(index) + ".txt",
             "r",
             encoding='utf-8')
    doc = []
    lines = f.readlines()

    # strip newlines before joining (a space keeps word boundaries for tokenization)
    lines = [line.replace("\n", " ") for line in lines]
    lines = "".join(lines)

    # Get title and content for single-doc summarization
    f.seek(0, 0)
    title = f.readline()

    content = f.readlines()
    content = "".join(content)
    content = content.replace("\n", " ")
    content = [content]
    f.close()
    temp = []
    temp.append([title])
    temp.append(content)
    doc.append(temp)

    # blob = TextBlob(lines)
    # #print("TEXT BL:",list(set(blob.noun_phrases)))
    ##print("REL :",relWords)

    text = nltk.word_tokenize(lines)
    tags = nltk.pos_tag(text)

    relWords = []
    places = GeoText(lines)
    cities = places.cities
    countries = places.countries
    relWords = cities + countries
    temp = ""
    i = 0
    # group runs of consecutive proper nouns (NNP) into multi-word names
    while i < len(tags):

        temp = ""
        flag = False
        index = i
        ##print("t:",tags[index])
        while index < len(tags) and tags[index][1] == 'NNP':
            if flag:
                temp += " " + tags[index][0]

            else:
                temp += tags[index][0]

            index = index + 1

            flag = True
        relWords.append(temp)
        temp = ""
        if flag is False:
            i = i + 1
        else:
            i = index

    relWords = list(set(relWords))

    stop_words = [
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December", "Rs", "Cr",
        "Lakh", "Thousand", "Crore", "Kg", "Gram", "Watch", "Sources",
        "Watch live", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "I", "We", "He", "She", "It", "You", "They",
        "We're"
    ]
    puncts = ["{", "}", "[", "]", "(", ")"]

    ##print("REL WORDS1 ",relWords)
    words = []
    for word in relWords:
        if word != '' and word not in stop_words:
            words.append(word)

    for index, word in enumerate(words):
        for p in puncts:
            if p in word:
                words[index] = words[index].replace(p, "")

    ##print("WORDS: ",words)
    # drop any word that is a substring of another extracted word
    new_list = []
    flag = False
    for i in range(len(words)):
        for j in range(len(words)):
            if i == j:
                continue
            if words[i] in words[j]:
                flag = True
                break
        if flag:
            flag = False
            continue
        else:
            new_list.append(words[i])
    relWords = copy.copy(new_list)
    #print("REL :",relWords)
    query = " ".join(relWords)

    ##print("QUERY ",query)
    def call_bm25():
        # scrape candidate documents for each extracted keyword in parallel
        allDocs = Parallel(n_jobs=3)(delayed(RecomParallel.Scrape)(word)
                                     for word in relWords)
        return allDocs

    allDocs = call_bm25()

    #4D list ---  [ [ [[title],[text]],[[title],[text]] for query1 ]     [[[]]]   ]

    #print("ALL DOc  ")
    # result=RecomSemantic.rec(starred_doc,summaries)
    # #print("\n\nRES:",result)
    #for doc in allDocs:
    #print("\n",doc)
    # f1.write(doc)
    # f1.write("\n")
    # flatten the per-query document lists into a single list
    temp = []
    for queryList in allDocs:
        for eachDoc in queryList:
            temp.append(eachDoc)

    allDocs = temp
    final = bm25.bm25(query, allDocs)
    #print("\nFINAL  ")
    f = open("Summary.txt", "w")
    f.write(final)
    f.close()
    print("STR:", final)
Example #5
    def call_bm25(self):
        if len(self.final_doc) == 0:
            return "Could not retrieve articles"

        summary = bm25.bm25(self.query, self.final_doc)
        return summary
Example #6
    c_weight = int(sys.argv[3]) / 100

    with open(queries, 'r') as fr:
        q_lines = fr.readlines()

    qid_to_documents = {}
    for line in q_lines:
        jl = json.loads(line)
        just_doc_ids = []
        for doc_item in jl['documents']:
            just_doc_ids.append(doc_item['doc_id'])
        qid_to_documents[jl['qid']] = just_doc_ids
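    # For illustration only: judging from the parsing above, each input line is
    # presumably JSON of the form
    #   {"qid": "q1", "documents": [{"doc_id": "d7"}, {"doc_id": "d2"}]}
    # which would yield qid_to_documents == {"q1": ["d7", "d2"]}.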

    print("running bm25...")
    start = time.time()
    results = bm25.bm25(queries)
    print("done")
    """
    with open('data/query_rankings.csv', 'r') as fr:
        csv_lines = fr.readlines()
    """
    total_docs = 0

    print("reranking...")
    curr_csv_ind = 0
    """
    if curr_csv_ind == len(csv_lines):
        csv_lines.append('raw_bm25,relevance,gender_score,\n')
    else:
        csv_lines[curr_csv_ind] = csv_lines[curr_csv_ind][:-1]
        csv_lines[curr_csv_ind] += 'disp_impact_balanced_per_query,relevance,gender_score,\n' 
Example #7
    def call_bm25(self):
        smry = bm25.bm25(self.query, self.final_doc)
        return smry
Example #8
    def call_bm25(self):
        if len(self.final_doc) == 0:
            return []
        similar_docs = bm25.bm25(self.query, self.final_doc)
        return similar_docs