def call_bm25(self):
    # No candidate documents to rank against the query.
    if len(self.final_doc) == 0:
        return []
    similarDocs = recomMatch.bm25(self.query, self.final_doc)
    return similarDocs
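
# Usage sketch (hypothetical names): given a recommender object whose
# `query` and `final_doc` attributes are already populated,
# `instance.call_bm25()` returns the documents in final_doc ranked by
# BM25 against the query, or [] when there is nothing to rank.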
import copy
import os

import nltk
from geotext import GeoText
from joblib import Parallel, delayed

# Project-local modules, assumed importable from the calls below.
import recomMatch
import RecomParallel
import SingleDocSumm


def recom(index):
    # Seed document for the recommendation, one file per index.
    f = open(os.path.join("..", "sentimental_analysis", "docs",
                          str(index) + ".txt"),
             "r",
             encoding='utf-8')
    doc = []

    # The first line of the file is the title; the rest is the body.
    title = f.readline()

    content = f.readlines()
    content = "".join(content)
    # str.replace returns a new string, so assign the result back;
    # replacing newlines with spaces keeps word boundaries intact.
    content = content.replace("\n", " ")
    content = [content]

    # Build the [[title], [text]] structure SingleDocSumm.bm25 expects.
    doc.append([[title], content])
    f.close()

    # Summarize the seed document; doc has the shape [[[title], [text]]].
    lines = SingleDocSumm.bm25(doc)

    # POS-tag the summary so proper nouns can be extracted as keywords.
    text = nltk.word_tokenize(lines)
    tags = nltk.pos_tag(text)

    # Seed the keyword list with any place names GeoText finds.
    places = GeoText(lines)
    relWords = places.cities + places.countries

    # Merge runs of consecutive proper nouns (NNP) into single phrases.
    i = 0
    while i < len(tags):
        temp = ""
        flag = False
        index = i
        while index < len(tags) and tags[index][1] == 'NNP':
            if flag:
                temp += " " + tags[index][0]
            else:
                temp += tags[index][0]
            index = index + 1
            flag = True
        if flag:
            relWords.append(temp)
            i = index
        else:
            i = i + 1

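    # Example: the tag sequence [('New', 'NNP'), ('Delhi', 'NNP'),
    # ('is', 'VBZ')] contributes the single phrase "New Delhi" to relWords.
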
    relWords = list(set(relWords))

    stop_words = [
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December", "Rs", "Cr",
        "Lakh", "Thousand", "Crore", "Kg", "Gram", "Watch", "Sources",
        "Watch live", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "I", "We", "He", "She", "It", "You", "They",
        "We're"
    ]
    puncts = ["{", "}", "[", "]", "(", ")"]

    # Drop empty strings and hand-picked noise words.
    words = []
    for word in relWords:
        if word != '' and word not in stop_words:
            words.append(word)

    # Strip leftover bracket characters from the phrases.
    for index, word in enumerate(words):
        for p in puncts:
            if p in word:
                words[index] = words[index].replace(p, "")

    # Keep only maximal phrases: drop any entry that is a substring of
    # another entry.
    new_list = []
    flag = False
    for i in range(len(words)):
        for j in range(len(words)):
            if i == j:
                continue
            if words[i] in words[j]:
                flag = True
                break
        if flag:
            flag = False
        else:
            new_list.append(words[i])

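    # Example: with words ['Delhi', 'New Delhi'], 'Delhi' is dropped
    # because it is a substring of 'New Delhi', leaving the longer phrase.
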
    relWords = copy.copy(new_list)
    # Join the surviving phrases into a single BM25 query string.
    query = " ".join(relWords)
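    # Example: relWords like ['New Delhi', 'India'] produce the query
    # "New Delhi India".
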
    def call_bm25():
        # Webhose API tokens, rotated across queries to spread rate limits.
        webhose_tokens = [
            "e6c1084e-8b63-42cf-bfe3-8ccd24a3a9b1",
            "cb5edba5-48ad-4afa-8d69-ab1ff1092835",
            "de684b78-5b7e-4c6a-a3d4-4efe80f3e1de",
            "39d754b3-64c2-48a2-ba0e-5d147168bda7",
            "191c39da-2f67-4af6-9608-460cc6293108",
            "a4423ef6-f4a3-4231-a0e0-2aa38546facf",
            "e25db40a-f3c3-4d12-92f0-73251732b389"
        ]

        # Pair each query phrase with a token, cycling through the tokens.
        tags_with_tokens = []
        for i in range(len(relWords)):
            tags_with_tokens.append(
                [relWords[i], webhose_tokens[i % len(webhose_tokens)]])

        # Scrape articles for each phrase in parallel.
        allDocs = Parallel(n_jobs=5)(
            delayed(RecomParallel.Scrape)(single_tag_with_token)
            for single_tag_with_token in tags_with_tokens)
        return allDocs

    allDocs = call_bm25()

    # allDocs holds one list of [[title], [text]] documents per query
    # phrase; flatten it into a single list of documents.
    temp = []
    for queryList in allDocs:
        for eachDoc in queryList:
            temp.append(eachDoc)
    allDocs = temp
    if len(allDocs) == 0:
        print("Cannot find relevant articles")
        return ""
    final = recomMatch.bm25(query, allDocs)
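    # `final` is assumed to be the scraped [[title], [text]] documents
    # ranked by BM25 relevance to the query.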
    summary = []
    index = 1
    for doc in final:
        # Persist each recommended article as <index>.txt: title, blank
        # line, then body.
        f = open(str(index) + ".txt", "w")
        f.write(doc[0][0])
        f.write("\n\n")
        f.write(doc[1][0])
        f.close()
        index = index + 1

        # Summarize the article and skip duplicate summaries.
        sdoc = SingleDocSumm.bm25([doc])
        if [sdoc] not in summary:
            summary.append([sdoc])

    # Print each summary and assemble the final answer.
    ans = ""
    for index in range(len(summary)):
        header = "Article " + str(index + 1) + ": "
        print(header)
        print("~~")
        print(summary[index][0])
        print("\n")
        print("~~")
        # Accumulate every summary instead of keeping only the last one.
        ans += header + "\n" + summary[index][0] + "\n\n"
    return ans
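

# Minimal usage sketch (assumptions: ../sentimental_analysis/docs/1.txt
# exists, the recomMatch / SingleDocSumm / RecomParallel modules are on
# the path, and the NLTK 'punkt' and 'averaged_perceptron_tagger' data
# are installed):
if __name__ == "__main__":
    recommendations = recom(1)
    print(recommendations)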