Exemple #1
0
def articut4PatentBot(categoryFILE, inputSTR):
    """Find the patent text in one category most similar to the user's text.

    Every patent text stored under ``categoryFILE`` in ``Dataset/patent.json``
    is parsed with Articut and compared with ``inputSTR`` using three feature
    sets: verb stems, noun stems and TF-IDF tags. Each comparison is the
    cosine similarity of the two feature Counters.

    Args:
        categoryFILE: Key of the category to look up in ``Dataset/patent.json``.
        inputSTR: The user's text; spaces and newlines are removed before
            it is parsed.

    Returns:
        A dict with four keys. ``"Verb"``, ``"Noun"`` and ``"TFIDF"`` each map
        to ``{certificate_number: best_similarity}`` for that feature set.
        ``"All_Max"`` maps to ``{certificate_number: [similarity, label]}`` for
        the best score overall, where ``label`` is ``"名詞"``, ``"動詞"`` or
        ``"TF-IDF"``. On ties the noun score wins, then verb, then TF-IDF.

    Raises:
        KeyError: If ``categoryFILE`` is not a key of the patent file, or the
            account file lacks ``username`` / ``apikey``.
        ValueError: If the selected category contains no patents.
    """
    with open("account.info", encoding="utf-8") as f:
        userinfoDICT = json.load(f)

    articut = Articut(username=userinfoDICT["username"], apikey=userinfoDICT["apikey"], level="lv1")

    # Load the patent texts of the requested category. A forward slash is used
    # because "\p" is an invalid escape sequence and a backslash is only a
    # path separator on Windows.
    with open("Dataset/patent.json", encoding="utf-8") as f:
        patentDICT = json.load(f)[categoryFILE]

    # Fail early, before spending any API call; max() below would otherwise
    # raise an unhelpful "empty sequence" ValueError.
    if not patentDICT:
        raise ValueError("No patents found for category {!r}".format(categoryFILE))

    def _featureCOUNTS(resultDICT):
        """Return the (verb, noun, TF-IDF) feature Counters of a parse result."""
        return (
            Counter(wordExtractor(articut.getVerbStemLIST(resultDICT), unify=True)),
            Counter(wordExtractor(articut.getNounStemLIST(resultDICT), unify=True)),
            Counter(articut.analyse.extract_tags(resultDICT)),
        )

    # The user's features do not depend on the patent being compared, so they
    # are computed once instead of once per patent.
    userSTR = inputSTR.replace(" ", "").replace("\n", "")
    userCOUNTS = _featureCOUNTS(articut.parse(userSTR))

    # Compare the user's text against every patent text in the category.
    CertificateNumber = []
    similarityDICT = {"Verb": [], "Noun": [], "TFIDF": []}
    for number, patentSTR in patentDICT.items():
        STR = patentSTR.replace(" ", "").replace("\n", "")
        patentCOUNTS = _featureCOUNTS(articut.parse(STR))
        CertificateNumber.append(number)
        for feature, patentCOUNT, userCOUNT in zip(similarityDICT, patentCOUNTS, userCOUNTS):
            similarityDICT[feature].append(counterCosineSimilarity(patentCOUNT, userCOUNT))

    # Best-scoring patent for each feature set (first one wins on ties).
    ArticutresultDICT = {}
    bestDICT = {}
    for feature, simLIST in similarityDICT.items():
        maxSIM = max(simLIST)
        number = CertificateNumber[simLIST.index(maxSIM)]
        bestDICT[feature] = (number, maxSIM)
        ArticutresultDICT[feature] = {number: maxSIM}

    # Overall best score; tie-break priority is noun, verb, TF-IDF.
    m = max(sim for _, sim in bestDICT.values())
    ArticutresultDICT["All_Max"] = {}
    for feature, label in (("Noun", "名詞"), ("Verb", "動詞"), ("TFIDF", "TF-IDF")):
        number, sim = bestDICT[feature]
        if sim == m:
            ArticutresultDICT["All_Max"][number] = [m, label]
            break

    return ArticutresultDICT
# Articut's POS-tagged segmentation result is stored under "result_pos".
# NOTE(review): `result` and `articut` are not defined in this fragment;
# presumably they come from an earlier articut.parse() call — confirm.
print(result["result_pos"])



print("有幾個動詞的「研究」呢?")
verbLIST = articut.getVerbStemLIST(result, indexWithPOS=False)

verbCounter = 0
# In verbLIST every sentence is its own list, so step into each sentence
# before counting the occurrences of "研究" inside it.
for sentence in verbLIST:
    for v in sentence:
        # The code reads v[0] and v[1] as the positions in the original
        # sentence and v[-1] as the word itself.
        if v[-1] == "研究":
            print("發現動詞「研究」 ,位於原句的 {}~{} 位置".format(v[0], v[1]))
            verbCounter += 1

print("共有 {} 個『研究』是動詞。".format(verbCounter))



print("有幾個名詞的「研究」呢?")
nounLIST = articut.getNounStemLIST(result, indexWithPOS=False)

nounCounter = 0
# In nounLIST every sentence is its own list, so step into each sentence
# before counting the occurrences of "研究" inside it.
for sentence in nounLIST:
    for n in sentence:
        if n[-1] == "研究":
            print("發現名詞「研究」 ,位於原句的 {}~{} 位置".format(n[0], n[1]))
            nounCounter += 1

# Report the noun count; the original printed verbCounter here by mistake.
print("共有 {} 個『研究』是名詞。".format(nounCounter))