def articut4PatentBot(categoryFILE, inputSTR):
    """Compare a user's claim text against every patent in one category.

    The user text and each patent text are parsed with Articut, and three
    cosine similarities are computed per patent: on verb stems, on noun
    stems, and on TF-IDF keywords.

    Args:
        categoryFILE: Key of the category to look up in ``Dataset/patent.json``.
        inputSTR: Claim text entered by the user; spaces and newlines are
            stripped before parsing.

    Returns:
        A dict with four entries:
            "Verb", "Noun", "TFIDF": ``{certificate_number: best_similarity}``
                for the best-matching patent under that feature.
            "All_Max": ``{certificate_number: [best_similarity, feature_name]}``
                for the highest of the three; ties are resolved in the order
                noun, verb, TF-IDF.

    Raises:
        ValueError: If the selected category contains no patents.
        KeyError: If ``categoryFILE`` is not a key of the patent data, or the
            account file lacks "username"/"apikey".
    """
    with open("account.info", encoding="utf-8") as f:
        userinfoDICT = json.load(f)
    articut = Articut(username=userinfoDICT["username"], apikey=userinfoDICT["apikey"], level="lv1")

    # Load the patent texts of the requested category.
    # A forward slash is used on purpose: the former "Dataset\patent.json"
    # contained the invalid escape "\p" and only resolved on Windows, whereas
    # "/" is accepted by open() on every platform.
    with open("Dataset/patent.json", encoding="utf-8") as f:
        AllpatentDICT = json.load(f)
    patentDICT = AllpatentDICT[categoryFILE]
    if not patentDICT:
        # Fail early and clearly instead of max() raising on an empty list
        # after the user text has already been sent for parsing.
        raise ValueError("No patents found for category {!r}".format(categoryFILE))
    CertificateNumber = list(patentDICT.keys())

    # Parse the user's claim text once; its features do not depend on the
    # patent being compared, so they are computed outside the loop.
    # NOTE(review): assumes counterCosineSimilarity() does not mutate its
    # arguments - confirm against its definition.
    userSTR = inputSTR.replace(" ", "").replace("\n", "")
    userResultDICT = articut.parse(userSTR)
    userVerbCOUNT = Counter(wordExtractor(articut.getVerbStemLIST(userResultDICT), unify=True))
    userNounCOUNT = Counter(wordExtractor(articut.getNounStemLIST(userResultDICT), unify=True))
    userTFIDFCOUNT = Counter(articut.analyse.extract_tags(userResultDICT))

    # Compare every patent of the category against the user text.
    # The three score lists are index-aligned with CertificateNumber.
    VerbCosineSimilarity = []
    NounCosineSimilarity = []
    TFIDFCosineSimilarity = []
    for patentSTR in patentDICT.values():
        STR = patentSTR.replace(" ", "").replace("\n", "")
        STRResultDICT = articut.parse(STR)

        # Verb stems as the feature set.
        patentVerbCOUNT = Counter(wordExtractor(articut.getVerbStemLIST(STRResultDICT), unify=True))
        VerbCosineSimilarity.append(counterCosineSimilarity(patentVerbCOUNT, userVerbCOUNT))

        # Noun stems as the feature set.
        patentNounCOUNT = Counter(wordExtractor(articut.getNounStemLIST(STRResultDICT), unify=True))
        NounCosineSimilarity.append(counterCosineSimilarity(patentNounCOUNT, userNounCOUNT))

        # TF-IDF keywords as the feature set.
        patentTFIDFCOUNT = Counter(articut.analyse.extract_tags(STRResultDICT))
        TFIDFCosineSimilarity.append(counterCosineSimilarity(patentTFIDFCOUNT, userTFIDFCOUNT))

    ArticutresultDICT = {}

    # Best match per feature; index() returns the first patent on ties.
    max_Verb = max(VerbCosineSimilarity)
    v = VerbCosineSimilarity.index(max_Verb)
    ArticutresultDICT["Verb"] = {CertificateNumber[v]: max_Verb}

    max_Noun = max(NounCosineSimilarity)
    n = NounCosineSimilarity.index(max_Noun)
    ArticutresultDICT["Noun"] = {CertificateNumber[n]: max_Noun}

    max_TFIDF = max(TFIDFCosineSimilarity)
    t = TFIDFCosineSimilarity.index(max_TFIDF)
    ArticutresultDICT["TFIDF"] = {CertificateNumber[t]: max_TFIDF}

    # Overall best match across the three features.
    ArticutresultDICT["All_Max"] = {}
    m = max(max_Verb, max_Noun, max_TFIDF)
    if m == max_Noun:
        ArticutresultDICT["All_Max"][CertificateNumber[n]] = [m, "名詞"]
    elif m == max_Verb:
        ArticutresultDICT["All_Max"][CertificateNumber[v]] = [m, "動詞"]
    elif m == max_TFIDF:
        ArticutresultDICT["All_Max"][CertificateNumber[t]] = [m, "TF-IDF"]

    return ArticutresultDICT
# Articut stores the POS-tagged segmentation result under "result_pos".
print(result["result_pos"])

# Count how many occurrences of "研究" were tagged as a verb.
print("有幾個動詞的「研究」呢?")
verbLIST = articut.getVerbStemLIST(result, indexWithPOS=False)
verbCounter = 0
for sentence in verbLIST:
    # Each sentence in verbLIST is its own list, so step into the sentence
    # first and then look for "研究" among its entries.
    for v in sentence:
        # The last element of an entry is the word; the first two are
        # printed as its start/end position in the original sentence.
        if v[-1] == "研究":
            print("發現動詞「研究」 ,位於原句的 {}~{} 位置".format(v[0], v[1]))
            verbCounter = verbCounter + 1
print("共有 {} 個『研究』是動詞。".format(verbCounter))

# Count how many occurrences of "研究" were tagged as a noun.
print("有幾個名詞的「研究」呢?")
nounLIST = articut.getNounStemLIST(result, indexWithPOS=False)
nounCounter = 0
for sentence in nounLIST:
    # Each sentence in nounLIST is its own list, so step into the sentence
    # first and then look for "研究" among its entries.
    for n in sentence:
        if n[-1] == "研究":
            print("發現名詞「研究」 ,位於原句的 {}~{} 位置".format(n[0], n[1]))
            nounCounter = nounCounter + 1
# Report the noun tally; this line previously printed verbCounter by mistake.
print("共有 {} 個『研究』是名詞。".format(nounCounter))