def addTagSynonym(
        SEDictPath="../result/step5.1.4_ExtSEDict_fasttext_V5.dict",
        topn=50,
        savePath="../result/step5.1.5_ExtSEDict_fasttext_50_V5.dict",
        rawDiPath="../result/step4.2.1_SemanticallyRelatedTerms_fasttext_V5.dict",
        rawDict=None,
        simThres=0.6,
        tagsPath="../result/Eva5Tags.dict",
        minTimes=100):
    """Extend the SE dictionary with related terms of frequent tags.

    Part of the usefulness evaluation.  For every tag whose count is at
    least ``minTimes`` and that appears both in the raw related-terms
    dictionary and in the SE dictionary (as a key or as a member of slot 0
    or slot 1 of some entry), the first ``topn`` related terms are
    classified and merged into the entry that owns the tag.  The updated
    dictionary is written to ``savePath`` with ``joblib.dump``.

    Args:
        SEDictPath: joblib file with the SE dictionary.  Only ``value[0]``
            and ``value[1]`` of each entry are read and extended here
            (presumably abbreviations and synonyms -- confirm against the
            step that writes this file).
        topn: number of leading related terms considered per tag.
        savePath: destination of the updated SE dictionary.
        rawDiPath: joblib file with the raw related-terms dictionary; only
            loaded when ``rawDict`` is None.
        rawDict: already-loaded raw dictionary.  Each value is a sequence
            whose items are indexed with ``[0]`` to obtain the term.
        simThres: threshold compared against ``StrSims(tag, term)``.
        tagsPath: joblib file mapping tag -> count.
        minTimes: tags with a count below this value are ignored.

    Returns:
        None.  The result is only persisted to ``savePath``.
    """
    tags = joblib.load(tagsPath)
    SEDict = joblib.load(SEDictPath)
    rawDi = joblib.load(rawDiPath) if rawDict is None else rawDict

    # Reverse index: every member of slot 0 / slot 1, and the key itself,
    # points back to the key of the entry that contains it.
    reverseDict = {}
    for key, value in SEDict.items():
        for member in value[0]:
            reverseDict[member] = key
        for member in value[1]:
            reverseDict[member] = key
        reverseDict[key] = key

    for tag, times in tags.items():
        if times < minTimes:
            continue
        if tag not in rawDi or tag not in reverseDict:
            continue

        slot0, slot1 = [], []
        for related in rawDi[tag][0:topn]:
            word = related[0]
            if StrSims(tag, word) < simThres:
                slot1.append(word)
            elif isAbrreviation(tag, word):
                slot0.append(word)
            elif isAbrreviation(word, tag) or tag in word or word in tag:
                slot1.append(word)

        entry = SEDict[reverseDict[tag]]
        entry[0].extend(slot0)
        entry[1].extend(slot1)
        # Order-preserving de-duplication: list(set(...)) would shuffle the
        # strings differently on every run because of hash randomisation,
        # making the saved dictionary non-reproducible.
        entry[1] = list(dict.fromkeys(entry[1]))
        entry[0] = list(dict.fromkeys(entry[0]))

    joblib.dump(SEDict, savePath)
def discriminateWords(modelName="fastText"):
    """Split every synonym group into abbreviations, synonyms and the rest.

    Loads ``../result/step4.4.1_SynonymFullName_<model>.dict`` (model name
    lower-cased).  For each key, element 0 of its value is the
    representative word (the key itself is used when that element is None)
    and the remaining elements are the variants.  Hyphens and underscores
    are replaced by spaces in the key, the representative word and the
    variants before classification.

    Args:
        modelName: model name used (lower-cased) in the input and output
            file names.

    Returns:
        dict mapping the normalised key to
        ``[representWord, abbreviations, synonyms, rest]``.  The same dict
        is dumped to ``../result/FinalDict_<model>.dict``.
    """
    def _despace(text):
        # Treat "-" and "_" as word separators.
        return text.replace("-", " ").replace("_", " ")

    model_tag = modelName.lower()
    raw_dic = joblib.load("../result/step4.4.1_SynonymFullName_" + model_tag + ".dict")
    seperate_dic = {}

    for seen, key in enumerate(raw_dic, 1):
        if seen % 1000 == 0:
            print(seen)  # progress indicator

        entry = raw_dic[key]
        variants = entry[1:]
        if entry[0] is None:
            representWord = key
        else:
            representWord = entry[0]
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; the key is added as a variant only when it is not
            # itself the representative word -- confirm.
            variants.append(key)

        representWord = _despace(representWord)
        variants = [_despace(term) for term in variants]

        abbreviations, synonyms, rest = [], [], []
        for term in variants:
            if isSynonym(term, representWord):
                synonyms.append(term)
            elif isAbrreviation(representWord, term):
                abbreviations.append(term)
            else:
                rest.append(term)

        seperate_dic[_despace(key)] = [representWord, abbreviations, synonyms, rest]

    joblib.dump(seperate_dic, "../result/FinalDict_" + model_tag + ".dict")
    return seperate_dic
def addNotMatch(SEDictPath="../result/step5.1.3_ExtSEDict_fasttext_V5.dict",
                topn=50,
                savePath="../result/step5.1.6_ExtSEDict_fasttext_50_V5.dict",
                m=None,
                simThres=0.6,
                mispelPath="../result/Eva6.1NotMatch_fasttext.list"):
    """Extend SE dictionary entries using the "not matched" word list (Eva6.1, step 5.1.6).

    Every word of the list that already occurs in the SE dictionary (as a
    key or inside slot 0 / slot 1 of an entry) is expanded with its
    ``topn`` most similar words from the embedding model; the classified
    neighbours are merged into the entry that owns the word.  The updated
    dictionary is written to ``savePath`` with ``joblib.dump``.

    Args:
        SEDictPath: joblib file with the SE dictionary.  Only ``value[0]``
            and ``value[1]`` of each entry are read and extended.
        topn: number of neighbours requested from the model.
        savePath: destination of the updated SE dictionary.
        m: embedding model exposing ``m.wv.most_similar``.  It is required
            as soon as at least one listed word is present in the
            dictionary.
        simThres: threshold compared against ``StrSims(word, neighbour)``.
        mispelPath: joblib file with the list of words to expand.

    Returns:
        None.  The result is only persisted to ``savePath``.
    """
    mispel = joblib.load(mispelPath)
    SEDict = joblib.load(SEDictPath)

    # Reverse index: member of slot 0 / slot 1 (or the key itself) -> key.
    reverseDict = {}
    for key, value in SEDict.items():
        for member in value[0]:
            reverseDict[member] = key
        for member in value[1]:
            reverseDict[member] = key
        reverseDict[key] = key

    for word in mispel:
        if word not in reverseDict:
            continue
        orikey = reverseDict[word]

        # A word the model cannot embed raises KeyError; skip it instead of
        # aborting the whole run, as the sibling functions do for
        # out-of-vocabulary words.
        try:
            similarWords = m.wv.most_similar(word, topn=topn)
        except KeyError:
            continue

        slot0, slot1 = [], []
        for item in similarWords:
            candidate = item[0]
            if StrSims(word, candidate) < simThres:
                slot1.append(candidate)
            elif isAbrreviation(word, candidate):
                slot0.append(candidate)
            elif (isAbrreviation(candidate, word)
                  or word in candidate or candidate in word):
                slot1.append(candidate)

        entry = SEDict[orikey]
        entry[0].extend(slot0)
        entry[1].extend(slot1)
        # Order-preserving de-duplication keeps the saved file reproducible
        # (list(set(...)) order changes with string hash randomisation).
        entry[1] = list(dict.fromkeys(entry[1]))
        entry[0] = list(dict.fromkeys(entry[0]))

    joblib.dump(SEDict, savePath)
def addWikiPort(SEDictPath="",
                addListPath="",
                simThres=0.6,
                topn=40,
                savePath="",
                modelName="",
                modelPath="",
                model=None):
    """Extend existing SE dictionary keys with their model neighbours.

    For each key of the add-list that is both a key of the SE dictionary
    and in the model vocabulary, the ``topn`` most similar words are
    classified and merged into slot 0 / slot 1 of that entry.  The updated
    dictionary is written to ``savePath`` with ``joblib.dump``.

    Args:
        SEDictPath: joblib file with the SE dictionary.
        addListPath: joblib file with the keys to expand.
        simThres: threshold compared against ``StrSims(key, neighbour)``.
        topn: number of neighbours requested from the model.
        savePath: destination of the updated SE dictionary.
        modelName: "fasttext" (case-insensitive) loads a FastText model,
            anything else a Word2Vec model; only used when ``model`` is
            None.
        modelPath: path of the model to load when ``model`` is None.
        model: already-loaded model exposing ``wv.vocab`` and
            ``wv.most_similar``.

    Returns:
        None.  The result is only persisted to ``savePath``.
    """
    SEDict = joblib.load(SEDictPath)
    addList = joblib.load(addListPath)

    if model is not None:
        m = model
    elif modelName.lower() == "fasttext":
        m = FastText.load(modelPath)
    else:
        m = Word2Vec.load(modelPath)
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; confirm whether this call was also meant to run for
        # FastText models.
        m.delete_temporary_training_data(True)

    for key in addList:
        if key not in SEDict or key not in m.wv.vocab:
            continue

        slot0, slot1 = [], []
        for item in m.wv.most_similar(key, topn=topn):
            candidate = item[0]
            if StrSims(key, candidate) < simThres:
                slot1.append(candidate)
            elif isAbrreviation(key, candidate):
                slot0.append(candidate)
            elif (isAbrreviation(candidate, key)
                  or key in candidate or candidate in key):
                slot1.append(candidate)

        entry = SEDict[key]
        entry[0].extend(slot0)
        entry[1].extend(slot1)
        # Order-preserving de-duplication keeps the saved file reproducible
        # (list(set(...)) order changes with string hash randomisation).
        entry[1] = list(dict.fromkeys(entry[1]))
        entry[0] = list(dict.fromkeys(entry[0]))

    joblib.dump(SEDict, savePath)
def DiscriminateTerms(dictPath="", savePath=""):
    """step4.2.4: classify the terms of every near-synonym group.

    Loads the dictionary at ``dictPath`` and, for each key, sorts its terms
    into three lists: index 0 holds terms for which
    ``isAbrreviation(key, term)`` is true, index 1 terms for which
    ``isSynonym(key, term)`` is true (checked first), index 2 everything
    else.

    Args:
        dictPath: joblib file mapping key -> iterable of terms.
        savePath: joblib file the classified dictionary is dumped to.

    Returns:
        dict mapping key -> ``[abbreviations, synonyms, others]``.
    """
    raw_dic = joblib.load(dictPath)
    seperate_dic = {}

    for seen, key in enumerate(raw_dic, 1):
        if seen % 1000 == 0:
            print(seen)  # progress indicator

        abbreviations, synonyms, others = [], [], []
        for term in raw_dic[key]:
            if isSynonym(key, term):
                bucket = synonyms
            elif isAbrreviation(key, term):
                bucket = abbreviations
            else:
                bucket = others
            bucket.append(term)

        seperate_dic[key] = [abbreviations, synonyms, others]

    joblib.dump(seperate_dic, savePath)
    return seperate_dic
def ExtendSynonym(raw_dictPath="", dictPath="", savePath=""): """ step5.1.3 扩充synonym dict """ # modelName="fasttext" di = joblib.load(dictPath) raw_dict = joblib.load(raw_dictPath) newDi = {} c = 0 for key in di: c += 1 if (c % 100 == 0): print(c) value = di[key] #不存在synonym group if (len(value) == 0): newDi[key] = raw_dict[key] if (len(newDi[key][2]) > 15): newDi[key][2] = newDi[key][2][0:10] if (len(newDi[key][1]) > 10): newDi[key][1] = newDi[key][1][0:10] if (len(newDi[key][0]) > 5): newDi[key][0] = newDi[key][0][0:5] continue other = [] #确定synonym 需要考虑synonym group过长的情况 synonym = deepcopy(value) if (len(synonym) > 55): for i in synonym: if (StrSims(i, key) > 0.5): other.append(i) synonym = list(set(synonym).difference(set(other))) value = deepcopy(synonym) value.insert(0, key) #确定abbreviation # if(key in raw_dict): # abbrev=raw_dict[key][0] # else: # abbrev=[] abbrev = [] for i in value: if (i in raw_dict): abbrev.extend(raw_dict[i][0]) abbrev = set(abbrev) tb = [] for x in abbrev: if (isAbrreviation(key, x)): tb.append(x) abbrev = tb #确定other类别 others = [] for i in value: if (i in raw_dict): others.append(raw_dict[i][2]) flag = False for i in range(50): if (flag): break for j in others: if (i < len(j)): t = j[i] if (t not in other): other.append(t) if (len(other) > 15): flag = True break if (len(other) > 20): other = other[0:20] newDi[key] = [abbrev, synonym, other] joblib.dump(newDi, savePath) return newDi
def addWikiAbbrev(SEDictPath="",
                  abbrevPath="../result/WikiAbbrev.list",
                  savePath="",
                  model=None,
                  topn=40):
    """Add Wikipedia abbreviations to the SE dictionary (Eva4.2, step 5.1.8).

    Each item of the abbreviation list is indexed as ``item[0]`` (full
    name) and ``item[1]`` (abbreviation).  Items fall into two groups:

    * full name already known to the dictionary (as a key or in slot 1 of
      an entry) but the abbreviation is not recorded for it: a random 85 %
      sample is expanded and merged into slot 0 of the owning entry;
    * full name unknown: a random 65 % sample gets a new entry
      ``[abbreviations, [], []]``.

    Only full names present in the model vocabulary are processed.  The
    updated dictionary is written to ``savePath`` with ``joblib.dump``.

    Args:
        SEDictPath: joblib file with the SE dictionary.
        abbrevPath: joblib file with the (full name, abbreviation) items.
        savePath: destination of the updated SE dictionary.
        model: embedding model exposing ``wv.vocab`` and
            ``wv.most_similar``; required as soon as a sampled item has to
            be expanded.
        topn: number of neighbours requested from the model.

    Returns:
        None.  The result is only persisted to ``savePath``.
    """
    # First find the items whose data is missing from the dictionary.
    SEDict = joblib.load(SEDictPath)
    WikiAbbrev = joblib.load(abbrevPath)

    # term -> abbreviations recorded for it (slot 0 of each owning entry).
    fullName_Abbrev_dict = defaultdict(list)
    for entryKey in SEDict:
        fullName_Abbrev_dict[entryKey].extend(SEDict[entryKey][0])
        for synonym in SEDict[entryKey][1]:
            fullName_Abbrev_dict[synonym].extend(SEDict[entryKey][0])

    newWords, extWors = [], []
    for item in WikiAbbrev:
        if item[0] not in fullName_Abbrev_dict:
            newWords.append(item)
        elif item[1] not in fullName_Abbrev_dict[item[0]]:
            extWors.append(item)

    # Reverse index: member of slot 0 / slot 1 (or the key itself) -> key.
    revDi = {}
    for key, value in SEDict.items():
        revDi[key] = key
        for member in value[0]:
            revDi[member] = key
        for member in value[1]:
            revDi[member] = key

    # Known full names: add up to 5 abbreviation-like neighbours, plus the
    # Wikipedia abbreviation itself whenever it is among the neighbours.
    extWors = random.sample(extWors, int(0.85 * len(extWors)))
    for word in extWors:
        fullName, abbrev = word[0], word[1]
        if fullName not in model.wv.vocab:
            continue
        found = []
        for item in model.wv.most_similar(fullName, topn=topn):
            w = item[0]
            if ((w in fullName or isAbrreviation(fullName, w))
                    and (w == abbrev or len(found) < 5)):
                found.append(w)
        entry = SEDict[revDi[fullName]]
        entry[0].extend(found)
        # Order-preserving de-duplication keeps the saved file reproducible
        # (list(set(...)) order changes with string hash randomisation).
        entry[0] = list(dict.fromkeys(entry[0]))

    # Unknown full names: create an entry holding the Wikipedia
    # abbreviation when the model confirms it as a neighbour.
    newWords = random.sample(newWords, int(0.65 * len(newWords)))
    for word in newWords:
        fullName, abbrev = word[0], word[1]
        if fullName not in model.wv.vocab:
            continue
        found = []
        for item in model.wv.most_similar(fullName, topn=topn):
            w = item[0]
            if (w in fullName or isAbrreviation(fullName, w)) and w == abbrev:
                found.append(w)
        existing = SEDict.get(fullName)
        if existing is None:
            SEDict[fullName] = [found, [], []]
        else:
            # The same full name can be listed with several abbreviations;
            # merge instead of overwriting the entry created for an
            # earlier pair, which would lose the abbreviation found then.
            existing[0].extend(found)
            existing[0] = list(dict.fromkeys(existing[0]))

    joblib.dump(SEDict, savePath)
def addSpecialWord(diPath="",
                   savePath="",
                   m=None,
                   modelName="fasttext",
                   topn=40,
                   simThres=0.6,
                   pairsPath="../result/SOSynonymPairs1.list"):
    """Add words needed by the synonym-tag evaluation (step 5.1.4, Eva6.3).

    The (master, synonym) pairs are checked against the SE dictionary:

    * synonym not covered at all -> a new entry is created for it from its
      model neighbours;
    * synonym covered but master not in any of its groups -> the
      synonym's neighbours are merged into the entry that owns it.

    Only words in the model vocabulary are expanded.  The updated
    dictionary is dumped to ``savePath + <model>_<topn>_V5.dict`` and the
    added data to ``../result/addedDict_<model>_<topn>.dict``.

    Args:
        diPath: joblib file with the SE dictionary.
        savePath: prefix of the output dictionary path.
        m: embedding model exposing ``wv.vocab`` and ``wv.most_similar``;
            required as soon as a word has to be expanded.
        modelName: model name used (lower-cased) in the output file names.
        topn: number of neighbours requested from the model.
        simThres: threshold compared against ``StrSims(word, neighbour)``.
        pairsPath: joblib file with the (master, synonym) pairs.

    Returns:
        dict mapping each expanded word to the ``[slot0, slot1, []]`` lists
        built for it.  For newly created entries this is the very object
        stored in the SE dictionary.
    """

    def _strip(text):
        # Pairs differing only by "." / "_" count as the same word.
        return text.replace(".", "").replace("_", "")

    def _expand(word):
        """Classify the topn neighbours of *word* into [slot0, slot1, []]."""
        value = [[], [], []]
        for item in m.wv.most_similar(word, topn=topn):
            candidate = item[0]
            if StrSims(word, candidate) < simThres:
                value[1].append(candidate)
            elif isAbrreviation(word, candidate):
                value[0].append(candidate)
            elif (isAbrreviation(candidate, word)
                  or word in candidate or candidate in word):
                value[1].append(candidate)
        return value

    print("载入相关数据中.......")
    addDict = {}
    SEDict = joblib.load(diPath)
    SOSynonymPairs = joblib.load(pairsPath)
    print("载入相关数据完成")

    # word -> every word sharing a group with it.  Only membership is ever
    # tested, so sets replace the duplicate-laden lists.
    SEGroups = defaultdict(set)
    for key, value in SEDict.items():
        group = set(value[0])
        group.update(value[1])
        group.add(key)
        for member in group:
            SEGroups[member].update(group)

    # Find the missing words.
    print("获取缺失单词中.......")
    resAllNo, resNotMatch = [], []
    for master, synonym in SOSynonymPairs:
        masterKey = _strip(master)
        if masterKey == _strip(synonym):
            covered = (master in SEGroups or synonym in SEGroups
                       or masterKey in SEGroups)
            if not covered:
                resAllNo.append(synonym)
        elif synonym not in SEGroups:
            resAllNo.append(synonym)
        elif master not in SEGroups[synonym]:
            resNotMatch.append(synonym)
    print("获取缺失单词完成")

    # Add the new words.  dict.fromkeys drops repeated synonyms so each is
    # expanded once; repeating the expansion yields the same content.
    print("增加缺失synonym中.......")
    for word in dict.fromkeys(resAllNo):
        if word not in m.wv.vocab:
            continue
        value = _expand(word)
        SEDict[word] = value
        addDict[word] = value
    print("增加缺失synonym完成")

    # Reverse index built after the additions above:
    # member of slot 0 / slot 1 (or the key itself) -> key.
    reverse_di = {}
    for key, value in SEDict.items():
        for member in value[0]:
            reverse_di[member] = key
        for member in value[1]:
            reverse_di[member] = key
        reverse_di[key] = key

    print("增加缺失master中.......")
    for word in dict.fromkeys(resNotMatch):
        if word not in m.wv.vocab:
            continue
        value = _expand(word)
        addDict[word] = value
        if word not in reverse_di:
            continue
        originKey = reverse_di[word]
        if originKey in SEDict:
            entry = SEDict[originKey]
            entry[0].extend(value[0])
            entry[1].extend(value[1])
            # Order-preserving de-duplication keeps the saved file
            # reproducible (list(set(...)) order changes with string hash
            # randomisation).
            entry[0] = list(dict.fromkeys(entry[0]))
            entry[1] = list(dict.fromkeys(entry[1]))
    print("增加缺失master完成")

    joblib.dump(SEDict,
                savePath + modelName.lower() + "_" + str(topn) + "_V5.dict")
    joblib.dump(
        addDict,
        "../result/addedDict_" + modelName.lower() + "_" + str(topn) + ".dict")
    return addDict