예제 #1
0
def addTagSynonym(
        SEDictPath="../result/step5.1.4_ExtSEDict_fasttext_V5.dict",
        topn=50,
        savePath="../result/step5.1.5_ExtSEDict_fasttext_50_V5.dict",
        rawDiPath="../result/step4.2.1_SemanticallyRelatedTerms_fasttext_V5.dict",
        rawDict=None,
        simThres=0.6,
        tagsPath="../result/Eva5Tags.dict",
        minTimes=100):
    """
    Extend the synonym dictionary with related terms of frequent tags.

    A tag is processed when it occurs at least ``minTimes`` times, has an
    entry in the related-terms dictionary, and is already known to the
    synonym dictionary (as a key or as a member of slot 0 / slot 1 of some
    entry).  Its first ``topn`` related terms are classified and merged
    into the entry that owns the tag; both slots are de-duplicated.  The
    updated dictionary is written to ``savePath``.

    Args:
        SEDictPath: joblib file with the synonym dictionary.  Each value is
            indexed as ``value[0]`` and ``value[1]`` (presumably
            abbreviations and synonyms -- confirm against the producer).
        topn: number of leading related terms considered per tag.
        savePath: joblib file the updated dictionary is dumped to.
        rawDiPath: joblib file with the related terms per word; only read
            when ``rawDict`` is None.
        rawDict: already loaded related-terms dictionary; each value is a
            sequence of items whose element ``[0]`` is the related term.
        simThres: terms whose ``StrSims`` score with the tag is below this
            value are put into slot 1 without further checks.
        tagsPath: joblib file mapping tag -> occurrence count.  Defaults to
            the path that used to be hard-coded here.
        minTimes: tags with a smaller count are ignored.  Defaults to the
            previously hard-coded 100.

    Returns:
        None.  The result is only persisted to ``savePath``.
    """
    tags = joblib.load(tagsPath)
    SEDict = joblib.load(SEDictPath)
    rawDi = joblib.load(rawDiPath) if rawDict is None else rawDict

    # Reverse lookup: every key and every slot-0 / slot-1 member -> owning key.
    # Later entries win when a term belongs to several groups.
    reverseDict = {}
    for key, value in SEDict.items():
        for member in value[0]:
            reverseDict[member] = key
        for member in value[1]:
            reverseDict[member] = key
        reverseDict[key] = key

    for tag, times in tags.items():
        if times < minTimes:
            continue
        if tag not in rawDi or tag not in reverseDict:
            continue

        slot0, slot1 = [], []
        for related in rawDi[tag][0:topn]:
            word = related[0]
            if StrSims(tag, word) < simThres:
                slot1.append(word)
            elif isAbrreviation(tag, word):
                slot0.append(word)
            elif isAbrreviation(word, tag):
                slot1.append(word)
            elif tag in word or word in tag:
                slot1.append(word)
            # Anything else is dropped on purpose.

        entry = SEDict[reverseDict[tag]]
        entry[0].extend(slot0)
        entry[1].extend(slot1)
        # De-duplicate; note that this does not preserve ordering.
        entry[1] = list(set(entry[1]))
        entry[0] = list(set(entry[0]))
    joblib.dump(SEDict, savePath)
def discriminateWords(modelName="fastText"):
    """
    Split every synonym group of the step-4.4.1 dictionary into categories.

    Each raw value is read as ``[representative_or_None, term, term, ...]``.
    When the first element is None the key itself is the representative
    word; otherwise the first element is, and the key joins the terms.
    Hyphens and underscores are replaced by spaces in the key, the
    representative word and all terms before classification.

    Args:
        modelName: lower-cased and inserted into the input and output
            file names.

    Returns:
        dict mapping normalised key ->
        ``[representWord, abbreviations, synonyms, rest]``.  The same
        dictionary is dumped to ``../result/FinalDict_<model>.dict``.
    """

    def _spaced(text):
        # Treat "-" and "_" as word separators.
        return text.replace("-", " ").replace("_", " ")

    raw_dic = joblib.load("../result/step4.4.1_SynonymFullName_"+modelName.lower()+".dict")
    seperate_dic = {}
    for seen, rawKey in enumerate(raw_dic, 1):
        # Progress output every 1000 keys.
        if seen % 1000 == 0:
            print(seen)

        entry = raw_dic[rawKey]
        head = entry[0]
        terms = entry[1:]
        if head is None:
            head = rawKey
        else:
            terms.append(rawKey)

        head = _spaced(head)
        terms = [_spaced(term) for term in terms]

        abbreviations, synonyms, rest = [], [], []
        for term in terms:
            if isSynonym(term, head):
                synonyms.append(term)
            elif isAbrreviation(head, term):
                abbreviations.append(term)
            else:
                rest.append(term)
        # Keys that normalise to the same text overwrite each other.
        seperate_dic[_spaced(rawKey)] = [head, abbreviations, synonyms, rest]

    joblib.dump(seperate_dic, "../result/FinalDict_"+modelName.lower()+".dict")
    return seperate_dic
예제 #3
0
def addNotMatch(SEDictPath="../result/step5.1.3_ExtSEDict_fasttext_V5.dict",
                topn=50,
                savePath="../result/step5.1.6_ExtSEDict_fasttext_50_V5.dict",
                m=None,
                simThres=0.6,
                mispelPath="../result/Eva6.1NotMatch_fasttext.list"):
    """
    Eva6.1 / step 5.1.6: enrich the entries of the "not matched" words.

    For every word of the list stored at ``mispelPath`` that is already
    known to the synonym dictionary (as a key or as a member of slot 0 /
    slot 1 of an entry), the ``topn`` most similar words of the embedding
    model are classified and merged into the owning entry.  The updated
    dictionary is written to ``savePath``.

    Args:
        SEDictPath: joblib file with the synonym dictionary; values are
            indexed as ``value[0]`` and ``value[1]``.
        topn: number of neighbours requested from the model.
        savePath: joblib file the updated dictionary is dumped to.
        m: embedding model exposing ``m.wv.most_similar``.  It is required
            as soon as one listed word is known to the dictionary.
        simThres: neighbours whose ``StrSims`` score with the word is below
            this value go straight into slot 1.
        mispelPath: joblib file with the words to process.

    Returns:
        None.  The result is only persisted to ``savePath``.
    """
    mispel = joblib.load(mispelPath)
    SEDict = joblib.load(SEDictPath)

    # Reverse lookup: every key and every slot-0 / slot-1 member -> owning key.
    reverseDict = {}
    for key, value in SEDict.items():
        for member in value[0]:
            reverseDict[member] = key
        for member in value[1]:
            reverseDict[member] = key
        reverseDict[key] = key

    for word in mispel:
        if word not in reverseDict:
            continue
        try:
            similarWords = m.wv.most_similar(word, topn=topn)
        except KeyError:
            # The model has no vector for this word.  Skip it instead of
            # aborting the whole run before anything has been saved.
            continue

        slot0, slot1 = [], []
        for item in similarWords:
            candidate = item[0]
            if StrSims(word, candidate) < simThres:
                slot1.append(candidate)
            elif isAbrreviation(word, candidate):
                slot0.append(candidate)
            elif isAbrreviation(candidate, word):
                slot1.append(candidate)
            elif word in candidate or candidate in word:
                slot1.append(candidate)

        entry = SEDict[reverseDict[word]]
        entry[0].extend(slot0)
        entry[1].extend(slot1)
        # De-duplicate; ordering is not preserved.
        entry[1] = list(set(entry[1]))
        entry[0] = list(set(entry[0]))

    joblib.dump(SEDict, savePath)
예제 #4
0
def addWikiPort(SEDictPath="",
                addListPath="",
                simThres=0.6,
                topn=40,
                savePath="",
                modelName="",
                modelPath="",
                model=None):
    """
    Merge embedding neighbours into the entries named by an add-list.

    Every key of the add-list that is both a key of the synonym dictionary
    and part of the model vocabulary gets its ``topn`` most similar words
    classified and merged into slot 0 / slot 1 of its own entry.  The
    updated dictionary is dumped to ``savePath``.

    Args:
        SEDictPath: joblib file with the synonym dictionary.
        addListPath: joblib file with the keys to extend.
        simThres: neighbours whose ``StrSims`` score with the key is below
            this value go straight into slot 1.
        topn: number of neighbours requested from the model.
        savePath: joblib file the updated dictionary is dumped to.
        modelName: "fasttext" (case-insensitive) selects ``FastText.load``;
            anything else selects ``Word2Vec.load``.  Ignored when
            ``model`` is given.
        modelPath: file the model is loaded from when ``model`` is None.
        model: already loaded model; takes precedence over the two above.

    Returns:
        None.
    """
    SEDict = joblib.load(SEDictPath)
    addList = joblib.load(addListPath)

    if model is not None:
        m = model
    elif modelName.lower() == "fasttext":
        m = FastText.load(modelPath)
    else:
        m = Word2Vec.load(modelPath)
        m.delete_temporary_training_data(True)

    for key in addList:
        # Both the dictionary and the model must know the key.
        if not (key in SEDict and key in m.wv.vocab):
            continue

        slot0, slot1 = [], []
        for neighbour in m.wv.most_similar(key, topn=topn):
            candidate = neighbour[0]
            if StrSims(key, candidate) < simThres:
                slot1.append(candidate)
            elif isAbrreviation(key, candidate):
                slot0.append(candidate)
            elif isAbrreviation(candidate, key):
                slot1.append(candidate)
            elif key in candidate or candidate in key:
                slot1.append(candidate)

        entry = SEDict[key]
        entry[0].extend(slot0)
        entry[1].extend(slot1)
        # De-duplicate; ordering is not preserved.
        entry[1] = list(set(entry[1]))
        entry[0] = list(set(entry[0]))
    joblib.dump(SEDict, savePath)
예제 #5
0
def DiscriminateTerms(dictPath="",savePath=""):
    """
    step4.2.4: classify the terms of every near-synonym group.

    Args:
        dictPath: joblib file mapping key -> iterable of related terms.
        savePath: joblib file the classified dictionary is dumped to.

    Returns:
        dict mapping key -> ``[abbreviations, synonyms, others]``.
    """
    raw_dic = joblib.load(dictPath)
    seperate_dic = {}
    for seen, key in enumerate(raw_dic, 1):
        # Progress output every 1000 keys.
        if seen % 1000 == 0:
            print(seen)

        abbreviations, synonyms, others = [], [], []
        for term in raw_dic[key]:
            # The synonym test takes precedence over the abbreviation test.
            if isSynonym(key, term):
                bucket = synonyms
            elif isAbrreviation(key, term):
                bucket = abbreviations
            else:
                bucket = others
            bucket.append(term)
        seperate_dic[key] = [abbreviations, synonyms, others]

    joblib.dump(seperate_dic, savePath)
    return seperate_dic
예제 #6
0
def ExtendSynonym(raw_dictPath="", dictPath="", savePath=""):
    """
    step5.1.3: extend the synonym dictionary.

    Builds, for every key of the dictionary at ``dictPath``, an entry
    ``[abbrev, synonym, other]`` by combining the key's synonym group with
    the per-word lists stored in the dictionary at ``raw_dictPath``.

    Args:
        raw_dictPath: joblib file mapping word -> list with at least three
            slots; slot 0 feeds ``abbrev`` and slot 2 feeds ``other`` here
            (presumably [abbreviations, synonyms, others] -- confirm
            against the step that produces it).
        dictPath: joblib file mapping key -> synonym group (a sequence,
            possibly empty).
        savePath: joblib file the new dictionary is dumped to.

    Returns:
        dict mapping key -> ``[abbrev, synonym, other]``; for keys with an
        empty synonym group the (truncated) ``raw_dict`` entry is used.

    Raises:
        KeyError: when a key with an empty synonym group is missing from
            the raw dictionary.
    """

    #    modelName="fasttext"
    di = joblib.load(dictPath)
    raw_dict = joblib.load(raw_dictPath)
    newDi = {}
    c = 0
    for key in di:
        c += 1
        # Progress output every 100 keys.
        if (c % 100 == 0):
            print(c)
        value = di[key]
        # No synonym group exists for this key: fall back to the raw entry.
        if (len(value) == 0):
            # NOTE(review): this is the same list object as raw_dict[key],
            # so the truncations below also change raw_dict[key], which
            # later iterations read -- confirm this is intended.
            newDi[key] = raw_dict[key]
            # NOTE(review): slot 2 is only cut when longer than 15 but is
            # then cut to 10, so lengths 11..15 survive -- confirm.
            if (len(newDi[key][2]) > 15):
                newDi[key][2] = newDi[key][2][0:10]
            if (len(newDi[key][1]) > 10):
                newDi[key][1] = newDi[key][1][0:10]
            if (len(newDi[key][0]) > 5):
                newDi[key][0] = newDi[key][0][0:5]
            continue
        other = []
        # Determine the synonyms; overly long synonym groups need pruning.
        synonym = deepcopy(value)
        if (len(synonym) > 55):
            # Members whose StrSims score with the key exceeds 0.5 are
            # moved from the synonym group to `other`.
            for i in synonym:
                if (StrSims(i, key) > 0.5):
                    other.append(i)
            # The set difference does not preserve the original ordering.
            synonym = list(set(synonym).difference(set(other)))

        # `value` now holds the key followed by the remaining synonyms.
        value = deepcopy(synonym)
        value.insert(0, key)
        # Determine the abbreviations.
        # (Disabled variant that only looked at the key itself:)
        #        if(key in raw_dict):
        #            abbrev=raw_dict[key][0]
        #        else:
        #            abbrev=[]
        # Collect slot 0 of every group member known to raw_dict ...
        abbrev = []
        for i in value:
            if (i in raw_dict):
                abbrev.extend(raw_dict[i][0])
        abbrev = set(abbrev)
        # ... and keep only those that isAbrreviation accepts for the key.
        tb = []
        for x in abbrev:
            if (isAbrreviation(key, x)):
                tb.append(x)
        abbrev = tb
        # Determine the "other" category from slot 2 of every group member.
        others = []
        for i in value:
            if (i in raw_dict):
                others.append(raw_dict[i][2])

        # Round-robin over the collected lists: take position 0 of every
        # list, then position 1, ... (positions 0..49) and stop as soon as
        # `other` holds more than 15 items.
        flag = False
        for i in range(50):
            if (flag):
                break
            for j in others:
                if (i < len(j)):
                    t = j[i]
                    if (t not in other):
                        other.append(t)
                        if (len(other) > 15):
                            flag = True
                            break
        # `other` may already exceed the limit when the pruning above moved
        # many synonyms into it, hence the hard cap.
        if (len(other) > 20):
            other = other[0:20]
        newDi[key] = [abbrev, synonym, other]
    joblib.dump(newDi, savePath)
    return newDi
예제 #7
0
def addWikiAbbrev(SEDictPath="",
                  abbrevPath="../result/WikiAbbrev.list",
                  savePath="",
                  model=None,
                  topn=40):
    """
    Eva4.2 / step 5.1.8: add Wikipedia abbreviations to the dictionary.

    The abbreviation list holds items whose element ``[0]`` is a full name
    and element ``[1]`` an abbreviation.  Items are split into

    * "extension" items: the full name is a key or a slot-1 member of the
      dictionary, but the abbreviation is not recorded for it yet;
    * "new" items: the full name is unknown to the dictionary.

    A random 85 % sample of the extension items and a random 65 % sample
    of the new items is processed.  Abbreviations are only taken from the
    ``topn`` nearest neighbours of the full name in the embedding model.
    The updated dictionary is dumped to ``savePath``.

    Args:
        SEDictPath: joblib file with the synonym dictionary; values are
            indexed as ``value[0]`` (abbreviations) and ``value[1]``.
        abbrevPath: joblib file with the (full name, abbreviation) items.
        savePath: joblib file the updated dictionary is dumped to.
        model: embedding model exposing ``model.wv.vocab`` and
            ``model.wv.most_similar``.  It is required as soon as a sample
            is non-empty.
        topn: number of neighbours requested from the model.

    Returns:
        None.
    """
    # First find the problematic data.
    SEDict = joblib.load(SEDictPath)
    WikiAbbrev = joblib.load(abbrevPath)

    # key: term, value: [abbrev1, abbrev2, ...]
    fullName_Abbrev_dict = defaultdict(list)
    for key in SEDict:
        fullName_Abbrev_dict[key].extend(SEDict[key][0])
        for synonym in SEDict[key][1]:
            fullName_Abbrev_dict[synonym].extend(SEDict[key][0])

    newWords, extWors = [], []
    for pair in WikiAbbrev:
        if pair[0] in fullName_Abbrev_dict:
            if pair[1] not in fullName_Abbrev_dict[pair[0]]:
                extWors.append(pair)
        else:
            newWords.append(pair)

    # Start adding data.  Reverse lookup: key / member -> owning key.
    revDi = {}
    for key, value in SEDict.items():
        revDi[key] = key
        for member in value[0]:
            revDi[member] = key
        for member in value[1]:
            revDi[member] = key

    extWors = random.sample(extWors, int(0.85 * len(extWors)))
    for word in extWors:
        if word[0] not in model.wv.vocab:
            continue
        wordSims = [x[0] for x in model.wv.most_similar(word[0], topn=topn)]
        value = []
        for w in wordSims:
            if w in word[0] or isAbrreviation(word[0], w):
                # The Wikipedia abbreviation is always accepted; other
                # candidates only while fewer than five were collected.
                if w == word[1] or len(value) < 5:
                    value.append(w)
        key = revDi[word[0]]
        SEDict[key][0].extend(value)
        SEDict[key][0] = list(set(SEDict[key][0]))

    newWords = random.sample(newWords, int(0.65 * len(newWords)))
    for word in newWords:
        if word[0] not in model.wv.vocab:
            continue
        wordSims = [x[0] for x in model.wv.most_similar(word[0], topn=topn)]
        # Only the Wikipedia abbreviation itself is accepted here, and only
        # when it is among the neighbours.
        value = [
            w for w in wordSims
            if (w in word[0] or isAbrreviation(word[0], w)) and w == word[1]
        ]
        if word[0] in SEDict:
            # The full name was unknown before this loop, so the entry was
            # created by an earlier item with the same full name.  Merge
            # rather than overwrite, so earlier abbreviations are kept.
            known = SEDict[word[0]][0]
            known.extend(w for w in value if w not in known)
        else:
            SEDict[word[0]] = [value, [], []]
    joblib.dump(SEDict, savePath)
예제 #8
0
def _classifySimilarWords(word, similarWords, simThres):
    """Sort the neighbours of ``word`` into ``[slot0, slot1, slot2]`` buckets.

    ``similarWords`` holds items whose element ``[0]`` is the candidate
    term.  Slot 0 receives candidates that ``isAbrreviation(word, c)``
    accepts; slot 1 receives candidates scoring below ``simThres`` in
    ``StrSims``, candidates that ``word`` abbreviates, and candidates that
    contain or are contained in ``word``.  Slot 2 is always left empty and
    everything else is dropped.
    """
    value = [[], [], []]
    for item in similarWords:
        candidate = item[0]
        if StrSims(word, candidate) < simThres:
            value[1].append(candidate)
        elif isAbrreviation(word, candidate):
            value[0].append(candidate)
        elif isAbrreviation(candidate, word):
            value[1].append(candidate)
        elif word in candidate or candidate in word:
            value[1].append(candidate)
    return value


def addSpecialWord(diPath="",
                   savePath="",
                   m=None,
                   modelName="fasttext",
                   topn=40,
                   simThres=0.6):
    """
    step5.1.4 / Eva6.3: add words so that synonym-tag pairs are covered.

    Each (master, synonym) pair loaded from
    ``../result/SOSynonymPairs1.list`` is checked against the synonym
    dictionary:

    * synonym unknown to every group -> a new entry is created for it from
      its ``topn`` nearest neighbours in the model;
    * synonym known but not grouped with its master -> its neighbours are
      merged into the entry that owns the synonym.

    Words outside the model vocabulary are skipped.

    Args:
        diPath: joblib file with the synonym dictionary; values are
            indexed as ``value[0]`` and ``value[1]``.
        savePath: prefix of the output file; the file name is
            ``savePath + <model>_<topn>_V5.dict``.
        m: embedding model exposing ``m.wv.vocab`` and
            ``m.wv.most_similar``.
        modelName: lower-cased and used in the output file names only.
        topn: number of neighbours requested from the model.
        simThres: threshold passed to the neighbour classification.

    Returns:
        dict mapping every processed word to the classified neighbours
        computed for it.  It is also dumped to
        ``../result/addedDict_<model>_<topn>.dict``.
    """
    print("载入相关数据中.......")
    addDict = {}

    SEDict = joblib.load(diPath)
    SOSynonymPairs = joblib.load("../result/SOSynonymPairs1.list")
    print("载入相关数据完成")

    # Map every term (key, slot-0 member, slot-1 member) to all terms it
    # shares a group with.  Only membership is tested, hence sets.
    SEGroups = defaultdict(set)
    for key, value in SEDict.items():
        group = list(value[0]) + list(value[1]) + [key]
        for member in group:
            SEGroups[member].update(group)

    # Find the missing words.
    print("获取缺失单词中.......")
    resAllNo, resNotMatch = [], []
    for master, synonym in SOSynonymPairs:
        flag = 0  # 0: unknown, 1: known but not grouped, 2: covered
        masterPlain = master.replace(".", "").replace("_", "")
        if masterPlain == synonym.replace(".", "").replace("_", ""):
            # The pair differs only in "." / "_": one known spelling suffices.
            if (master in SEGroups or synonym in SEGroups
                    or masterPlain in SEGroups):
                flag = 2
        elif synonym in SEGroups:
            flag = 2 if master in SEGroups[synonym] else 1
        if flag == 0:
            resAllNo.append(synonym)
        elif flag == 1:
            resNotMatch.append(synonym)
    print("获取缺失单词完成")

    # Add the new words.
    print("增加缺失synonym中.......")
    for word in resAllNo:
        if word not in m.wv.vocab:
            continue
        value = _classifySimilarWords(
            word, m.wv.most_similar(word, topn=topn), simThres)
        SEDict[word] = value
        addDict[word] = value
    print("增加缺失synonym完成")

    # Reverse lookup built after the additions above, so that the new
    # entries are included: key / member -> owning key.
    reverse_di = {}
    for key, value in SEDict.items():
        for member in value[0]:
            reverse_di[member] = key
        for member in value[1]:
            reverse_di[member] = key
        reverse_di[key] = key

    print("增加缺失master中.......")
    for word in resNotMatch:
        if word not in m.wv.vocab:
            continue
        value = _classifySimilarWords(
            word, m.wv.most_similar(word, topn=topn), simThres)
        addDict[word] = value
        if word not in reverse_di:
            continue
        # reverse_di only maps to keys of SEDict, so the entry exists.
        entry = SEDict[reverse_di[word]]
        entry[0].extend(value[0])
        entry[1].extend(value[1])
        # De-duplicate; ordering is not preserved.
        entry[0] = list(set(entry[0]))
        entry[1] = list(set(entry[1]))
    print("增加缺失master完成")

    joblib.dump(SEDict,
                savePath + modelName.lower() + "_" + str(topn) + "_V5.dict")
    joblib.dump(
        addDict,
        "../result/addedDict_" + modelName.lower() + "_" + str(topn) + ".dict")
    return addDict