Beispiel #1
0
def removeSubstring(vlist):
    """Drop single-word entries that are already covered by a multi-word phrase.

    Example: if "bubble", "sort" and "bubble sort" are all present, the
    entries for "bubble" and "sort" are removed and "bubble sort" is kept.

    Words are matched through ``text_preprocess.PhraseReduction``: an entry
    matches a word of a phrase when both reduce to equal values.  The
    component entries are removed only when *every* word of the phrase has a
    matching entry.

    Args:
        vlist: list of entries whose first element is the phrase text
            (e.g. ``(phrase, count)`` tuples).  The list is modified in
            place.

    Returns:
        The same ``vlist`` object, after the removals.
    """
    # Removal candidates.  The "gon na" entry is excluded; the comparison has
    # to look at the phrase text (entry[0]) because comparing the whole tuple
    # to a string is always unequal and would filter nothing.
    checklist = [entry for entry in vlist if entry[0] != 'gon na']

    # (reduced form, entry) pairs for the checklist, built on first use so the
    # reduction of each candidate is computed once instead of once per word.
    reduced_checklist = None

    # Iterate over a snapshot: removing from the list being iterated shifts
    # the remaining items and makes the loop silently skip entries.
    for voctuple in list(vlist):
        if voctuple not in vlist:
            continue  # already removed as a component of an earlier phrase
        splitlist = voctuple[0].split(' ')
        if len(splitlist) <= 1:
            continue  # only multi-word phrases can cover other entries

        if reduced_checklist is None:
            reduced_checklist = [
                (text_preprocess.PhraseReduction(entry[0]), entry)
                for entry in checklist
            ]

        remove_words = []
        for item in splitlist:
            target = text_preprocess.PhraseReduction(item)
            # First checklist entry whose reduced form equals this word's.
            match = next(
                (entry for reduced, entry in reduced_checklist
                 if reduced == target),
                None,
            )
            if match is not None:
                remove_words.append(match)

        # Remove the components only if every word has its own entry.
        if len(remove_words) == len(splitlist):
            for word in remove_words:
                if word in vlist:
                    vlist.remove(word)
                    print("c_remove ", word)
    return vlist
Beispiel #2
0
def CheckSamePhrases(voc_list):
    """Find phrases that duplicate another phrase and merge their counts.

    Each phrase text is wrapped in ``Phrase`` and de-duplicated through a
    set; the texts that do not survive that de-duplication are reported as
    "same phrases".  Every such phrase is mapped to a surviving phrase with
    an equal ``text_preprocess.PhraseReduction`` value, and its count is
    added to that phrase's count.

    Args:
        voc_list: list of ``(phrase, count)`` pairs.

    Returns:
        A three-element list ``[same_phrases, records, merged]`` where
        ``same_phrases`` is the list of duplicate phrase texts, ``records``
        maps each duplicate to the phrase that replaces it, and ``merged``
        is the list of ``(phrase, total_count)`` pairs sorted by count,
        highest first.
    """
    original_vocs = [voc for voc, _count in voc_list]
    wrapped = [Phrase(voc) for voc in original_vocs]
    print("voc_list")
    pprint(voc_list)

    # One representative text per distinct Phrase object.
    unique_vocs = [phrase_obj.strr for phrase_obj in set(wrapped)]
    print("new_voclist")
    pprint(unique_vocs)
    print(len(voc_list), len(unique_vocs))

    # Texts that were dropped by the set de-duplication.
    duplicates = list(set(original_vocs) - set(unique_vocs))
    print("SamePhrases", duplicates)

    # Map every duplicate to a surviving phrase with the same reduced form
    # (the last matching phrase in input order wins).
    replacements = {}
    for phrase in duplicates:
        for voc in original_vocs:
            same_reduction = (text_preprocess.PhraseReduction(voc) ==
                              text_preprocess.PhraseReduction(phrase))
            if same_reduction and voc not in duplicates:
                replacements[phrase] = voc
    print("SamePhraseRecord", replacements)

    renamed = [(replacements.get(voc, voc), cnt) for voc, cnt in voc_list]

    # Collect the counts per phrase in first-seen order, then total them.
    grouped = {}
    for word, cnt in renamed:
        grouped.setdefault(word, []).append(cnt)
    totals = [(word, sum(counts)) for word, counts in grouped.items()]

    ranked = sorted(totals, key=lambda pair: pair[1], reverse=True)
    return [duplicates, replacements, ranked]
Beispiel #3
0
def removeSubstring(vlist, TOP_N_CONCEPT):
    """Drop entries that are single words of a top-ranked multi-word phrase.

    Example: if "bubble", "sort" and "bubble sort" are all present, the
    entries for "bubble" and "sort" are removed.

    Only the first ``TOP_N_CONCEPT`` entries are examined as phrases, but
    their words are looked up among all entries of ``vlist``.  Two texts
    match when ``text_preprocess.PhraseReduction`` gives equal results, and
    the component entries are removed only when every word of the phrase has
    a match.  The top of the list is printed before and after the pass.

    Args:
        vlist: list of entries whose first element is the phrase text;
            modified in place.
        TOP_N_CONCEPT: number of leading entries to examine.

    Returns:
        The same ``vlist`` object, after the removals.
    """
    separator = "========================\n\n\n\n\n"

    print("lenofvlist", len(vlist))
    print("vlist:")
    pprint(vlist[:TOP_N_CONCEPT])
    print(separator)

    # Snapshot of every entry, used as the pool of removal candidates.
    candidates = list(vlist)
    for phrase_entry in vlist[:TOP_N_CONCEPT]:
        words = phrase_entry[0].split(' ')
        if len(words) <= 1:
            continue  # single words cannot cover other entries

        matched = []
        for single in words:
            hits = [
                candidate for candidate in candidates
                if text_preprocess.PhraseReduction(candidate[0]) ==
                text_preprocess.PhraseReduction(single)
            ]
            if hits:
                matched.append(hits[0])

        if len(matched) != len(words):
            continue  # at least one word has no entry of its own

        for entry in matched:
            if entry in vlist:
                vlist.remove(entry)
                print("remove ", entry)

    print(separator)
    print("lenofvlist", len(vlist))
    print("vlist:")
    pprint(vlist[:TOP_N_CONCEPT])
    return vlist
Beispiel #4
0
def SubPhrase(p1, p2):
    """Return True when p1 is a sub-phrase of p2.

    Both arguments are passed through ``text_preprocess.PhraseReduction``
    and converted to ``str``; the check is plain string containment of the
    first result inside the second.
    """
    needle = str(text_preprocess.PhraseReduction(p1))
    haystack = str(text_preprocess.PhraseReduction(p2))
    return needle in haystack
Beispiel #5
0
def SamePhrase(p1, p2):
    """Return True when p1 and p2 have the same reduced form.

    Both arguments are passed through ``text_preprocess.PhraseReduction``
    and the string forms of the results are compared for equality.
    """
    first = str(text_preprocess.PhraseReduction(p1))
    second = str(text_preprocess.PhraseReduction(p2))
    return first == second
Beispiel #6
0
        print(phrase, "/ same_phraselist:", same_phraselist)
        if (len(same_phraselist) > 1):  # there contain same phrases
            same_phraselist = sorted(same_phraselist,
                                     key=lambda tup: tup[1])[1:]
            print("sorted same_phraselist: ", same_phraselist)
            for t in same_phraselist:
                vlist.remove(t[0])
                print("remove!!!", t[0])
        # only 1 word and is substring of others ,remove this word (e.g."bubble" contained in "bubble sort")
        if len(phrase.split(' ')) == 1:
            similar_phraselist = [(t, vlist.index(t))
                                  for t in vlist[:TOP_N_CONCEPT]
                                  if SubPhrase(phrase, t[0]) == True]
            print(phrase, "/ similar_phraselist:", similar_phraselist)
            if (len(similar_phraselist) > 1):
                vlist.remove(vocitem)
        #         print("remove!!!",phrase)
    print("newvlist:", vlist[:TOP_N_CONCEPT])
    return vlist

    print("========================\n\n\n\n\n")
    print("lenofvlist", len(vlist))
    print("vlist:")
    pprint(vlist[:TOP_N_CONCEPT])
    return vlist


if __name__ == '__main__':
    # Manual check when run as a script: print the result of
    # text_preprocess.PhraseReduction for one sample word.
    phrase = "Sorting"
    print(text_preprocess.PhraseReduction(phrase))
Beispiel #7
0
 def __hash__(self):
     """Hash the object by the reduced form of its text (``self.strr``)."""
     reduced = text_preprocess.PhraseReduction(self.strr)
     return hash(reduced)
Beispiel #8
0
 def __eq__(self, that):
     """Compare two objects by the reduced form of their ``strr`` text.

     Returns the result of comparing
     ``text_preprocess.PhraseReduction(self.strr)`` with the reduction of
     ``that.strr``.  When ``that`` has no ``strr`` attribute the comparison
     is not supported and ``NotImplemented`` is returned, so ``==`` against
     an unrelated object evaluates to False instead of raising
     ``AttributeError``.
     """
     try:
         other_text = that.strr
     except AttributeError:
         # Let Python fall back to its default handling for foreign types.
         return NotImplemented
     # A single comparison is enough; each reduction is computed once.
     return text_preprocess.PhraseReduction(
         self.strr) == text_preprocess.PhraseReduction(other_text)