def detect_phrase(sentence, tokenizer, index_word, id_phrase_map, idx):
    """Return the words of ``sentence`` that resolve to a known phrase id.

    The sentence is tokenized with ``tokenizer.texts_to_sequences`` and every
    token id is mapped back to its word through ``index_word``.  A word is
    kept when either

    * ``decrypt(word)`` returns an id contained in ``id_phrase_map``, or
    * the word starts with ``"fnust"`` and the run of decimal digits that
      directly follows (at most five digits are read) is an id contained in
      ``id_phrase_map``.

    A word that starts with ``"fnust"`` but is not followed by a digit is
    reported on stdout together with ``idx`` and skipped.

    Args:
        sentence: Text to scan.
        tokenizer: Object exposing ``texts_to_sequences``.
        index_word: Mapping from token id to word.
        id_phrase_map: Container of known phrase ids (only membership is
            tested here).
        idx: Value echoed in the diagnostic print; presumably the index of
            the sentence being processed -- confirm against the caller.

    Returns:
        The matching words, in token order.
    """
    marker = "fnust"
    max_id_digits = 5  # the fallback parse never reads more digits than this

    matches = []
    for tok in tokenizer.texts_to_sequences([sentence])[0]:
        try:
            word = index_word[tok]
            phrase_id = decrypt(word)
            if phrase_id is not None and phrase_id in id_phrase_map:
                matches.append(word)
                continue
            if not word.startswith(marker):
                continue

            # Fallback: read the leading digits after the marker.  Counting
            # the digits (instead of reusing the last loop index as the
            # slice end) keeps the final digit when the digit run reaches
            # the end of the word, e.g. "fnust123" -> 123 rather than 12.
            suffix = word[len(marker):]
            digit_count = 0
            for char in suffix[:max_id_digits]:
                if not char.isdecimal():
                    break
                digit_count += 1

            if digit_count == 0:
                print(idx, word)
            elif int(suffix[:digit_count]) in id_phrase_map:
                matches.append(word)
        except Exception:
            # Deliberate best-effort: a token that cannot be resolved (for
            # instance one that is missing from index_word) is skipped
            # rather than aborting the scan of the whole sentence.
            continue
    return matches
def decipher_phrase(word, id_phrase_map):
    """Translate an encoded phrase token back to its phrase text.

    Resolution order:

    1. If ``decrypt(word)`` returns an id that is a key of
       ``id_phrase_map``, the mapped phrase is returned.
    2. Otherwise, if ``word`` does not start with ``"fnust"``, it is
       returned unchanged.
    3. Otherwise the run of decimal digits directly after ``"fnust"`` (at
       most five digits are read) is parsed as the id.  When that id is a
       key of ``id_phrase_map`` the mapped phrase is returned; when it is
       not, ``word`` is returned unchanged.

    Args:
        word: Token to translate.
        id_phrase_map: Mapping from phrase id to phrase text.

    Returns:
        The phrase text for ``word``, or ``word`` itself when no known
        phrase id can be recovered from it.

    Raises:
        ValueError: If ``word`` starts with ``"fnust"`` but no digit
            follows the marker.
    """
    marker = "fnust"
    max_id_digits = 5  # the fallback parse never reads more digits than this

    phrase_id = decrypt(word)
    if phrase_id is not None and phrase_id in id_phrase_map:
        return id_phrase_map[phrase_id]
    if not word.startswith(marker):
        return word

    # Count the leading digits instead of reusing the last loop index as the
    # slice end, so the final digit survives when the digit run reaches the
    # end of the word ("fnust123" -> 123, "fnust7" -> 7).
    suffix = word[len(marker):]
    digit_count = 0
    for char in suffix[:max_id_digits]:
        if not char.isdecimal():
            break
        digit_count += 1

    if digit_count == 0:
        raise ValueError(f"Something unexpected found: {word}")

    fallback_id = int(suffix[:digit_count])
    if fallback_id in id_phrase_map:
        return id_phrase_map[fallback_id]
    return word
Example #3
0
def print_label_phrase_dict(label_phrase_dict, id_phrase_map):
    """Print every label followed by its phrases and their stored values.

    For each label the output is: the label, a line of 80 asterisks, the
    number of entries stored for that label, and then one line per entry
    showing the phrase looked up in ``id_phrase_map`` (via the id returned
    by ``decrypt``) next to the value stored for that entry.

    Args:
        label_phrase_dict: Mapping ``label -> {encoded phrase: value}``.
        id_phrase_map: Mapping from phrase id to phrase text.
    """
    separator = "*" * 80
    for label, entries in label_phrase_dict.items():
        print(label)
        print(separator)
        print("Number of phrases: ", len(entries))
        for encoded, value in entries.items():
            phrase = id_phrase_map[decrypt(encoded)]
            print(phrase, value)
def conwea_seeds(df, id_phrase_map, topk=100):
    """Rank the terms of every label and write the results as JSON files.

    The per-label components returned by ``get_rank_matrix`` are sorted by
    descending ``"rank"``.  From that ordering three dictionaries are built
    per label:

    * the first ``topk`` terms for which ``decrypt`` returns ``None``
      (written to ``conwea_top100words.json``),
    * the first ``topk`` terms for which ``decrypt`` returns an id, keyed by
      the text produced by ``decipher_phrase``
      (written to ``conwea_top100phrases.json``),
    * every term with its rank
      (written to ``conwea_seeds_words_phrases.json``).

    All files are created under the module-level ``data_path``.  The file
    names contain "top100" regardless of the value of ``topk``.

    Args:
        df: Data passed on to the document-frequency and label helpers.
        id_phrase_map: Mapping from phrase id to phrase text.
        topk: Maximum number of words, and of phrases, kept per label.
    """
    docfreq = calculate_df_doc_freq(df)
    inv_docfreq = calculate_inv_doc_freq(df, docfreq)
    label_docs_dict = get_label_docs_dict(df)
    components = get_rank_matrix(docfreq,
                                 inv_docfreq,
                                 label_docs_dict,
                                 doc_freq_thresh=5)
    all_seeds = {}
    topk_words = {}
    topk_phrases = {}

    for label in label_docs_dict:
        # Highest rank first; dicts keep insertion order, so iterating
        # ``ranked`` below walks the terms from best to worst.
        ranked = dict(
            sorted(components[label].items(),
                   key=lambda item: -item[1]["rank"]))
        components[label] = ranked

        words = {}
        for term, stats in ranked.items():
            if len(words) >= topk:
                break
            if decrypt(term) is None:
                words[term] = stats["rank"]
        topk_words[label] = words

        phrases = {}
        for term, stats in ranked.items():
            if len(phrases) >= topk:
                break
            if decrypt(term) is not None:
                phrases[decipher_phrase(term, id_phrase_map)] = stats["rank"]
        topk_phrases[label] = phrases

        all_seeds[label] = {
            term: stats["rank"] for term, stats in ranked.items()
        }

    outputs = (
        ("conwea_top100words.json", topk_words),
        ("conwea_top100phrases.json", topk_phrases),
        ("conwea_seeds_words_phrases.json", all_seeds),
    )
    for filename, payload in outputs:
        # The context manager closes (and therefore flushes) each file even
        # if serialization fails part-way through.
        with open(data_path + filename, "w") as handle:
            json.dump(payload, handle)
Example #5
0
def print_label_term_dict(label_term_dict, components, id_phrase_map):
    """Print every label followed by its terms and their component entries.

    For each label the output is the label, a line of 80 asterisks, and one
    line per term.  A term whose ``decrypt`` id is a key of
    ``id_phrase_map`` is shown as the mapped phrase; any other term is shown
    as is.  Each term is printed next to ``components[label][term]``.  An
    exception raised while handling a term is reported on stdout and the
    loop moves on to the next term.

    Args:
        label_term_dict: Mapping ``label -> iterable of terms``.
        components: Mapping ``label -> {term: entry}``.
        id_phrase_map: Mapping from phrase id to phrase text.
    """
    separator = "*" * 80
    for label, terms in label_term_dict.items():
        print(label)
        print(separator)
        for val in terms:
            try:
                phrase_id = decrypt(val)
                if phrase_id is None or phrase_id not in id_phrase_map:
                    shown = val
                else:
                    shown = id_phrase_map[phrase_id]
                print(shown, components[label][val])
            except Exception as err:
                print("Exception occured: ", err, val)