def detect_phrase(sentence, tokenizer, index_word, id_phrase_map, idx):
    """Return the words of ``sentence`` that resolve to an id in ``id_phrase_map``.

    The sentence is converted to token ids with
    ``tokenizer.texts_to_sequences`` and every token id is mapped back to its
    word through ``index_word``.  A word is kept when either

    * ``decrypt(word)`` returns an id that is a key of ``id_phrase_map``, or
    * the word starts with ``"fnust"`` and the digits directly after that
      prefix (at most five are considered) form an id that is a key of
      ``id_phrase_map``.

    A word that starts with ``"fnust"`` but has no digit right after the
    prefix is printed to stdout together with ``idx``.

    Args:
        sentence: Text to scan.
        tokenizer: Object exposing ``texts_to_sequences(list_of_texts)``.
        index_word: Mapping from token id to word.
        id_phrase_map: Mapping whose keys are the known ids.
        idx: Value printed next to malformed ``"fnust"`` words.

    Returns:
        list: The matching words, in the order they occur in the sentence.
    """
    prefix = "fnust"
    max_digits = 5  # only this many characters after the prefix are inspected

    detected = []
    for tok in tokenizer.texts_to_sequences([sentence])[0]:
        # Deliberate best effort: a token that cannot be looked up or decoded
        # is skipped so that one bad token does not abort the whole sentence.
        try:
            word = index_word[tok]
            phrase_id = decrypt(word)
            if phrase_id is not None and phrase_id in id_phrase_map:
                detected.append(word)
                continue
            if not word.startswith(prefix):
                continue

            # Collect the run of digits that follows the prefix.  Building the
            # run explicitly (instead of slicing with the loop index) keeps
            # the last digit when the digits extend to the end of the word.
            start = len(prefix)
            digits = ""
            for char in word[start:start + max_digits]:
                if not char.isdecimal():
                    break
                digits += char

            if not digits:
                print(idx, word)
            elif int(digits) in id_phrase_map:
                detected.append(word)
        except Exception:
            continue
    return detected
def decipher_phrase(word, id_phrase_map):
    """Map an encoded word back to its entry in ``id_phrase_map``.

    Resolution order:

    1. If ``decrypt(word)`` returns an id that is a key of ``id_phrase_map``,
       the mapped value is returned.
    2. Otherwise, if ``word`` does not start with ``"fnust"``, ``word`` is
       returned unchanged.
    3. Otherwise the digits directly after the prefix (at most five are
       considered) are read as an id; when that id is a key of
       ``id_phrase_map`` the mapped value is returned, else ``word`` is
       returned unchanged.

    Args:
        word: The token to decode.
        id_phrase_map: Mapping from id to phrase.

    Returns:
        The value from ``id_phrase_map`` when an id could be resolved,
        otherwise ``word`` itself.

    Raises:
        ValueError: If ``word`` starts with ``"fnust"`` but no digit follows
            the prefix.
    """
    prefix = "fnust"
    max_digits = 5  # only this many characters after the prefix are inspected

    phrase_id = decrypt(word)
    if phrase_id is not None and phrase_id in id_phrase_map:
        return id_phrase_map[phrase_id]
    if not word.startswith(prefix):
        return word

    # Collect the run of digits that follows the prefix.  Building the run
    # explicitly (instead of slicing with the loop index) keeps the last digit
    # when the digits extend to the end of the word, and avoids int("").
    start = len(prefix)
    digits = ""
    for char in word[start:start + max_digits]:
        if not char.isdecimal():
            break
        digits += char

    if not digits:
        raise ValueError("Something unexpected found: ", word)

    fallback_id = int(digits)
    if fallback_id in id_phrase_map:
        # Diagnostic trace emitted whenever the prefix fallback resolves.
        print("I am here")
        return id_phrase_map[fallback_id]
    return word
def print_label_phrase_dict(label_phrase_dict, id_phrase_map):
    """Print every label followed by its decoded phrases and their values.

    For each label the output is: the label, a line of 80 asterisks, the
    number of entries stored under that label, and then one line per entry
    showing ``id_phrase_map[decrypt(key)]`` next to the stored value.

    Args:
        label_phrase_dict: Mapping ``label -> {encoded key -> value}``.
        id_phrase_map: Mapping from the id returned by ``decrypt`` to the
            text that is printed.
    """
    separator = "*" * 80
    for label in label_phrase_dict:
        entries = label_phrase_dict[label]
        print(label)
        print(separator)
        print("Number of phrases: ", len(entries))
        for encoded in entries:
            print(id_phrase_map[decrypt(encoded)], entries[encoded])
def conwea_seeds(df, id_phrase_map, topk=100):
    """Rank the terms of every label and dump the results as JSON files.

    The ranking comes from ``get_rank_matrix``; each label's entries are
    ordered by descending ``"rank"``.  Three files are written, each path
    built as the module-level ``data_path`` followed by the file name:

    * ``conwea_top100words.json`` - per label, up to ``topk`` best-ranked
      keys for which ``decrypt`` returns ``None``, mapped to their rank.
    * ``conwea_top100phrases.json`` - per label, up to ``topk`` entries for
      keys where ``decrypt`` returns a value, keyed by
      ``decipher_phrase(key, id_phrase_map)``.
    * ``conwea_seeds_words_phrases.json`` - per label, every key mapped to
      its rank.

    The file names are fixed; they do not change with ``topk``.

    Args:
        df: Input passed to the document-frequency and label helpers.
        id_phrase_map: Mapping handed to ``decipher_phrase``.
        topk: Maximum number of words / phrases kept per label.
    """
    docfreq = calculate_df_doc_freq(df)
    inv_docfreq = calculate_inv_doc_freq(df, docfreq)
    label_docs_dict = get_label_docs_dict(df)
    components = get_rank_matrix(
        docfreq, inv_docfreq, label_docs_dict, doc_freq_thresh=5
    )

    all_seeds = {}
    topk_words = {}
    topk_phrases = {}
    for label in label_docs_dict:
        # Highest rank first; dicts preserve this insertion order.
        ranked = dict(
            sorted(components[label].items(), key=lambda item: -item[1]["rank"])
        )
        components[label] = ranked

        words = {}
        for term, stats in ranked.items():
            if len(words) >= topk:
                break
            if decrypt(term) is None:
                words[term] = stats["rank"]
        topk_words[label] = words

        # Several keys may decipher to the same phrase, so the limit is
        # checked against the number of distinct phrases collected so far.
        phrases = {}
        for term, stats in ranked.items():
            if len(phrases) >= topk:
                break
            if decrypt(term) is not None:
                phrases[decipher_phrase(term, id_phrase_map)] = stats["rank"]
        topk_phrases[label] = phrases

        all_seeds[label] = {term: stats["rank"] for term, stats in ranked.items()}

    outputs = (
        ("conwea_top100words.json", topk_words),
        ("conwea_top100phrases.json", topk_phrases),
        ("conwea_seeds_words_phrases.json", all_seeds),
    )
    for filename, payload in outputs:
        # The context manager closes (and therefore flushes) every file.
        with open(data_path + filename, "w") as handle:
            json.dump(payload, handle)
def print_label_term_dict(label_term_dict, components, id_phrase_map):
    """Print every label followed by its terms and their component entries.

    For each label the output is: the label, a line of 80 asterisks, and then
    one line per term.  A term whose ``decrypt`` result is a key of
    ``id_phrase_map`` is shown as the mapped phrase; any other term is shown
    as is.  Each line also carries ``components[label][term]``.  A failure
    while handling a term is reported on stdout and the loop moves on.

    Args:
        label_term_dict: Mapping ``label -> iterable of terms``.
        components: Mapping ``label -> {term -> value to print}``.
        id_phrase_map: Mapping from the id returned by ``decrypt`` to phrase.
    """
    divider = "*" * 80
    for label in label_term_dict:
        print(label)
        print(divider)
        for term in label_term_dict[label]:
            try:
                phrase_id = decrypt(term)
                is_phrase = phrase_id is not None and phrase_id in id_phrase_map
                shown = id_phrase_map[phrase_id] if is_phrase else term
                print(shown, components[label][term])
            except Exception as err:
                print("Exception occured: ", err, term)