def get_train_data(df, labels, label_term_dict, label_adult_dict, label_actor_dict, label_actress_dict,
                   label_producer_dict, label_writer_dict, label_director_dict, label_composer_dict,
                   label_cinematographer_dict, label_editor_dict, label_prod_designer_dict, label_dir_adult_dict,
                   label_dir_actor_dict, label_dir_actress_dict, label_dir_producer_dict, label_dir_writer_dict,
                   label_dir_composer_dict, label_dir_cinematographer_dict, label_dir_editor_dict,
                   label_dir_prod_designer_dict, label_actor_actress_dict, tokenizer, label_to_index, soft=False):
    # Variant for documents with movie-style metadata (cast, crew, and director pairings).
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    # Invert the tokenizer vocabulary so token ids map back to surface words.
    index_word = {}
    for w in tokenizer.word_index:
        index_word[tokenizer.word_index[w]] = w
    for index, row in df.iterrows():
        line = row["text"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = []
        for tok in tokens:
            words.append(index_word[tok])
        count_dict = {}
        flag = 0
        # Pseudo-labels from seed phrases alone and from metadata alone, kept for analyze().
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(label_adult_dict, label_actor_dict, label_actress_dict, label_producer_dict,
                                        label_writer_dict, label_director_dict, label_composer_dict,
                                        label_cinematographer_dict, label_editor_dict, label_prod_designer_dict,
                                        label_dir_adult_dict, label_dir_actor_dict, label_dir_actress_dict,
                                        label_dir_producer_dict, label_dir_writer_dict, label_dir_composer_dict,
                                        label_dir_cinematographer_dict, label_dir_editor_dict,
                                        label_dir_prod_designer_dict, label_actor_actress_dict, row, labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            # Accumulate evidence from seed words that occur in the document.
            seed_words = set(label_term_dict[l].keys())
            int_labels = list(set(words).intersection(seed_words))
            for word in words:
                if word in int_labels:
                    flag = 1
                    count_dict.setdefault(l, {})
                    count_dict[l][word] = count_dict[l].get(word, 0) + label_term_dict[l][word]
            # Accumulate evidence from each metadata field and from pairwise combinations.
            count_dict, flag = get_int_adults(flag, count_dict, l, label_adult_dict, row)
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_actor_dict, row, "actor")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_actress_dict, row, "actress")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_producer_dict, row, "producer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_writer_dict, row, "writer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_director_dict, row, "director")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_composer_dict, row, "composer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_cinematographer_dict, row, "cinematographer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_editor_dict, row, "editor")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_prod_designer_dict, row, "prod_designer")
            count_dict, flag = get_int_dir_adult(flag, count_dict, l, label_dir_adult_dict, row)
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_actor_dict, row, "director", "actor")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_actress_dict, row, "director", "actress")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_producer_dict, row, "director", "producer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_writer_dict, row, "director", "writer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_composer_dict, row, "director", "composer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_cinematographer_dict, row, "director",
                                              "cinematographer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_editor_dict, row, "director", "editor")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_prod_designer_dict, row, "director",
                                              "prod_designer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_actor_actress_dict, row, "actor", "actress")
        if flag:
            # At least one seed matched: convert the accumulated evidence into a pseudo-label.
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
def get_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                   label_author_pub_dict, label_pub_year_dict, label_author_year_dict, tokenizer, label_to_index,
                   soft=False, clf="HAN"):
    # Variant for documents with author, publisher, and publication-year metadata.
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    index_word = {}
    for w in tokenizer.word_index:
        index_word[tokenizer.word_index[w]] = w
    for index, row in df.iterrows():
        authors_set = set(row["authors"])
        pub = row["publisher"]
        line = row["text"]
        label = row["label"]
        year = row["publication_year"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = []
        for tok in tokens:
            words.append(index_word[tok])
        count_dict = {}
        flag = 0
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(authors_set, label_author_dict, label_pub_dict, label_year_dict,
                                        label_author_pub_dict, label_pub_year_dict, label_author_year_dict, row,
                                        labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            seed_words = set(label_term_dict[l].keys())
            int_labels = list(set(words).intersection(seed_words))
            if len(label_author_dict) > 0:
                seed_authors = set(label_author_dict[l].keys())
                int_authors = authors_set.intersection(seed_authors)
            else:
                int_authors = []
            # Publisher, year, and (publisher, year) seeds contribute directly.
            if len(label_pub_dict) > 0 and len(label_pub_dict[l]) > 0:
                seed_pubs = set(label_pub_dict[l].keys())
                if pub in seed_pubs:
                    count_dict.setdefault(l, {})["PUB_" + str(pub)] = label_pub_dict[l][pub]
                    flag = 1
            if len(label_year_dict) > 0 and len(label_year_dict[l]) > 0:
                seed_years = set(label_year_dict[l].keys())
                if year in seed_years:
                    count_dict.setdefault(l, {})["YEAR_" + str(year)] = label_year_dict[l][year]
                    flag = 1
            if len(label_pub_year_dict) > 0 and len(label_pub_year_dict[l]) > 0:
                seed_pub_years = set(label_pub_year_dict[l].keys())
                if (pub, year) in seed_pub_years:
                    count_dict.setdefault(l, {})["PUB_YEAR_" + str((pub, year))] = label_pub_year_dict[l][(pub, year)]
                    flag = 1
            # (author, publisher) and (author, year) pair seeds.
            if len(label_author_pub_dict) > 0 and len(label_author_pub_dict[l]) > 0:
                seed_author_pubs = set(label_author_pub_dict[l].keys())
                row_auth_pubs = set()
                for auth in authors_set:
                    row_auth_pubs.add((auth, pub))
                int_auth_pubs = row_auth_pubs.intersection(seed_author_pubs)
            else:
                int_auth_pubs = []
            if len(label_author_year_dict) > 0 and len(label_author_year_dict[l]) > 0:
                seed_author_years = set(label_author_year_dict[l].keys())
                row_auth_years = set()
                for auth in authors_set:
                    row_auth_years.add((auth, year))
                int_auth_years = row_auth_years.intersection(seed_author_years)
            else:
                int_auth_years = []
            for word in words:
                if word in int_labels:
                    flag = 1
                    count_dict.setdefault(l, {})
                    count_dict[l][word] = count_dict[l].get(word, 0) + label_term_dict[l][word]
            for auth in int_authors:
                count_dict.setdefault(l, {})["AUTH_" + str(auth)] = label_author_dict[l][auth]
                flag = 1
            for auth_pub in int_auth_pubs:
                count_dict.setdefault(l, {})["AUTH_PUB_" + str(auth_pub)] = label_author_pub_dict[l][auth_pub]
                flag = 1
            for auth_year in int_auth_years:
                count_dict.setdefault(l, {})["AUTH_YEAR_" + str(auth_year)] = label_author_year_dict[l][auth_year]
                flag = 1
        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            # The BERT pipeline stores the row index; HAN stores the raw text.
            if clf == "BERT":
                X.append(index)
            else:
                X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
def get_train_data(df, labels, label_term_dict, label_author_dict, label_conf_dict, tokenizer, label_to_index,
                   ignore_metadata=True, soft=False, clf="HAN"):
    # Variant for paper abstracts with author and conference metadata.
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    index_word = {}
    for w in tokenizer.word_index:
        index_word[tokenizer.word_index[w]] = w
    for index, row in df.iterrows():
        auth_str = row["authors"]
        authors_set = set(auth_str.split(","))
        conf = row["conf"]
        line = row["abstract"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = []
        for tok in tokens:
            words.append(index_word[tok])
        count_dict = {}
        flag = 0
        l_phrase = get_phrase_label(words, label_term_dict, labels, label_to_index, soft=soft)
        l_metadata = get_metadata_label(authors_set, label_author_dict, conf, label_conf_dict, labels, label_to_index,
                                        soft=soft)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            seed_words = set(label_term_dict[l].keys())
            int_labels = list(set(words).intersection(seed_words))
            if len(label_author_dict) > 0:
                seed_authors = set(label_author_dict[l].keys())
                int_authors = authors_set.intersection(seed_authors)
            else:
                int_authors = []
            # With ignore_metadata set, only documents matching a seed word count for this label.
            if ignore_metadata and len(int_labels) == 0:
                continue
            for word in words:
                if word in int_labels:
                    flag = 1
                    count_dict.setdefault(l, {})
                    count_dict[l][word] = count_dict[l].get(word, 0) + label_term_dict[l][word]
            for auth in int_authors:
                count_dict.setdefault(l, {})["AUTH_" + str(auth)] = label_author_dict[l][auth]
                flag = 1
            if len(label_conf_dict) > 0 and len(label_conf_dict[l]) > 0:
                seed_conf = set(label_conf_dict[l].keys())
                if conf in seed_conf:
                    count_dict.setdefault(l, {})["CONF_" + str(conf)] = label_conf_dict[l][conf]
                    flag = 1
        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            if clf == "BERT":
                X.append(index)
            else:
                X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
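# Usage sketch for the variant above. Everything here is illustrative: the
# DataFrame rows, label names, and seed weights are invented, and the call only
# runs with this module's own helpers (get_phrase_label, get_metadata_label,
# argmax_label, analyze) importable alongside it.
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

df = pd.DataFrame({
    "authors": ["a smith,b jones", "c wu"],
    "conf": ["SIGIR", "VLDB"],
    "abstract": ["neural models for document retrieval", "query optimization in large databases"],
    "label": ["information_retrieval", "databases"],
})
labels = ["information_retrieval", "databases"]
label_to_index = {l: i for i, l in enumerate(labels)}
# Each seed dictionary maps a label to {seed: weight}.
label_term_dict = {"information_retrieval": {"retrieval": 1.0}, "databases": {"query": 1.0}}
label_author_dict = {l: {} for l in labels}
label_conf_dict = {"information_retrieval": {"SIGIR": 1.0}, "databases": {"VLDB": 1.0}}

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["abstract"])
X, y, y_true = get_train_data(df, labels, label_term_dict, label_author_dict, label_conf_dict,
                              tokenizer, label_to_index, ignore_metadata=False, soft=False, clf="HAN")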
def get_train_data(df, labels, label_term_dict, label_author_dict, label_attr_dict, label_author_attr_dict, tokenizer,
                   label_to_index, ignore_metadata=True, soft=False):
    # Variant for reviews with user and attribute metadata.
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    index_word = {}
    for w in tokenizer.word_index:
        index_word[tokenizer.word_index[w]] = w
    for index, row in df.iterrows():
        authors_set = set(row["Users"])
        line = row["Review"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = []
        for tok in tokens:
            words.append(index_word[tok])
        count_dict = {}
        flag = 0
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(authors_set, label_author_dict, label_attr_dict, label_author_attr_dict, row,
                                        labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            seed_words = set(label_term_dict[l].keys())
            int_labels = list(set(words).intersection(seed_words))
            if len(label_author_dict) > 0:
                seed_authors = set(label_author_dict[l].keys())
                int_authors = authors_set.intersection(seed_authors)
            else:
                int_authors = []
            if len(label_attr_dict) > 0:
                seed_attrs = set(label_attr_dict[l].keys())
                row_attrs = get_all_keys(row)
                int_attrs = row_attrs.intersection(seed_attrs)
            else:
                int_attrs = []
            # (user, attribute) pair seeds.
            if len(label_author_attr_dict) > 0:
                seed_author_attrs = set(label_author_attr_dict[l].keys())
                row_attrs = get_all_keys(row)
                row_auth_attrs = set()
                for aut in authors_set:
                    for attr in row_attrs:
                        row_auth_attrs.add((aut, attr))
                int_auth_attrs = row_auth_attrs.intersection(seed_author_attrs)
            else:
                int_auth_attrs = []
            if ignore_metadata and len(int_labels) == 0:
                continue
            for word in words:
                if word in int_labels:
                    flag = 1
                    count_dict.setdefault(l, {})
                    count_dict[l][word] = count_dict[l].get(word, 0) + label_term_dict[l][word]
            for auth in int_authors:
                count_dict.setdefault(l, {})["AUTH_" + str(auth)] = label_author_dict[l][auth]
                flag = 1
            for attr in int_attrs:
                count_dict.setdefault(l, {})["ATTR_" + str(attr)] = label_attr_dict[l][attr]
                flag = 1
            for auth_attr in int_auth_attrs:
                count_dict.setdefault(l, {})["AUTH_ATTR_" + str(auth_attr)] = label_author_attr_dict[l][auth_attr]
                flag = 1
        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
def get_train_data(df, labels, label_term_dict, label_user_dict, label_tag_dict, label_user_tag_dict, tokenizer,
                   label_to_index, soft=False):
    # Variant for posts with user and tag metadata.
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    index_word = {}
    for w in tokenizer.word_index:
        index_word[tokenizer.word_index[w]] = w
    for index, row in df.iterrows():
        tags_set = set(row["tags"])
        user = row["user"]
        line = row["text"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = []
        for tok in tokens:
            words.append(index_word[tok])
        count_dict = {}
        flag = 0
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(tags_set, label_user_dict, label_tag_dict, label_user_tag_dict, row, labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            seed_words = set(label_term_dict[l].keys())
            int_labels = list(set(words).intersection(seed_words))
            if len(label_tag_dict) > 0:
                seed_tags = set(label_tag_dict[l].keys())
                int_tags = tags_set.intersection(seed_tags)
            else:
                int_tags = []
            if len(label_user_dict) > 0 and len(label_user_dict[l]) > 0:
                seed_users = set(label_user_dict[l].keys())
                if user in seed_users:
                    count_dict.setdefault(l, {})["USER_" + str(user)] = label_user_dict[l][user]
                    flag = 1
            # (user, tag) pair seeds.
            if len(label_user_tag_dict) > 0 and len(label_user_tag_dict[l]) > 0:
                seed_user_tags = set(label_user_tag_dict[l].keys())
                row_user_tags = set()
                for tag in tags_set:
                    row_user_tags.add((user, tag))
                int_user_tags = row_user_tags.intersection(seed_user_tags)
            else:
                int_user_tags = []
            for word in words:
                if word in int_labels:
                    flag = 1
                    count_dict.setdefault(l, {})
                    count_dict[l][word] = count_dict[l].get(word, 0) + label_term_dict[l][word]
            for tag in int_tags:
                count_dict.setdefault(l, {})["TAG_" + str(tag)] = label_tag_dict[l][tag]
                flag = 1
            for user_tag in int_user_tags:
                count_dict.setdefault(l, {})["USER_TAG_" + str(user_tag)] = label_user_tag_dict[l][user_tag]
                flag = 1
        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
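# Worked example (invented values) of the evidence structure every variant
# accumulates: count_dict maps each candidate label to its matched seeds, with
# metadata matches disambiguated from plain words by prefixes such as "TAG_"
# and "USER_".
example_count_dict = {
    "security": {"vulnerability": 2.0, "TAG_exploit": 1.0, "USER_alice": 3.0},
    "databases": {"sql": 1.0},
}
# Under the argmax contract sketched earlier, "security" wins (total weight 6.0
# vs. 1.0); with soft=True the totals would instead be normalized into a
# distribution over labels.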