def get_train_data(df, labels,
                   label_term_dict,
                   label_adult_dict,
                   label_actor_dict,
                   label_actress_dict,
                   label_producer_dict,
                   label_writer_dict,
                   label_director_dict,
                   label_composer_dict,
                   label_cinematographer_dict,
                   label_editor_dict,
                   label_prod_designer_dict,
                   label_dir_adult_dict,
                   label_dir_actor_dict,
                   label_dir_actress_dict,
                   label_dir_producer_dict,
                   label_dir_writer_dict,
                   label_dir_composer_dict,
                   label_dir_cinematographer_dict,
                   label_dir_editor_dict,
                   label_dir_prod_designer_dict,
                   label_actor_actress_dict, tokenizer, label_to_index, soft=False):
    """Build pseudo-labeled training data from seed words and movie metadata.

    Each row's text is tokenized and scanned for per-label seed words; the
    row's cast/crew metadata is matched against the unigram and
    director-bigram seed dictionaries via the ``get_int_*`` helpers.  All
    matches accumulate per-label evidence in ``count_dict``; any row with at
    least one match receives a pseudo-label via ``argmax_label`` (hard) or
    ``softmax_label`` (soft).

    Args:
        df: DataFrame with ``text`` and ``label`` columns plus whatever
            metadata columns the ``get_int_*`` helpers read from ``row``.
        labels: iterable of candidate label names.
        label_term_dict: ``{label: {seed_word: weight}}``.
        label_*_dict / label_dir_*_dict / label_actor_actress_dict:
            per-label metadata seed dictionaries (unigram and bigram).
        tokenizer: fitted Keras-style tokenizer (``word_index``,
            ``texts_to_sequences``).
        label_to_index: ``{label: index}`` mapping used for soft labels.
        soft: if True, emit soft (distribution) labels instead of argmax.

    Returns:
        ``(X, y, y_true)``: texts, pseudo-labels, and ground-truth labels for
        the rows that received a pseudo-label.
    """
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    # Invert the tokenizer vocabulary so token ids map back to surface words.
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    for index, row in df.iterrows():
        line = row["text"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = [index_word[tok] for tok in tokens]
        count_dict = {}
        flag = 0
        # Labels predicted from phrases alone / metadata alone, kept only for
        # the diagnostic report produced by analyze() below.
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(label_adult_dict,
                                        label_actor_dict,
                                        label_actress_dict,
                                        label_producer_dict,
                                        label_writer_dict,
                                        label_director_dict,
                                        label_composer_dict,
                                        label_cinematographer_dict,
                                        label_editor_dict,
                                        label_prod_designer_dict,
                                        label_dir_adult_dict,
                                        label_dir_actor_dict,
                                        label_dir_actress_dict,
                                        label_dir_producer_dict,
                                        label_dir_writer_dict,
                                        label_dir_composer_dict,
                                        label_dir_cinematographer_dict,
                                        label_dir_editor_dict,
                                        label_dir_prod_designer_dict,
                                        label_actor_actress_dict, row, labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            # Seed words of label l that actually occur in this document.
            int_labels = set(words).intersection(label_term_dict[l].keys())

            for word in words:
                if word in int_labels:
                    flag = 1
                    # Accumulate seed-word evidence; repeat hits add up.
                    word_counts = count_dict.setdefault(l, {})
                    word_counts[word] = word_counts.get(word, 0) + label_term_dict[l][word]

            # Unigram metadata evidence (adult flag, individual cast/crew).
            count_dict, flag = get_int_adults(flag, count_dict, l, label_adult_dict, row)
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_actor_dict, row, "actor")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_actress_dict, row, "actress")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_producer_dict, row, "producer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_writer_dict, row, "writer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_director_dict, row, "director")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_composer_dict, row, "composer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_cinematographer_dict, row, "cinematographer")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_editor_dict, row, "editor")
            count_dict, flag = get_int_unigram(flag, count_dict, l, label_prod_designer_dict, row, "prod_designer")

            # Bigram metadata evidence (director paired with other roles).
            count_dict, flag = get_int_dir_adult(flag, count_dict, l, label_dir_adult_dict, row)
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_actor_dict, row, "director", "actor")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_actress_dict, row, "director", "actress")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_producer_dict, row, "director", "producer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_writer_dict, row, "director", "writer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_composer_dict, row, "director", "composer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_cinematographer_dict, row, "director",
                                              "cinematographer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_editor_dict, row, "director", "editor")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_dir_prod_designer_dict, row, "director",
                                              "prod_designer")
            count_dict, flag = get_int_bigram(flag, count_dict, l, label_actor_actress_dict, row, "actor", "actress")

        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    # Ambiguous evidence: drop the row from every output list.
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
# ---- Ejemplo n.º 2 (0 votes) ----
def get_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                   label_author_pub_dict, label_pub_year_dict, label_author_year_dict, tokenizer, label_to_index,
                   soft=False, clf="HAN"):
    """Build pseudo-labeled training data from seed words and book metadata.

    Each row's text is scanned for per-label seed words, and its authors,
    publisher, publication year, and their pairwise combinations are matched
    against the corresponding per-label seed dictionaries.  Evidence is
    accumulated per label in ``count_dict``; any row with a match is assigned
    a pseudo-label via ``argmax_label`` (hard) or ``softmax_label`` (soft).

    Args:
        df: DataFrame with ``text``, ``label``, ``authors``, ``publisher``,
            and ``publication_year`` columns.
        labels: iterable of candidate label names.
        label_term_dict: ``{label: {seed_word: weight}}``.
        label_author_dict / label_pub_dict / label_year_dict: unigram
            metadata seeds per label.
        label_author_pub_dict / label_pub_year_dict / label_author_year_dict:
            bigram metadata seeds per label, keyed by tuples.
        tokenizer: fitted Keras-style tokenizer.
        label_to_index: ``{label: index}`` mapping used for soft labels.
        soft: if True, emit soft labels instead of argmax.
        clf: downstream classifier; "BERT" stores the row index in ``X``
            instead of the raw text.

    Returns:
        ``(X, y, y_true)`` for the rows that received a pseudo-label.
    """
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    # Invert the tokenizer vocabulary so token ids map back to surface words.
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    for index, row in df.iterrows():
        authors_set = set(row["authors"])
        pub = row["publisher"]
        line = row["text"]
        label = row["label"]
        year = row["publication_year"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = [index_word[tok] for tok in tokens]
        count_dict = {}
        flag = 0
        # Phrase-only / metadata-only predictions, kept for analyze() below.
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(authors_set, label_author_dict, label_pub_dict, label_year_dict,
                                        label_author_pub_dict, label_pub_year_dict, label_author_year_dict, row, labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            # Seed words of label l occurring in this document.
            int_labels = set(words).intersection(label_term_dict[l].keys())

            if len(label_author_dict) > 0:
                int_authors = authors_set.intersection(label_author_dict[l].keys())
            else:
                int_authors = []

            # Publisher match counts as metadata evidence for this label.
            if len(label_pub_dict) and len(label_pub_dict[l]) > 0:
                if pub in label_pub_dict[l]:
                    count_dict.setdefault(l, {})["PUB_" + str(pub)] = label_pub_dict[l][pub]
                    flag = 1

            # Publication-year match.
            if len(label_year_dict) and len(label_year_dict[l]) > 0:
                if year in label_year_dict[l]:
                    count_dict.setdefault(l, {})["YEAR_" + str(year)] = label_year_dict[l][year]
                    flag = 1

            # (publisher, year) bigram match.
            if len(label_pub_year_dict) and len(label_pub_year_dict[l]) > 0:
                if (pub, year) in label_pub_year_dict[l]:
                    count_dict.setdefault(l, {})["PUB_YEAR_" + str((pub, year))] = label_pub_year_dict[l][(pub, year)]
                    flag = 1

            # (author, publisher) bigrams present in both row and seeds.
            if len(label_author_pub_dict) > 0 and len(label_author_pub_dict[l]):
                row_auth_pubs = {(auth, pub) for auth in authors_set}
                int_auth_pubs = row_auth_pubs.intersection(label_author_pub_dict[l].keys())
            else:
                int_auth_pubs = []

            # (author, year) bigrams present in both row and seeds.
            if len(label_author_year_dict) > 0 and len(label_author_year_dict[l]):
                row_auth_years = {(auth, year) for auth in authors_set}
                int_auth_years = row_auth_years.intersection(label_author_year_dict[l].keys())
            else:
                int_auth_years = []

            for word in words:
                if word in int_labels:
                    flag = 1
                    # Accumulate seed-word evidence; repeat hits add up.
                    word_counts = count_dict.setdefault(l, {})
                    word_counts[word] = word_counts.get(word, 0) + label_term_dict[l][word]

            for auth in int_authors:
                count_dict.setdefault(l, {})["AUTH_" + str(auth)] = label_author_dict[l][auth]
                flag = 1

            for auth_pub in int_auth_pubs:
                count_dict.setdefault(l, {})["AUTH_PUB_" + str(auth_pub)] = label_author_pub_dict[l][auth_pub]
                flag = 1

            for auth_year in int_auth_years:
                count_dict.setdefault(l, {})["AUTH_YEAR_" + str(auth_year)] = label_author_year_dict[l][auth_year]
                flag = 1

        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    # Ambiguous evidence: drop the row from every output list.
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            if clf == "BERT":
                # BERT pipeline retrieves the raw text later by row index.
                X.append(index)
            else:
                X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
# ---- Ejemplo n.º 3 (0 votes) ----
def get_train_data(df, labels, label_term_dict, label_author_dict, label_conf_dict, tokenizer, label_to_index,
                   ignore_metadata=True, soft=False, clf="HAN"):
    """Build pseudo-labeled training data from seed words plus author and
    conference metadata (paper/abstract dataset).

    Args:
        df: DataFrame with ``abstract``, ``label``, ``authors`` (comma-joined
            string), and ``conf`` columns.
        labels: iterable of candidate label names.
        label_term_dict: ``{label: {seed_word: weight}}``.
        label_author_dict / label_conf_dict: per-label metadata seeds.
        tokenizer: fitted Keras-style tokenizer.
        label_to_index: ``{label: index}`` mapping used for soft labels.
        ignore_metadata: if True, metadata alone never labels a row — at
            least one seed-word hit is required per label before author or
            conference evidence is considered.
        soft: if True, emit soft labels instead of argmax.
        clf: downstream classifier; "BERT" stores the row index in ``X``
            instead of the raw abstract.

    Returns:
        ``(X, y, y_true)`` for the rows that received a pseudo-label.
    """
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    # Invert the tokenizer vocabulary so token ids map back to surface words.
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    for index, row in df.iterrows():
        authors_set = set(row["authors"].split(","))
        conf = row["conf"]
        line = row["abstract"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = [index_word[tok] for tok in tokens]
        count_dict = {}
        flag = 0
        # Phrase-only / metadata-only predictions, kept for analyze() below.
        l_phrase = get_phrase_label(words, label_term_dict, labels, label_to_index, soft=soft)
        l_metadata = get_metadata_label(authors_set, label_author_dict, conf, label_conf_dict, labels, label_to_index,
                                        soft=soft)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            # Seed words of label l occurring in this document.
            int_labels = set(words).intersection(label_term_dict[l].keys())

            if len(label_author_dict) > 0:
                int_authors = authors_set.intersection(label_author_dict[l].keys())
            else:
                int_authors = []
            # Without a seed-word hit, skip this label's metadata entirely.
            if ignore_metadata and len(int_labels) == 0:
                continue
            for word in words:
                if word in int_labels:
                    flag = 1
                    # Accumulate seed-word evidence; repeat hits add up.
                    word_counts = count_dict.setdefault(l, {})
                    word_counts[word] = word_counts.get(word, 0) + label_term_dict[l][word]

            for auth in int_authors:
                count_dict.setdefault(l, {})["AUTH_" + str(auth)] = label_author_dict[l][auth]
                flag = 1

            # Conference match counts as metadata evidence for this label.
            if len(label_conf_dict) and len(label_conf_dict[l]) > 0:
                if conf in label_conf_dict[l]:
                    count_dict.setdefault(l, {})["CONF_" + str(conf)] = label_conf_dict[l][conf]
                    flag = 1

        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    # Ambiguous evidence: drop the row from every output list.
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            if clf == "BERT":
                # BERT pipeline retrieves the raw text later by row index.
                X.append(index)
            else:
                X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
# ---- Ejemplo n.º 4 (0 votes) ----
def get_train_data(df,
                   labels,
                   label_term_dict,
                   label_author_dict,
                   label_attr_dict,
                   label_author_attr_dict,
                   tokenizer,
                   label_to_index,
                   ignore_metadata=True,
                   soft=False):
    """Build pseudo-labeled training data from seed words plus user and
    attribute metadata (review dataset).

    Args:
        df: DataFrame with ``Review``, ``label``, and ``Users`` columns plus
            whatever attribute columns ``get_all_keys`` reads from ``row``.
        labels: iterable of candidate label names.
        label_term_dict: ``{label: {seed_word: weight}}``.
        label_author_dict / label_attr_dict: unigram metadata seeds per label.
        label_author_attr_dict: (user, attribute) bigram seeds per label.
        tokenizer: fitted Keras-style tokenizer.
        label_to_index: ``{label: index}`` mapping used for soft labels.
        ignore_metadata: if True, metadata alone never labels a row — at
            least one seed-word hit is required per label.
        soft: if True, emit soft labels instead of argmax.

    Returns:
        ``(X, y, y_true)`` for the rows that received a pseudo-label.
    """
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    # Invert the tokenizer vocabulary so token ids map back to surface words.
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    for index, row in df.iterrows():
        authors_set = set(row["Users"])
        line = row["Review"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = [index_word[tok] for tok in tokens]
        count_dict = {}
        flag = 0
        # Phrase-only / metadata-only predictions, kept for analyze() below.
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(authors_set, label_author_dict,
                                        label_attr_dict,
                                        label_author_attr_dict, row, labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            # Seed words of label l occurring in this document.
            int_labels = set(words).intersection(label_term_dict[l].keys())

            if len(label_author_dict) > 0:
                int_authors = authors_set.intersection(label_author_dict[l].keys())
            else:
                int_authors = []

            if len(label_attr_dict) > 0:
                row_attrs = get_all_keys(row)
                int_attrs = row_attrs.intersection(label_attr_dict[l].keys())
            else:
                int_attrs = []

            # (user, attribute) bigrams present in both row and seeds.
            if len(label_author_attr_dict) > 0:
                row_attrs = get_all_keys(row)
                row_auth_attrs = {(aut, attr)
                                  for aut in authors_set
                                  for attr in row_attrs}
                int_auth_attrs = row_auth_attrs.intersection(label_author_attr_dict[l].keys())
            else:
                int_auth_attrs = []

            # Without a seed-word hit, skip this label's metadata entirely.
            if ignore_metadata and len(int_labels) == 0:
                continue
            for word in words:
                if word in int_labels:
                    flag = 1
                    # Accumulate seed-word evidence; repeat hits add up.
                    word_counts = count_dict.setdefault(l, {})
                    word_counts[word] = word_counts.get(word, 0) + label_term_dict[l][word]

            for auth in int_authors:
                count_dict.setdefault(l, {})["AUTH_" + str(auth)] = label_author_dict[l][auth]
                flag = 1

            for attr in int_attrs:
                count_dict.setdefault(l, {})["ATTR_" + str(attr)] = label_attr_dict[l][attr]
                flag = 1

            for auth_attr in int_auth_attrs:
                count_dict.setdefault(l, {})["AUTH_ATTR_" + str(auth_attr)] = label_author_attr_dict[l][auth_attr]
                flag = 1

        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    # Ambiguous evidence: drop the row from every output list.
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true
def get_train_data(df,
                   labels,
                   label_term_dict,
                   label_user_dict,
                   label_tag_dict,
                   label_user_tag_dict,
                   tokenizer,
                   label_to_index,
                   soft=False):
    """Build pseudo-labeled training data from seed words plus user and tag
    metadata.

    Args:
        df: DataFrame with ``text``, ``label``, ``user``, and ``tags``
            columns.
        labels: iterable of candidate label names.
        label_term_dict: ``{label: {seed_word: weight}}``.
        label_user_dict / label_tag_dict: unigram metadata seeds per label.
        label_user_tag_dict: (user, tag) bigram seeds per label.
        tokenizer: fitted Keras-style tokenizer.
        label_to_index: ``{label: index}`` mapping used for soft labels.
        soft: if True, emit soft labels instead of argmax.

    Returns:
        ``(X, y, y_true)`` for the rows that received a pseudo-label.
    """
    y = []
    X = []
    y_true = []
    y_phrase = []
    y_metadata = []
    y_true_all = []
    y_pseudo_all = []
    # Invert the tokenizer vocabulary so token ids map back to surface words.
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    for index, row in df.iterrows():
        tags_set = set(row["tags"])
        user = row["user"]
        line = row["text"]
        label = row["label"]
        tokens = tokenizer.texts_to_sequences([line])[0]
        words = [index_word[tok] for tok in tokens]
        count_dict = {}
        flag = 0
        # Phrase-only / metadata-only predictions, kept for analyze() below.
        l_phrase = get_phrase_label(words, label_term_dict, labels)
        l_metadata = get_metadata_label(tags_set, label_user_dict,
                                        label_tag_dict, label_user_tag_dict,
                                        row, labels)
        y_phrase.append(l_phrase)
        y_metadata.append(l_metadata)
        for l in labels:
            # Seed words of label l occurring in this document.
            int_labels = set(words).intersection(label_term_dict[l].keys())

            if len(label_tag_dict) > 0:
                int_tags = tags_set.intersection(label_tag_dict[l].keys())
            else:
                int_tags = []

            # User match counts as metadata evidence for this label.
            if len(label_user_dict) and len(label_user_dict[l]) > 0:
                if user in label_user_dict[l]:
                    count_dict.setdefault(l, {})["USER_" + str(user)] = label_user_dict[l][user]
                    flag = 1

            # (user, tag) bigrams present in both row and seeds.
            if len(label_user_tag_dict) > 0 and len(label_user_tag_dict[l]):
                row_user_tags = {(user, tag) for tag in tags_set}
                int_user_tags = row_user_tags.intersection(label_user_tag_dict[l].keys())
            else:
                int_user_tags = []

            for word in words:
                if word in int_labels:
                    flag = 1
                    # Accumulate seed-word evidence; repeat hits add up.
                    word_counts = count_dict.setdefault(l, {})
                    word_counts[word] = word_counts.get(word, 0) + label_term_dict[l][word]

            for tag in int_tags:
                count_dict.setdefault(l, {})["TAG_" + str(tag)] = label_tag_dict[l][tag]
                flag = 1

            for user_tag in int_user_tags:
                count_dict.setdefault(l, {})["USER_TAG_" + str(user_tag)] = label_user_tag_dict[l][user_tag]
                flag = 1

        if flag:
            if not soft:
                lbl = argmax_label(count_dict)
                if not lbl:
                    # Ambiguous evidence: drop the row from every output list.
                    continue
            else:
                lbl = softmax_label(count_dict, label_to_index)
            y.append(lbl)
            X.append(line)
            y_true.append(label)
            y_pseudo_all.append(lbl)
            y_true_all.append(label)
        else:
            y_pseudo_all.append(None)
            y_true_all.append(label)
    analyze(y_pseudo_all, y_phrase, y_metadata, y_true_all)
    return X, y, y_true