Example #1
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Reconstruction note: the original snippet starts inside the read loop, so the
# imports, tokenizer/model setup, containers, and the path_data default below
# are assumptions inferred from how the names are used further down. SciBERT is
# implied elsewhere in this collection (scibert_AASC*.npy); build_ent_vocab is
# a project-local helper that is not shown here.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
model.eval()


def load_data_SVM_with_context(ent_vocab, path_data="citation_contexts.tsv"):
    # ent_vocab mirrors the original call signature and is unused in this
    # fragment; path_data is an assumed placeholder for the TSV file with one
    # citation context per line.
    X, y = [], []
    intentdict = {}
    intentn = 0
    with open(path_data) as f:
        for line in f:
            l = line.rstrip("\n").split("\t")
            target_id = l[0]
            source_id = l[1]
            intent = l[2]
            left_citated_text = l[3]
            right_citated_text = l[4]
            # Keep at most 250 tokens on each side of the citation mark.
            left_citation_tokenized = tokenizer.tokenize(
                left_citated_text)[-250:]
            right_citation_tokenized = tokenizer.tokenize(
                right_citated_text)[:250]
            input_tokens = tokenizer.convert_tokens_to_ids(
                left_citation_tokenized) + [
                    tokenizer.sep_token_id
                ] + tokenizer.convert_tokens_to_ids(right_citation_tokenized)
            position_citation_mark = len(left_citation_tokenized)
            tokens_tensor = torch.tensor([input_tokens])
            # The [SEP] token stands in for the citation mark; its contextual
            # embedding becomes the feature vector for this example.
            with torch.no_grad():
                outputs = model(tokens_tensor)
            emb = np.array(outputs[0][0][position_citation_mark].cpu())
            if intent not in intentdict:
                intentdict[intent] = intentn
                intentn += 1
            X.append(emb)
            y.append(intentdict[intent])
    return X, y


if __name__ == "__main__":
    ent_vocab = build_ent_vocab(
        "/home/ohagi_masaya/TransBasedCitEmb/dataset/AASC/train.csv")
    load_data_SVM_with_context(ent_vocab)
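
The function above only produces the feature vectors X and the intent labels y; the classifier the name refers to is not shown. A minimal follow-up sketch, assuming scikit-learn is available (not part of the original snippet):

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = load_data_SVM_with_context(ent_vocab)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
clf = SVC(kernel="linear")
clf.fit(X_train, y_train)
print("intent classification accuracy:", clf.score(X_test, y_test))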
Example #2
# Assumed context (not shown in this snippet): csv, os, pandas as pd, the
# project-local settings module, build_ent_vocab, and the AASCDataSet,
# AASCDataSetRANDOM and AASCDataSetBOTH dataset classes.
def load_AASC_graph_data(args):
    def extract_by_frequency(path_train, path_test, frequency):
        # Keep only citations whose source paper occurs at least `frequency`
        # times across the (de-duplicated) train and test splits, write the
        # filtered splits to *_frequency<N>.csv, and rebuild the entity vocab.
        dftrain = pd.read_csv(path_train, quotechar="'")
        dftest = pd.read_csv(path_test, quotechar="'")
        source_cut_train = dftrain[[
            'target_id', 'source_id'
        ]].drop_duplicates(subset=['target_id', 'source_id'])
        source_cut_test = dftest[[
            'target_id', 'source_id'
        ]].drop_duplicates(subset=['target_id', 'source_id'])
        # newline="" avoids blank rows from csv.writer on Windows.
        ftrain_fre = open(
            path_train[:-4] + "_frequency" + str(frequency) + ".csv",
            "w",
            newline="")
        ftest_fre = open(
            path_test[:-4] + "_frequency" + str(frequency) + ".csv",
            "w",
            newline="")
        wtrain = csv.writer(ftrain_fre, quotechar="'")
        wtest = csv.writer(ftest_fre, quotechar="'")
        wtrain.writerow([
            "target_id", "left_citated_text", "right_citated_text", "source_id"
        ])
        wtest.writerow([
            "target_id", "left_citated_text", "right_citated_text", "source_id"
        ])
        source_train_keys = source_cut_train.source_id.value_counts().keys()
        source_test_keys = source_cut_test.source_id.value_counts().keys()
        dic1 = {}
        train_counts = source_cut_train.source_id.value_counts()
        test_counts = source_cut_test.source_id.value_counts()
        for key in source_train_keys:
            dic1[key] = train_counts[key]
        for key in source_test_keys:
            if key in dic1:
                dic1[key] += test_counts[key]
            else:
                dic1[key] = test_counts[key]
        frequencylist = []
        for key in dic1:
            if dic1[key] >= frequency:
                frequencylist.append(key)
        dftrain = dftrain.loc[dftrain["source_id"].isin(frequencylist)]
        dftest = dftest.loc[dftest["source_id"].isin(frequencylist)]
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftrain["target_id"], dftrain["left_citated_text"],
                dftrain["right_citated_text"], dftrain["source_id"]):
            wtrain.writerow(
                [target_id, left_citated_text, right_citated_text, source_id])
        ftrain_fre.close()
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftest["target_id"], dftest["left_citated_text"],
                dftest["right_citated_text"], dftest["source_id"]):
            wtest.writerow(
                [target_id, left_citated_text, right_citated_text, source_id])
        ftest_fre.close()
        entitylist = list(
            set(
                list(dftrain["source_id"].values) +
                list(dftrain["target_id"].values) +
                list(dftest["source_id"].values) +
                list(dftest["target_id"].values)))
        entitylist.sort()
        ent_vocab = {"UNKNOWN": 0, "MASK": 1}
        for i, entity in enumerate(entitylist):
            ent_vocab[entity] = i + 2
        return path_train[:-4] + "_frequency" + str(
            frequency) + ".csv", path_test[:-4] + "_frequency" + str(
                frequency) + ".csv", ent_vocab

    path = settings.citation_recommendation_dir
    path_train = os.path.join(path, "train.csv")
    path_test = os.path.join(path, "test.csv")
    ent_vocab = build_ent_vocab(path_train)
    path_train_frequency5, path_test_frequency5, ent_vocab_frequency5 = extract_by_frequency(
        path_train, path_test, args.frequency)
    # temporarily changed so that the MASK is applied at random positions
    datasetdict = {
        "tail": AASCDataSet,
        "random": AASCDataSetRANDOM,
        "both": AASCDataSetBOTH
    }
    cur_dataset = datasetdict[args.mask_type]
    if args.train_data == "full":
        dataset_train = cur_dataset(path_train,
                                    ent_vocab=ent_vocab,
                                    WINDOW_SIZE=args.WINDOW_SIZE,
                                    MAX_LEN=args.MAX_LEN,
                                    pretrained_model=args.pretrained_model,
                                    mode="train")
    else:
        dataset_train = cur_dataset(path_train_frequency5,
                                    ent_vocab=ent_vocab,
                                    WINDOW_SIZE=args.WINDOW_SIZE,
                                    MAX_LEN=args.MAX_LEN,
                                    pretrained_model=args.pretrained_model,
                                    mode="train")
    if args.test_data == "full":
        dataset_test = cur_dataset(path_test,
                                   ent_vocab=ent_vocab,
                                   WINDOW_SIZE=args.WINDOW_SIZE,
                                   MAX_LEN=args.MAX_LEN,
                                   pretrained_model=args.pretrained_model,
                                   mode="test")
    else:
        dataset_test = cur_dataset(path_test_frequency5,
                                   ent_vocab=ent_vocab,
                                   WINDOW_SIZE=args.WINDOW_SIZE,
                                   MAX_LEN=args.MAX_LEN,
                                   pretrained_model=args.pretrained_model,
                                   mode="test")
    print("----loading data done----")
    return dataset_train, dataset_test, ent_vocab
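
A minimal call sketch, assuming an argparse-style namespace carrying the fields the function reads; the values below are illustrative placeholders, not the project's defaults:

from argparse import Namespace
from torch.utils.data import DataLoader

args = Namespace(frequency=5, mask_type="tail", train_data="full",
                 test_data="full", WINDOW_SIZE=125, MAX_LEN=256,
                 pretrained_model="allenai/scibert_scivocab_uncased")
dataset_train, dataset_test, ent_vocab = load_AASC_graph_data(args)
# Assumes the AASCDataSet classes implement the torch Dataset protocol.
train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)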
Example #3
# Assumed context (not shown in this snippet): csv, os, pandas as pd, the
# project-local settings module, build_ent_vocab, makecitationmatrix_AASC and
# the AASCDataSet class.
def load_AASC_graph_data(args):
    def extract_by_frequency(path_train, path_test, frequency):
        # Keep only citations whose source paper occurs at least `frequency`
        # times across the (de-duplicated) train and test splits, write the
        # filtered splits to *_frequency<N>.csv, and rebuild the entity vocab.
        dftrain = pd.read_csv(path_train, quotechar="'")
        dftest = pd.read_csv(path_test, quotechar="'")
        source_cut_train = dftrain[[
            'target_id', 'source_id'
        ]].drop_duplicates(subset=['target_id', 'source_id'])
        source_cut_test = dftest[[
            'target_id', 'source_id'
        ]].drop_duplicates(subset=['target_id', 'source_id'])
        # newline="" avoids blank rows from csv.writer on Windows.
        ftrain_fre = open(
            path_train[:-4] + "_frequency" + str(frequency) + ".csv",
            "w",
            newline="")
        ftest_fre = open(
            path_test[:-4] + "_frequency" + str(frequency) + ".csv",
            "w",
            newline="")
        wtrain = csv.writer(ftrain_fre, quotechar="'")
        wtest = csv.writer(ftest_fre, quotechar="'")
        wtrain.writerow([
            "target_id", "left_citated_text", "right_citated_text", "source_id"
        ])
        wtest.writerow([
            "target_id", "left_citated_text", "right_citated_text", "source_id"
        ])
        source_train_keys = source_cut_train.source_id.value_counts().keys()
        source_test_keys = source_cut_test.source_id.value_counts().keys()
        dic1 = {}
        train_counts = source_cut_train.source_id.value_counts()
        test_counts = source_cut_test.source_id.value_counts()
        for key in source_train_keys:
            dic1[key] = train_counts[key]
        for key in source_test_keys:
            if key in dic1:
                dic1[key] += test_counts[key]
            else:
                dic1[key] = test_counts[key]
        frequencylist = []
        for key in dic1:
            if dic1[key] >= frequency:
                frequencylist.append(key)
        dftrain = dftrain.loc[dftrain["source_id"].isin(frequencylist)]
        dftest = dftest.loc[dftest["source_id"].isin(frequencylist)]
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftrain["target_id"], dftrain["left_citated_text"],
                dftrain["right_citated_text"], dftrain["source_id"]):
            wtrain.writerow(
                [target_id, left_citated_text, right_citated_text, source_id])
        ftrain_fre.close()
        for target_id, left_citated_text, right_citated_text, source_id in zip(
                dftest["target_id"], dftest["left_citated_text"],
                dftest["right_citated_text"], dftest["source_id"]):
            wtest.writerow(
                [target_id, left_citated_text, right_citated_text, source_id])
        ftest_fre.close()
        entitylist = list(
            set(
                list(dftrain["source_id"].values) +
                list(dftrain["target_id"].values) +
                list(dftest["source_id"].values) +
                list(dftest["target_id"].values)))
        entitylist.sort()
        entvocab = {"UNKNOWN": 0, "MASK": 1}
        for i, entity in enumerate(entitylist):
            entvocab[entity] = i + 2
        return path_train[:-4] + "_frequency" + str(
            frequency) + ".csv", path_test[:-4] + "_frequency" + str(
                frequency) + ".csv", entvocab

    path = settings.citation_recommendation_dir
    path_train = os.path.join(path, "train.csv")
    path_test = os.path.join(path, "test.csv")
    path_emb_train = os.path.join(path, "scibert_AASCtrain.npy")
    path_emb_test = os.path.join(path, "scibert_AASCtest.npy")
    entvocab = build_ent_vocab(path_train)
    matrix_train = makecitationmatrix_AASC(path_train, path_emb_train,
                                           entvocab)
    matrix_test = makecitationmatrix_AASC(path_test, path_emb_test, entvocab)
    path_train_frequency5, path_test_frequency5, entvocab_frequency5 = extract_by_frequency(
        path_train, path_test, args.frequency)
    if args.train_data == "full":
        dataset_train = AASCDataSet(path_train,
                                    ent_vocab=entvocab,
                                    MAX_LEN=args.MAX_LEN,
                                    matrix=matrix_train)
    else:
        dataset_train = AASCDataSet(path_train_frequency5,
                                    ent_vocab=entvocab,
                                    MAX_LEN=args.MAX_LEN,
                                    matrix=matrix_train)
    if args.test_data == "full":
        dataset_test = AASCDataSet(path_test,
                                   ent_vocab=entvocab,
                                   MAX_LEN=args.MAX_LEN,
                                   matrix=matrix_test,
                                   mode="test")
    else:
        dataset_test = AASCDataSet(path_test_frequency5,
                                   ent_vocab=entvocab,
                                   MAX_LEN=args.MAX_LEN,
                                   matrix=matrix_test,
                                   mode="test")
    #return dataset_train,dataset_test,entvocab
    return dataset_train, dataset_test, entvocab, matrix_train, matrix_test
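
The core of extract_by_frequency in both examples is the combined source-paper count built in dic1. A small self-contained sketch of the same idea with made-up data, slightly simplified by counting over the concatenated splits instead of summing two separate value_counts():

import pandas as pd

dftrain = pd.DataFrame({"target_id": ["t1", "t2", "t3"],
                        "source_id": ["s1", "s1", "s2"]})
dftest = pd.DataFrame({"target_id": ["t4"], "source_id": ["s1"]})

# Count each (target_id, source_id) edge once, then count citations per source.
counts = pd.concat([dftrain, dftest]).drop_duplicates(
    subset=["target_id", "source_id"]).source_id.value_counts()
frequent = counts[counts >= 2].index  # sources cited at least twice
print(dftrain[dftrain["source_id"].isin(frequent)])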