コード例 #1
0
def load_pair_name(dir, pair_id_dir):
    """Collect the mention ids of all train and test pairs found under *dir*.

    Every entry that ``dataset_function.read_file(dir)`` returns is treated
    as a split directory.  The entry named exactly ``'train_dataset'`` feeds
    the training list; every other entry feeds the test list.

    Args:
        dir: Root directory holding the split directories.  (The name
            shadows the builtin, but is kept for caller compatibility.)
        pair_id_dir: Passed to ``dataset_function.read_folder``.  The result
            is indexed as ``[product][label + '.json'][pair]`` and the value
            found there is iterated to obtain mention ids.

    Returns:
        A tuple ``(train_dataset, test_dataset)`` of mention-id lists.

    Raises:
        KeyError: If a product, label or pair read from disk is missing
            from the mapping returned by ``read_folder``.
    """
    pair_mid = dataset_function.read_folder(pair_id_dir)
    train_dataset = []
    test_dataset = []
    # NOTE(review): read_file presumably lists directory entries - confirm.
    for split in dataset_function.read_file(dir):
        target = train_dataset if split == 'train_dataset' else test_dataset
        target.extend(_mention_ids_for_split(dir + '/' + split, pair_mid))
    return train_dataset, test_dataset


def _mention_ids_for_split(split_dir, pair_mid):
    """Return the mention ids of every pair listed below one split directory."""
    mention_ids = []
    for label in dataset_function.read_file(split_dir):
        label_dir = split_dir + '/' + label
        for product_file in dataset_function.read_file(label_dir):
            # Context manager so the handle is closed even if a lookup fails.
            with open(label_dir + '/' + product_file, "r") as text_file:
                pairs = [line.rstrip('\n') for line in text_file]
            if not pairs:
                # Nothing to look up; skipping keeps an empty file from
                # triggering a KeyError on the hoisted lookups below.
                continue
            product = product_file.split('.')[0]
            pair_to_ids = pair_mid[product][label + '.json']
            for pair in pairs:
                mention_ids.extend(pair_to_ids[pair])
    return mention_ids
def neg_ids(path,
            pair_mid_dir='/projects/blstm/Jiaxin_Liu/data/mention_id/pair_mid',
            dataset_dir='/projects/blstm/new_dataset'):
    """Return the first mention id of every negative train/test pair.

    For each product name returned by ``dataset_function.read_file(path)``
    the function loads ``<pair_mid_dir>/<product>/negative.json`` and the
    pair lists ``<dataset_dir>/train_dataset/negative/<product>.txt`` and
    ``<dataset_dir>/test_dataset/negative/<product>.txt``.  The first
    whitespace-separated token of each line is used as a key into the JSON
    mapping, and element ``[0]`` of the value found there is collected.

    Args:
        path: Passed to ``dataset_function.read_file`` to get product names.
        pair_mid_dir: Root of the per-product pair JSON files.
        dataset_dir: Root that contains ``train_dataset`` and
            ``test_dataset``.

    Returns:
        A tuple ``(neg_train, neg_test)`` of mention-id lists.

    Raises:
        IndexError: If a pair file contains a blank line.
        KeyError: If a pair is missing from the JSON mapping.
    """
    neg_train = []
    neg_test = []
    for product in dataset_function.read_file(path):
        with open(pair_mid_dir + '/' + product + '/negative.json') as fp:
            pair_id = json.load(fp)
        # Train is processed before test, as in the original ordering.
        for split, collected in (('train_dataset', neg_train),
                                 ('test_dataset', neg_test)):
            pairs_file = (dataset_dir + '/' + split + '/negative/'
                          + product + '.txt')
            with open(pairs_file) as fp:
                pairs = [line.split()[0] for line in fp]
            collected.extend(pair_id[pair][0] for pair in pairs)
    return neg_train, neg_test  # mention ids
def generate_id_mention(
        path,
        mid_sentence_dir='/projects/blstm/Jiaxin_Liu/data/mention_id/mid_sentence'):
    """Merge the positive and negative JSON mappings of every product.

    For each product name returned by ``dataset_function.read_file(path)``
    the files ``<mid_sentence_dir>/<product>/positive.json`` and
    ``.../negative.json`` are loaded and merged into one dictionary.  On a
    key collision the later entry wins: negative over positive within a
    product, and later products over earlier ones.

    Args:
        path: Passed to ``dataset_function.read_file`` to get product names.
        mid_sentence_dir: Root of the per-product JSON files.

    Returns:
        A dict containing every key/value pair of all loaded files.
    """
    id_mention_ori = {}
    for product in dataset_function.read_file(path):
        product_dir = mid_sentence_dir + '/' + product
        # Load both files before merging, closing each handle promptly.
        with open(product_dir + '/positive.json') as fp:
            data_pos = json.load(fp)
        with open(product_dir + '/negative.json') as fp:
            data_neg = json.load(fp)
        id_mention_ori.update(data_pos)
        id_mention_ori.update(data_neg)
    return id_mention_ori
def pos_neg_ids(path, dataset_dir='/projects/blstm/new_dataset'):
    """Split the mention ids of the positive pair files by their label.

    For each product name returned by ``dataset_function.read_file(path)``
    the mapping in ``<path>/<product>/positive.json`` is loaded.  The pairs
    listed in ``<dataset_dir>/train_dataset/positive/<product>.txt`` and
    ``<dataset_dir>/test_dataset/positive/<product>.txt`` are then looked up
    in it.  Each entry ``k`` of a pair's value is routed by ``k[1][0]``: a
    value of ``1`` sends ``k[0]`` to the positive list, anything else sends
    it to the negative list.

    Args:
        path: Directory with one sub-directory per product.  It is also
            passed to ``dataset_function.read_file`` to get product names.
        dataset_dir: Root that contains ``train_dataset`` and
            ``test_dataset``.

    Returns:
        A tuple ``(neg_train, pos_train, neg_test, pos_test)`` of
        mention-id lists.

    Raises:
        IndexError: If a pair file contains a blank line.
        KeyError: If a pair is missing from the JSON mapping.
    """
    neg_train = []
    pos_train = []
    neg_test = []
    pos_test = []
    for product in dataset_function.read_file(path):
        with open(path + '/' + product + '/positive.json') as fp:
            data = json.load(fp)
        train_file = dataset_dir + '/train_dataset/positive/' + product + '.txt'
        test_file = dataset_dir + '/test_dataset/positive/' + product + '.txt'
        _append_labeled_ids(train_file, data, pos_train, neg_train)
        _append_labeled_ids(test_file, data, pos_test, neg_test)
    return neg_train, pos_train, neg_test, pos_test  # mention ids


def _append_labeled_ids(pairs_file, pair_to_mentions, pos_out, neg_out):
    """Append the mention ids of the pairs in *pairs_file* to the list for their label."""
    with open(pairs_file) as fp:
        # The first whitespace-separated token of each line is the pair key.
        pairs = [line.split()[0] for line in fp]
    for pair in pairs:
        for mention in pair_to_mentions[pair]:
            if mention[1][0] == 1:
                pos_out.append(mention[0])
            else:
                neg_out.append(mention[0])