def load_pair_name(dir, pair_id_dir):
    """Collect the mention ids referenced by the train and test pair lists.

    Walks ``dir/<split>/<label>/<product file>``. Each product file holds one
    pair name per line; every pair is resolved to its mention ids through
    ``pair_mid[<product>][<label> + '.json'][<pair>]``, where ``<product>`` is
    the product file name up to its first ``'.'``.

    Args:
        dir: Root folder of the dataset splits. (The name shadows the builtin
            but is kept so keyword callers keep working.)
        pair_id_dir: Folder handed to ``dataset_function.read_folder`` to build
            the pair -> mention-id lookup.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of flat lists of mention
        ids. Entries of the split named ``'train_dataset'`` go to the first
        list; entries of every other split go to the second.

    Raises:
        KeyError: If a product, label or pair is missing from the lookup.
        OSError: If a product file cannot be opened.
    """
    # NOTE(review): read_folder/read_file live in the project module
    # dataset_function; read_file presumably lists the entry names of a
    # directory (e.g. ['test', 'train']) -- confirm against that module.
    pair_mid = dataset_function.read_folder(pair_id_dir)
    train_dataset = []
    test_dataset = []

    for split in dataset_function.read_file(dir):
        # Anything that is not the training split is treated as test data.
        target = train_dataset if split == 'train_dataset' else test_dataset
        split_dir = dir + '/' + split

        for label in dataset_function.read_file(split_dir):
            label_dir = split_dir + '/' + label
            label_key = label + '.json'

            for product_file in dataset_function.read_file(label_dir):
                product_key = product_file.split('.')[0]

                # Context manager guarantees the handle is closed even if a
                # later lookup raises.
                with open(label_dir + '/' + product_file, "r") as text_file:
                    # Drop the trailing newline of every line.
                    pairs = [line.split('\n')[0] for line in text_file]

                # Lookups stay inside the loop so an empty list file never
                # touches pair_mid.
                for pair in pairs:
                    target.extend(pair_mid[product_key][label_key][pair])

    return train_dataset, test_dataset
def neg_ids(path):
    """Return the first mention id of every negative train and test pair.

    For each product name obtained from ``dataset_function.read_file(path)``
    the function loads that product's ``negative.json`` pair lookup and the
    product's negative train/test pair lists (hard-coded locations under
    ``/projects/blstm``). The first whitespace-delimited token of each list
    line is the pair name; ``pair_id[pair][0]`` is collected for it.

    Args:
        path: Location handed to ``dataset_function.read_file`` to obtain the
            product names (presumably a directory listing -- confirm against
            that module).

    Returns:
        A ``(neg_train, neg_test)`` tuple of lists of mention ids.

    Raises:
        KeyError: If a listed pair is missing from the product's lookup.
        OSError: If one of the JSON or list files cannot be opened.
    """
    neg_train = []
    neg_test = []

    for product in dataset_function.read_file(path):
        path_pair = ('/projects/blstm/Jiaxin_Liu/data/mention_id/pair_mid'
                     + '/' + product + '/' + 'negative' + '.' + 'json')
        # Context manager closes the JSON file deterministically.
        with open(path_pair) as fp:
            pair_id = json.load(fp)

        path_neg_train = ('/projects/blstm/new_dataset/train_dataset/negative'
                          + '/' + product + '.' + 'txt')
        path_neg_test = ('/projects/blstm/new_dataset/test_dataset/negative'
                         + '/' + product + '.' + 'txt')

        # Same processing for both splits; train is handled before test.
        for list_path, target in ((path_neg_train, neg_train),
                                  (path_neg_test, neg_test)):
            pairs = []
            with open(list_path) as fp:
                for line in fp:
                    tokens = line.split()
                    # Blank lines carry no pair name; skip them instead of
                    # failing with IndexError.
                    if tokens:
                        pairs.append(tokens[0])

            for pair in pairs:
                target.append(pair_id[pair][0])

    return neg_train, neg_test  # mention ids
def generate_id_mention(path):
    """Merge every product's positive and negative JSON maps into one dict.

    For each product name obtained from ``dataset_function.read_file(path)``
    the files ``positive.json`` and ``negative.json`` under the hard-coded
    ``mid_sentence/<product>/`` folder are loaded and their entries copied
    into a single dictionary.

    Args:
        path: Location handed to ``dataset_function.read_file`` to obtain the
            product names (presumably a directory listing -- confirm against
            that module).

    Returns:
        A dict holding the entries of all loaded JSON objects. On duplicate
        keys, a product's negative entry overrides its positive one, and later
        products override earlier ones. Judging by the original variable
        names the keys are mention ids -- TODO confirm.

    Raises:
        OSError: If one of the JSON files cannot be opened.
    """
    mid_sentence_root = '/projects/blstm/Jiaxin_Liu/data/mention_id/mid_sentence/'
    id_mention_ori = {}

    for product in dataset_function.read_file(path):
        id_mention_pos = mid_sentence_root + product + '/' + 'positive' + '.' + 'json'
        id_mention_neg = mid_sentence_root + product + '/' + 'negative' + '.' + 'json'

        # Context managers close both files deterministically; both are
        # loaded before anything is merged, matching the original order.
        with open(id_mention_pos) as fp:
            data_pos = json.load(fp)
        with open(id_mention_neg) as fp:
            data_neg = json.load(fp)

        # Positive first, then negative, so negative wins on key collisions.
        id_mention_ori.update(data_pos)
        id_mention_ori.update(data_neg)

    return id_mention_ori
def pos_neg_ids(path):
    """Split the mention ids of the positive pair lists by their label flag.

    For each product name obtained from ``dataset_function.read_file(path)``
    the function loads ``<path>/<product>/positive.json`` and the product's
    positive train/test pair lists (hard-coded locations under
    ``/projects/blstm/new_dataset``). The first whitespace-delimited token of
    each list line is the pair name. Every entry ``e`` of ``data[pair]``
    contributes ``e[0]`` to the positive list when ``e[1][0] == 1`` and to the
    negative list otherwise.

    Args:
        path: Folder containing one sub-folder per product; also handed to
            ``dataset_function.read_file`` to obtain the product names.

    Returns:
        A ``(neg_train, pos_train, neg_test, pos_test)`` tuple of lists of
        mention ids.

    Raises:
        KeyError: If a listed pair is missing from the product's JSON data.
        OSError: If one of the JSON or list files cannot be opened.
    """
    neg_train = []
    pos_train = []
    neg_test = []
    pos_test = []

    for product in dataset_function.read_file(path):
        path2 = path + '/' + product + '/' + 'positive' + '.' + 'json'
        # Context manager closes the JSON file deterministically.
        with open(path2) as fp:
            data = json.load(fp)

        path3 = ('/projects/blstm/new_dataset/train_dataset/positive'
                 + '/' + product + '.' + 'txt')
        path4 = ('/projects/blstm/new_dataset/test_dataset/positive'
                 + '/' + product + '.' + 'txt')

        # Same processing for both splits; train is handled before test.
        for list_path, pos_bucket, neg_bucket in ((path3, pos_train, neg_train),
                                                  (path4, pos_test, neg_test)):
            pairs = []
            with open(list_path) as fp:
                for line in fp:
                    tokens = line.split()
                    # Blank lines carry no pair name; skip them instead of
                    # failing with IndexError.
                    if tokens:
                        pairs.append(tokens[0])

            for pair in pairs:
                for entry in data[pair]:
                    # entry[0] is the collected id; entry[1][0] == 1 marks it
                    # as positive, any other value as negative.
                    if entry[1][0] == 1:
                        pos_bucket.append(entry[0])
                    else:
                        neg_bucket.append(entry[0])

    return neg_train, pos_train, neg_test, pos_test  # mention ids