def reproducefile(filename):
    """Collect the unique short sentences of a tab-separated file.

    Every line of ``filename`` must split on tabs into exactly three fields.
    The first field has to contain a ``'.'`` or be ``'NaN'`` once stripped;
    the second and third fields are passed through ``utils.clarify``,
    lower-cased and stripped.  A sentence is kept when it has between 2 and
    14 whitespace-separated tokens and has not been seen before.  The kept
    sentences are written to ``data/twitter.txt``, one per line, in
    first-seen order.

    Args:
        filename: path of the tab-separated input file.

    Raises:
        AssertionError: if a line does not have the expected layout (these
            checks are plain ``assert`` statements, so they vanish under
            ``python -O``).
    """
    trainpath = 'data/twitter.txt'
    seen = set()
    kept = []
    # The output is opened before the input is read, so the target file is
    # created/truncated even when reading fails; the ``with`` guarantees the
    # handle is closed on every exit path.
    with open(trainpath, 'w') as train_object:
        with open(filename) as f:
            for line in f:
                text = line.split('\t')
                assert '.' in text[0] or text[0].strip() == 'NaN'
                assert len(text) == 3
                for field in (text[1], text[2]):
                    # NOTE(review): utils.clarify is defined outside this
                    # file; the strip() assumes surrounding whitespace is
                    # not significant.  It also removes the line terminator
                    # that the last field carries, so identical sentences
                    # from either column de-duplicate against each other.
                    sentence = utils.clarify(field).lower().strip()
                    if sentence in seen:
                        continue
                    if 1 < len(sentence.split()) < 15:
                        seen.add(sentence)
                        kept.append(sentence)
        # Written only after the whole input was validated and read.
        train_object.writelines(sentence + '\n' for sentence in kept)
def reproducetest(filename):
    """Build aligned test/reference sentence files from a tab-separated file.

    Only lines that split on tabs into exactly four fields are used.  The
    third field is read as a label whose characters at index 1 and index 3
    are single digits; the line is skipped unless the first digit is greater
    than half of the second.  Fields one and two are normalised with
    ``utils.clarify``, lower-cased and stripped of surrounding whitespace
    and dots.  If the first sentence has more than 15 tokens the two
    sentences swap roles; if the (new) first sentence is still longer than
    15 tokens the pair is dropped.

    The first sentence of every kept pair goes to
    ``data/twitterdata/twitter-test.txt`` and the second one to the same
    line of ``data/twitterdata/twitter-refer.txt``.

    Args:
        filename: path of the tab-separated input file.

    Raises:
        ValueError: if a label character cannot be parsed as an integer.
        IndexError: if the label field has fewer than four characters.
    """
    trainpath = 'data/twitterdata/twitter-test.txt'
    trainrefer = 'data/twitterdata/twitter-refer.txt'
    pairs = []
    # Both outputs are opened (and truncated) before the input is read; the
    # ``with`` closes them even if parsing a line raises.
    with open(trainpath, 'w') as train_object, \
            open(trainrefer, 'w') as train_object_refer:
        with open(filename) as f:
            for line in f:
                text = line.split('\t')
                if len(text) != 4:
                    continue
                # NOTE(review): presumably a label such as "(3,2)" - confirm
                # against the data; only single-digit values can be read.
                tup = text[2]
                if int(tup[1]) <= int(tup[3]) / 2:
                    continue
                s = utils.clarify(text[0]).lower().strip().strip('.')
                s1 = utils.clarify(text[1]).lower().strip().strip('.')
                if len(s.split()) > 15:
                    # Prefer the other sentence as the test input.
                    s, s1 = s1, s
                if len(s.split()) > 15:
                    continue
                pairs.append((s, s1))
        # Nothing is written until the whole input has been read.
        for test_sentence, refer_sentence in pairs:
            train_object.write(test_sentence + '\n')
            train_object_refer.write(refer_sentence + '\n')
def generate_twitter_test(filename):
    """Build a test file with all references grouped per test sentence.

    Only lines that split on tabs into exactly four fields are used.  The
    third field is read as a label whose characters at index 1 and index 3
    are single digits; the line is skipped unless the first digit is greater
    than half of the second.  Fields one and two are normalised with
    ``utils.clarify``, lower-cased and stripped of surrounding whitespace
    and dots.  The first sentence is truncated to its first 15 tokens and
    used as the grouping key; every second sentence seen for that key is
    collected as one of its references.

    Each key is written to ``data/twitterdata/twitter-test.txt`` and, on the
    matching line of ``data/twitterdata/twitter-refer.txt``, its references
    joined with ``' # '``.

    Args:
        filename: path of the tab-separated input file.

    Raises:
        ValueError: if a label character cannot be parsed as an integer.
        IndexError: if the label field has fewer than four characters.
    """
    trainpath = 'data/twitterdata/twitter-test.txt'
    trainrefer = 'data/twitterdata/twitter-refer.txt'
    test2refer = {}
    # Both outputs are opened (and truncated) before the input is read; the
    # ``with`` closes them even if parsing a line raises.
    with open(trainpath, 'w') as train_object, \
            open(trainrefer, 'w') as train_object_refer:
        with open(filename) as f:
            for line in f:
                text = line.split('\t')
                if len(text) != 4:
                    continue
                # NOTE(review): presumably a label such as "(3,2)" - confirm
                # against the data; only single-digit values can be read.
                tup = text[2]
                if int(tup[1]) <= int(tup[3]) / 2:
                    continue
                s = utils.clarify(text[0]).lower().strip().strip('.')
                s1 = utils.clarify(text[1]).lower().strip().strip('.')
                # '#' is the separator between references in the output, so
                # it must not appear inside a reference.
                s1 = s1.replace('#', ' ')
                s = ' '.join(s.split()[:15])
                # Always keep a list per key: storing the first reference as
                # a bare string made the join below split it into characters
                # and made a second reference fail on ``str.append``.
                test2refer.setdefault(s, []).append(s1)
        for test_sentence, references in test2refer.items():
            train_object.write(test_sentence + '\n')
            train_object_refer.write(' # '.join(references) + '\n')
import csv
import random

import utils

# Split the question-pair CSV into a training corpus and a test/reference
# pair of files.
#
# Columns 3 and 4 of every row hold the two questions and column 5 holds the
# label.  A row becomes a test item (first question -> test file, second
# question -> same line of the reference file) when its label is '1', fewer
# than 30000 test items exist so far, the first question has fewer than 15
# tokens and a coin flip succeeds.  Every other row contributes both of its
# questions, one per line, to the training file.
file_name = 'data/quora/train.csv'
trainpath = 'data/train.txt'
testpath = 'data/test.txt'
referpath = 'data/refer.txt'
testnum = 0
# All three outputs are closed by the ``with`` block, including on errors.
with open(trainpath, 'w') as train_object, \
        open(testpath, 'w') as test_object, \
        open(referpath, 'w') as refer_object:
    with open(file_name) as f:
        csv_reader = csv.reader(f, delimiter=',')
        title = True
        for row in csv_reader:
            if title:
                title = False
                # NOTE(review): assumes data rows carry '0'/'1' in column 5;
                # a first row with any other value is treated as the column
                # header and kept out of the training data.
                if row[5] not in ('0', '1'):
                    continue
            ques1 = utils.clarify(row[3]).lower()
            ques2 = utils.clarify(row[4]).lower()
            # The order of the conditions matters: random.random() is only
            # drawn for rows that already satisfy the other three checks.
            if row[5] == '1' and testnum < 30000 and len(
                    ques1.split()) < 15 and random.random() > 0.5:
                test_object.write(ques1)
                test_object.write('\n')
                refer_object.write(ques2)
                refer_object.write('\n')
                testnum += 1
            else:
                train_object.write(ques1)
                train_object.write('\n')
                train_object.write(ques2)
                train_object.write('\n')