def semi_supervised(samples_path, write_path, beam_search):
    """Use each reference sentence to predict a pseudo source sentence.

    Reads '<sep>'-separated "source<sep>reference" lines, predicts a new
    source from the reference tokens, and appends "prediction <sep> ref"
    samples to write_path in batches of 100.

    Args:
        samples_path (str): The path of the reference samples file.
        write_path (str): The path of the new samples file.
        beam_search (bool): Whether to decode with beam search.
    """
    pred = Predict()
    print('vocab_size: ', len(pred.vocab))
    count = 0
    semi = []
    with open(samples_path, 'r') as f:
        for picked in f:
            count += 1
            # Only the reference half of the line is used for prediction.
            _, ref = picked.strip().split('<sep>')
            prediction = pred.predict(ref.split(), beam_search=beam_search)
            # Pair the predicted source with the original reference.
            semi.append(prediction + ' <sep> ' + ref)
            # Flush every 100 samples to bound memory usage.
            if count % 100 == 0:
                print(count)
                write_samples(semi, write_path, 'a')
                semi = []
    # BUG FIX: flush the trailing partial batch — previously lost whenever
    # the total sample count was not a multiple of 100.
    if semi:
        write_samples(semi, write_path, 'a')
def semi_supervised(samples_path, write_path, beam_search):
    """Use each reference sentence to predict a pseudo source sentence.

    Args:
        samples_path (str): The path of the reference samples file.
        write_path (str): The path of the new samples file.
        beam_search (bool): Whether to decode with beam search.
    """
    pred = Predict()
    print('vocab_size:', len(pred.vocab))
    count = 0
    semi = []
    with open(samples_path, 'r') as f:
        for picked in f:
            count += 1
            # Only the reference half of the line is used for prediction.
            _, ref = picked.strip().split('<sep>')
            prediction = pred.predict(ref.split(), beam_search=beam_search)
            # Join the prediction for ref with ref itself to form a new sample.
            semi.append(prediction + ' <sep> ' + ref)
            # Flush every 100 samples to bound memory usage.
            if count % 100 == 0:
                print(count)
                write_samples(semi, write_path, 'a')
                semi = []
    # BUG FIX: flush the trailing partial batch — previously lost whenever
    # the total sample count was not a multiple of 100.
    if semi:
        write_samples(semi, write_path, 'a')
def translate_continue(sample_path, translate_path):
    """Back-translate a samples file into a new file, resuming if interrupted.

    Each input line is "source<sep>reference"; both halves are back-translated,
    re-segmented with jieba, and appended to translate_path. Lines already
    present in translate_path are skipped so the job can be restarted.

    Args:
        sample_path (str): Original samples file path.
        translate_path (str): Target file path for back-translated samples.
    """
    # If the target file exists, count its lines so we can resume after them;
    # otherwise create an empty target file.
    if os.path.exists(translate_path):
        with open(translate_path, 'r', encoding='utf8') as file:
            exist_len = len(list(file))
    else:
        with open(translate_path, 'w', encoding='utf8'):
            pass
        exist_len = 0
    translated = []
    count = 0
    with open(sample_path, 'r', encoding='utf8') as file:
        for line in file:
            count += 1
            print(count)
            # Skip already-translated lines; 21585 is presumably a known-bad
            # sample — TODO confirm why it is excluded.
            if count <= exist_len or count == 21585:
                continue
            # Back-translate both the source and the reference.
            source, ref = line.strip().split('<sep>')
            source = back_translate(source.strip())
            if not source:
                # Translation failed (e.g. API error/rate limit): back off
                # briefly and skip this sample.
                time.sleep(1.5)
                continue
            ref = back_translate(ref.strip())
            if not ref:
                time.sleep(1.5)
                continue
            source = ' '.join(jieba.cut(source))
            ref = ' '.join(jieba.cut(ref))
            translated.append(source + ' <sep> ' + ref)
            # Persist back-translation results every 10 lines.
            if count % 10 == 0:
                print(count)
                write_samples(translated, translate_path, 'a')
                translated = []
            if count == 1010:
                break
    # BUG FIX: flush pending results — previously dropped both at the
    # count == 1010 break (the write there was commented out) and at EOF.
    if translated:
        write_samples(translated, translate_path, 'a')
def generate_samples(self, write_path):
    """Generate a new samples file by replacing words in each reference.

    Args:
        write_path (str): New samples file path (opened in append mode).
    """
    replaced = []
    count = 0
    for sample, token_list, doc in zip(self.samples, self.refs, self.corpus):
        count += 1
        # BUG FIX: append the new sample BEFORE the periodic flush. The
        # original flushed first, so every 100th sample was deferred to the
        # next batch and the final partial batch was silently dropped.
        replaced.append(
            sample.split('<sep>')[0] + ' <sep> ' + self.replace(token_list, doc)
        )
        if count % 100 == 0:
            print(count)
            write_samples(replaced, write_path, 'a')
            replaced = []
    # BUG FIX: flush the trailing partial batch — previously lost.
    if replaced:
        write_samples(replaced, write_path, 'a')
def translate_continue(sample_path, translate_path):
    """Back-translate a samples file into a new file, resuming if interrupted.

    Each input line is "source<sep>reference"; both halves are back-translated,
    re-segmented with jieba, and appended to translate_path. Lines already
    present in translate_path are skipped so the job can be restarted.

    Args:
        sample_path (str): Original samples file path (relative to curPath).
        translate_path (str): Target file path for back-translated samples.
    """
    # BUG FIX: encoding was misspelled 'urf-8', which raises LookupError
    # the first time this branch runs.
    if os.path.exists(translate_path):
        with open(translate_path, 'r', encoding='utf-8') as file:
            exist_len = len(list(file))
    else:
        exist_len = 0
    translated = []
    count = 0
    with open(curPath + sample_path, 'r', encoding='utf-8') as file:
        for line in file:
            count += 1
            print(count)
            # Skip already-translated lines; 21585 is presumably a known-bad
            # sample — TODO confirm why it is excluded.
            if count <= exist_len or count == 21585:
                continue
            source, ref = line.strip().split('<sep>')
            source = back_translate(source.strip())
            if not source:
                # BUG FIX: guard against a failed back-translation (falsy
                # result) instead of crashing inside jieba.cut; back off
                # briefly, mirroring the sibling implementation.
                time.sleep(1.5)
                continue
            ref = back_translate(ref.strip())
            if not ref:
                time.sleep(1.5)
                continue
            source = ' '.join(jieba.cut(source))
            ref = ' '.join(jieba.cut(ref))
            translated.append(source + ' <sep> ' + ref)
            # Persist back-translation results every 10 lines.
            if count % 10 == 0:
                print(count)
                write_samples(translated, translate_path, 'a')
                translated = []
            if count == 1000:
                break
    # BUG FIX: flush pending results — previously dropped at the
    # count == 1000 break and at EOF.
    if translated:
        write_samples(translated, translate_path, 'a')
def generate_samples(self, write_path):
    """Generate a new samples file by replacing words in each reference.

    Args:
        write_path (str): New samples file path (opened in append mode).
    """
    replaced = []
    count = 0
    for sample, token_list, doc in zip(self.samples, self.refs, self.corpus):
        # Keep the original source half, pair it with the word-replaced
        # reference to form a new sample.
        replaced.append(
            sample.split('<sep>')[0] + ' <sep> ' + self.replace(token_list, doc))
        count += 1
        # Flush every 100 samples to bound memory usage.
        if count % 100 == 0:
            print(count)
            write_samples(replaced, write_path, 'a')
            replaced = []
    # BUG FIX: flush the trailing partial batch — previously lost whenever
    # the total sample count was not a multiple of 100.
    if replaced:
        write_samples(replaced, write_path, 'a')
from data_utils import write_samples, partition

abs_path = pathlib.Path(__file__).parent.absolute()
# BUG FIX: sys.path.append returns None, so the original
# sys.path.append(sys.path.append(abs_path)) pushed None onto sys.path.
# Append the directory once, as a str (sys.path entries should be strings).
sys.path.append(str(abs_path))
curPath = os.path.abspath(os.path.dirname(__file__)) + '/'

samples = set()
# Read the product-description json file.
json_path = os.path.join(abs_path, '../files/服饰_50k.json')
with open(json_path, 'r', encoding='utf8') as file:
    jsf = json.load(file)

for jsobj in jsf.values():
    title = jsobj['title'] + ' '  # Get title.
    kb = dict(jsobj['kb']).items()  # Get attributes.
    kb_merged = ''
    for key, val in kb:
        kb_merged += key + ' ' + val + ' '  # Merge attributes.
    ocr = ' '.join(list(jieba.cut(jsobj['ocr'])))  # Get OCR text.
    texts = [title + ocr + kb_merged]  # Merge them.
    reference = ' '.join(list(jieba.cut(jsobj['reference'])))
    for text in texts:
        # Separate source and reference with the '<sep>' marker.
        sample = text + '<sep>' + reference
        samples.add(sample)

write_path = os.path.join(abs_path, '../files/samples.txt')
write_samples(samples, write_path)
partition(samples)
# NOTE(review): `train` is an open file handle created before this view;
# each line is expected to be 4 tab-separated fields, with the target at
# index 1, the text at index 2, and the stance label at index 3 — confirm
# against the opening code.
for line in train:
    # Skip malformed lines that do not have exactly 4 tab-separated fields.
    if len(line.split("\t")) != 4:
        continue
    line = line.replace('\n', '').replace('\r', '')
    # Word-segment the target and text fields with jieba.
    target = ' '.join(list(jieba.cut(line.split("\t")[1])))
    text = ' '.join(list(jieba.cut(line.split("\t")[2])))
    # Map the stance label to an integer: FAVOR -> 1, AGAINST -> -1, else 0.
    if line.split("\t")[3] == "FAVOR":
        stance = 1
    elif line.split("\t")[3] == "AGAINST":
        stance = -1
    else:
        stance = 0
    # stance = line.split("\t")[3]
    # One sample = text, target, and stance on three lines.
    t_sample = text + '\n' + target + '\n' + str(stance)
    train_samples.add(t_sample)

train_write_path = os.path.join(abs_path, 'train-3000-seg.txt')
write_samples(train_samples, train_write_path)

# process test file
with open(dev_file_path, 'r', encoding='utf8') as dev:
    for line in dev:
        if len(line.split("\t")) != 4:
            continue
        line = line.replace('\n', '').replace('\r', '')
        target = ' '.join(list(jieba.cut(line.split("\t")[1])))
        text = ' '.join(list(jieba.cut(line.split("\t")[2])))
        if line.split("\t")[3] == "FAVOR":
            stance = 1
        elif line.split("\t")[3] == "AGAINST":
            stance = -1
        else:
            stance = 0
        # stance = line.split("\t")[3]
        # NOTE(review): this loop body appears truncated here — the dev
        # sample is computed but never collected/written in the visible
        # span; the remainder presumably follows past this view.