def trans_text(text, dest='ja', translator=None):
    """
    Translate ``text`` and interleave every source line with its translation.

    Empty lines are dropped from both the source and the translation, then the
    remaining lines are paired up in order. If the two sides end up with a
    different number of lines, the surplus lines of the longer side are
    silently dropped (``zip`` semantics).

    :param text: newline-separated text to translate
    :param dest: destination language code handed to the translator
    :param translator: object exposing ``translate(text, dest=...)`` whose
        result has a ``.text`` attribute; a new ``Translator()`` is created
        when omitted
    :return: one ``"<source line>\\n<translated line>\\n"`` chunk per line
        pair, with the chunks joined by ``"\\n"``
    """
    if translator is None:
        translator = Translator()
    raw_trans = translator.translate(text, dest=dest).text

    original = [sentence for sentence in text.split('\n') if sentence]
    translated = [sentence for sentence in raw_trans.split('\n') if sentence]
    return '\n'.join(f'{t1}\n{t2}\n' for t1, t2 in zip(original, translated))
def _back_translate(segments, language):
    """
    Paraphrase the segments of one story by translating them to ``language``
    and back to English.

    The segments are joined with ``' # '`` so the whole story is sent in a
    single request, and the result is split on ``'#'`` again afterwards.

    :param segments: list of segment strings belonging to one story
    :param language: language code used for the intermediate translation
    :return: list of back-translated segments, stripped of surrounding
        whitespace; its length may differ from ``len(segments)`` if the
        translator altered the ``#`` separators
    """
    source_story = ' # '.join(segments)
    # A fresh Translator is created for every request, exactly as before.
    # NOTE(review): presumably deliberate (the earlier shared-instance variant
    # was disabled in the original code) -- confirm before hoisting it.
    target_text = Translator().translate(text=source_story, dest=language).text
    back_translated_text = Translator().translate(text=target_text, dest='EN').text
    return [segment.strip() for segment in back_translated_text.split('#')]


def data_augmentation(source_folder="segments20191217"):
    """
    Paraphrase the stories of every scenario with back-translation.

    For each name in the module-level ``scenario_s`` the file of that name in
    ``source_folder`` is read. Every non-terminator line has the form
    ``event<TAB>segment`` and a line containing ``<end_of_story>`` closes a
    story. Each story is back-translated once per entry of the module-level
    ``language_s`` and, in addition, the untouched story is kept twice. The
    collected stories are shuffled and written, in the same line format, to a
    file of the same name in the module-level ``target_folder``.

    Lines after the last ``<end_of_story>`` marker are ignored.

    :param source_folder: folder holding one input file per scenario
    :return: None
    """
    for scenario in scenario_s:
        print('\n' + scenario)
        source_path = os.path.join(source_folder, scenario)
        # Resolved up front so a bad/missing target_folder fails before any
        # translation request is made.
        target_path = os.path.join(target_folder, scenario)

        paraphrased_stories = []
        segment_buffer, event_buffer = [], []
        with open(source_path, 'r') as source_reader:
            for line in source_reader:
                if '<end_of_story>' not in line:
                    # Normal line: drop the line terminator so it does not end
                    # up inside the translated text or doubled in the output.
                    line = line.rstrip('\n')
                    if not line.strip():
                        continue
                    event, segment = line.split('\t', 1)
                    event_buffer.append(event)
                    segment_buffer.append(segment)
                    continue

                # End of story: paraphrase it once per language.
                for language in language_s:
                    target_story = _back_translate(segment_buffer, language)
                    if len(target_story) != len(event_buffer):
                        # The separators did not survive translation, so the
                        # segments can no longer be aligned with the events.
                        print('+', end='')
                        continue
                    paraphrased_stories.append((target_story, event_buffer))

                # Keep the original story twice. This must happen before the
                # buffers are replaced, otherwise the entries would point at
                # the (empty) buffers of the next story.
                for _ in range(2):
                    paraphrased_stories.append((segment_buffer, event_buffer))

                segment_buffer, event_buffer = [], []
                print('.', end='')

        random.shuffle(paraphrased_stories)

        with open(target_path, 'w') as target_writer:
            for story, agenda in paraphrased_stories:
                for event, segment in zip(agenda, story):
                    target_writer.write('{}\t{}\n'.format(event, segment))
                target_writer.write('<end_of_story>\n')