Example #1
0
def trans_text(text):
    """
    Translate ``text`` with ``dest='ja'`` and interleave it with the original.

    Both the input and the translated text are split on newlines and empty
    lines are dropped. The remaining lines are paired positionally; if the
    two sides have a different number of lines, the surplus lines of the
    longer side are left out (``zip`` semantics).

    :param text: the text to translate, possibly spanning several lines
    :return: a string in which every original line is followed by its
        translated counterpart, with a blank line after each pair
    """
    def non_empty_lines(block):
        # Keep only the lines that actually carry text.
        return [line for line in block.split('\n') if line != '']

    # `Translator` comes from outside this block (presumably googletrans).
    translated_text = Translator().translate(text, dest='ja').text
    pairs = zip(non_empty_lines(text), non_empty_lines(translated_text))
    return '\n'.join(f'{source}\n{target}\n' for source, target in pairs)
Example #2
0
def data_augmentation(source_folder="segments20191217"):
    """
    Paraphrase the stories of every scenario with back-translation.

    For each scenario name in ``scenario_s`` the file
    ``source_folder/scenario`` is read. It is expected to hold tab-separated
    ``event``/``segment`` lines, with each story terminated by a line
    containing ``<end_of_story>``. Every story is translated into each
    language in ``language_s`` and back (``dest='EN'``); the segments are
    joined with ``' # '`` beforehand so that the back-translated text can be
    split into segments again. Each original story is additionally kept
    twice. The collected stories are shuffled and written to
    ``target_folder/scenario`` in the same line format as the input.

    ``scenario_s``, ``language_s``, ``target_folder`` and ``Translator`` are
    not parameters; they are read from the enclosing scope.

    :param source_folder: folder that holds one input file per scenario
    :return: None, the result is written to the target files
    """
    # One client is enough for the whole run.
    translator = Translator()
    for scenario in scenario_s:
        print('\n' + scenario)
        paraphrased_stories = list()
        segment_buffer, event_buffer = list(), list()
        # The context manager closes both files even if a translation fails.
        with open(os.path.join(source_folder, scenario), 'r') as source_reader, \
                open(os.path.join(target_folder, scenario), 'w') as target_writer:
            for line in source_reader:
                if '<end_of_story>' not in line:
                    # Normal line. Drop the line break so it neither ends up
                    # in the text sent for translation nor doubles the
                    # newline when the segment is written out again.
                    event, segment = line.rstrip('\n').split('\t')
                    event_buffer.append(event)
                    segment_buffer.append(segment)
                    continue

                # End-of-story line.
                # note: target story is without <end_of_story>
                source_story = ' # '.join(segment_buffer)
                for language in language_s:
                    target_text = translator.translate(text=source_story, dest=language).text
                    back_translated_text = translator.translate(text=target_text, dest='EN').text
                    target_story = [piece.strip() for piece in back_translated_text.split('#')]
                    if len(target_story) != len(event_buffer):
                        # The translation gained or lost a '#' separator, so
                        # segments can no longer be matched to their events.
                        print('+', end='')
                        continue
                    paraphrased_stories.append((target_story, event_buffer))
                # Keep the original story twice. This must happen before the
                # buffers are reset and must store the segment list, not the
                # joined string.
                for _ in range(2):
                    paraphrased_stories.append((segment_buffer, event_buffer))
                # Rebind (do not clear) so the stored lists stay intact.
                segment_buffer, event_buffer = list(), list()
                print('.', end='')

            random.shuffle(paraphrased_stories)
            for story, agenda in paraphrased_stories:
                for event, segment in zip(agenda, story):
                    target_writer.write('{}\t{}\n'.format(event, segment))
                target_writer.write('<end_of_story>\n')