Example #1
from pathlib import Path
import tempfile

import numpy as np

# create_directory_or_skip, get_dataset_dir, get_data_filepath, git_clone,
# prepare_turkcorpus_lower, write_lines_in_parallel, yield_lines_in_parallel,
# word_tokenize, normalize_quotes and replace_lrb_rrb are helper functions
# from the access repository.
def prepare_turkcorpus():
    dataset = 'turkcorpus'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        # Import here to avoid circular imports
        from access.feature_extraction import get_levenshtein_similarity
        prepare_turkcorpus_lower()
        url = 'https://github.com/cocoxu/simplification.git'
        output_dir = Path(tempfile.mkdtemp())
        git_clone(url, output_dir)
        print('Processing...')
        # Only rename files and put them in local directory architecture
        turkcorpus_truecased_dir = output_dir / 'data/turkcorpus/truecased'
        for (old_phase, new_phase) in [('test', 'test'), ('tune', 'valid')]:
            # (1) read the .tsv for which each line is tab separated:
            #     `idx, complex_sentence, *turk_sentences = line.split('\t')`
            # (2) replace lrb and rrb, tokenize
            # (3) Turk sentences are shuffled for each sample so need to realign them with turkcorpus lower
            tsv_filepath = turkcorpus_truecased_dir / f'{old_phase}.8turkers.organized.tsv'
            output_complex_filepath = get_data_filepath(
                dataset, new_phase, 'complex')
            output_ref_filepaths = [
                get_data_filepath(dataset, new_phase, 'simple.turk', i)
                for i in range(8)
            ]
            # These files will be used to reorder the shuffled ref sentences
            ordered_ref_filepaths = [
                get_data_filepath('turkcorpus_lower', new_phase, 'simple.turk',
                                  i) for i in range(8)
            ]
            with write_lines_in_parallel([output_complex_filepath] +
                                         output_ref_filepaths) as files:
                input_filepaths = [tsv_filepath] + ordered_ref_filepaths
                for tsv_line, *ordered_ref_sentences in yield_lines_in_parallel(
                        input_filepaths):
                    sample_id, complex_sentence, *shuffled_ref_sentences = [
                        word_tokenize(normalize_quotes(replace_lrb_rrb(s)))
                        for s in tsv_line.split('\t')
                    ]
                    reordered_sentences = []
                    for ordered_ref_sentence in ordered_ref_sentences:
                        # Find the position of the ref_sentence in the shuffled sentences
                        similarities = [
                            get_levenshtein_similarity(
                                ordered_ref_sentence.replace(' ', ''),
                                shuffled_ref_sentence.lower().replace(' ', ''))
                            for shuffled_ref_sentence in shuffled_ref_sentences
                        ]
                        idx = np.argmax(similarities)
                        # A few sentences have differing punctuation marks
                        assert similarities[idx] > 0.98, \
                            f'{ordered_ref_sentence} != {shuffled_ref_sentences[idx].lower()} {similarities[idx]:.2f}'
                        reordered_sentences.append(
                            shuffled_ref_sentences.pop(idx))
                    assert len(shuffled_ref_sentences) == 0
                    assert len(reordered_sentences) == 8
                    files.write([complex_sentence] + reordered_sentences)
    return dataset
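
The realignment loop above greedily matches each ordered reference against its most similar shuffled candidate and asserts that the best match is near-identical. A minimal self-contained sketch of the same idea, using difflib.SequenceMatcher as a stand-in for get_levenshtein_similarity:

from difflib import SequenceMatcher

def realign(ordered_refs, shuffled_refs):
    # Reorder shuffled_refs so that they line up with ordered_refs,
    # matching greedily on character-level similarity (spaces ignored).
    shuffled = list(shuffled_refs)
    reordered = []
    for ref in ordered_refs:
        scores = [SequenceMatcher(None,
                                  ref.lower().replace(' ', ''),
                                  cand.lower().replace(' ', '')).ratio()
                  for cand in shuffled]
        best = max(range(len(scores)), key=scores.__getitem__)
        reordered.append(shuffled.pop(best))
    return reordered

print(realign(['a small cat .', 'the dog runs .'],
              ['The dog runs .', 'A small cat .']))
# -> ['A small cat .', 'The dog runs .']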
Example #2
from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


def transform(input_text,
              LengthRatioPreprocessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    input_lines = input_text.split("\n")
    # Read from input
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in input_lines], source_filepath)

    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {
            'target_ratio': LengthRatioPreprocessor
        },
        'LevenshteinPreprocessor': {
            'target_ratio': LevenshteinPreprocessor
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': WordRankRatioPreprocessor
        },
        'SentencePiecePreprocessor': {
            'vocab_size': SentencePiecePreprocessor
        },
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=1)
    simplifier = get_preprocessed_simplifier(simplifier,
                                             preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()

    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
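
A quick usage sketch (the input sentence is made up; transform returns the simplified lines as a list):

text = "The incumbent administration promulgated a series of stringent regulations ."
for line in transform(text):
    print(line)

# Lower target ratios request shorter, more heavily rewritten output:
shorter = transform(text, LengthRatioPreprocessor=0.5, LevenshteinPreprocessor=0.5)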
Example #3

import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    # Print the simplified output (this is what yield_lines is imported for)
    for line in yield_lines(pred_filepath):
        print(line)
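
The preprocessors work by wrapping the simplifier: each one encodes the source (for example by prepending control tokens that carry the requested ratios) before the model runs, and decodes the prediction afterwards. A toy sketch of that wrapping idea, with made-up stand-ins rather than the library's actual classes:

from pathlib import Path

def echo_simplifier(source_filepath, pred_filepath):
    # Toy "model": copies its input unchanged.
    Path(pred_filepath).write_text(Path(source_filepath).read_text())

def wrap_with_ratio_token(simplifier, name, target_ratio):
    # Prepend a control token such as <LengthRatio_0.95> to every source
    # line, which is how ACCESS conditions the model on requested ratios.
    def wrapped(source_filepath, pred_filepath):
        encoded_filepath = str(source_filepath) + '.enc'
        lines = Path(source_filepath).read_text().splitlines()
        Path(encoded_filepath).write_text(
            '\n'.join(f'<{name}_{target_ratio}> {line}' for line in lines))
        simplifier(encoded_filepath, pred_filepath)
    return wrapped

simplifier = wrap_with_ratio_token(echo_simplifier, 'LengthRatio', 0.95)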
Example #4

import fileinput
import sys

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute

if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()],
                source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {
            'target_ratio': 0.25
        },
        'LevenshteinPreprocessor': {
            'target_ratio': 0.25
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.25
        },
        #   'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
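
Example #4 breaks off after the preprocessor settings (note the far more aggressive target_ratio of 0.25 and the disabled SentencePiecePreprocessor). Assuming it continues like Example #3, the remainder would look like this sketch; the beam size is an assumption, since the original value is not shown:

    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)  # beam size assumed
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    for line in yield_lines(pred_filepath):
        print(line)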