Example #1
def split_file(input_filepath, output_filepaths, round_robin=False):
    if not round_robin:
        raise NotImplementedError(
            'Splitting files is only implemented as round robin.')
    with open_files(output_filepaths, 'w') as files:
        # We write each line to a different file in a round robin fashion
        for i, line in enumerate(yield_lines(input_filepath)):
            files[i % len(output_filepaths)].write(line + '\n')
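A minimal usage sketch (the file names are hypothetical; split_file relies on the same helpers module that provides open_files and yield_lines):

# Split a large corpus into 4 shards, writing one line to each file in turn (round robin).
shard_filepaths = ['corpus.shard-%d.txt' % i for i in range(4)]
split_file('corpus.txt', shard_filepaths, round_robin=True)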
Example #2
def parse_all_hypotheses(out_filepath):
    hypotheses_dict = defaultdict(list)
    for line in yield_lines(out_filepath):
        match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
        if match:
            sample_id, hypothesis = match.groups()
            hypotheses_dict[int(sample_id)].append(hypothesis)
    # Sort in original order
    return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]
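This parses fairseq-generate style output, where each hypothesis line has the form 'H-<sample_id>\t<score>\t<text>'. A minimal usage sketch (the output file name is hypothetical):

# hypotheses[i] holds all beam hypotheses produced for the i-th source sentence.
hypotheses = parse_all_hypotheses('fairseq_generate.out')
for sample_id, sample_hypotheses in enumerate(hypotheses):
    print(sample_id, sample_hypotheses[0])  # hypotheses appear in the order fairseq wrote them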
Example #3
def get_word2rank(vocab_size=np.inf):
    prepare_fasttext_embeddings()
    # TODO: Decrease vocab size or load from smaller file
    word2rank = {}
    line_generator = yield_lines(FASTTEXT_EMBEDDINGS_PATH)
    next(line_generator)  # Skip the first line (header)
    for i, line in enumerate(line_generator):
        if (i + 1) > vocab_size:
            break
        word = line.split(' ')[0]
        word2rank[word] = i
    return word2rank
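A minimal usage sketch (ranks follow the word order of the fastText embeddings file, which lists words by decreasing frequency):

word2rank = get_word2rank(vocab_size=50000)
# A lower rank means a more frequent word; words outside the vocabulary are simply absent.
print(word2rank.get('the'), word2rank.get('ubiquitous'))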
Example #4
def transform(input_text,
              LengthRatioProcessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    input_lines = input_text.split("\n")
    # Tokenize the input text and write it to a temporary source file
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in input_lines], source_filepath)

    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {
            'target_ratio': LengthRatioProcessor
        },
        'LevenshteinPreprocessor': {
            'target_ratio': LevenshteinPreprocessor
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': WordRankRatioPreprocessor
        },
        'SentencePiecePreprocessor': {
            'vocab_size': SentencePiecePreprocessor
        },
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=1)
    simplifier = get_preprocessed_simplifier(simplifier,
                                             preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()

    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
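A minimal usage sketch of transform; the default keyword values match the recommended preprocessor settings used in Example #5, and the input sentence is only illustrative:

simplified_lines = transform('The incumbent was defeated in the subsequent election.')
print('\n'.join(simplified_lines))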
Example #5
import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    for line in yield_lines(pred_filepath):
        print(line)
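A minimal sketch of driving the script programmatically instead of through shell redirection; the file name generate.py is taken from the usage comment above and the input sentence is illustrative:

import subprocess

complex_text = 'The incumbent was defeated in the subsequent election.\n'
result = subprocess.run(['python', 'generate.py'],
                        input=complex_text, capture_output=True, text=True)
print(result.stdout)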