def split_file(input_filepath, output_filepaths, round_robin=False):
    if not round_robin:
        raise NotImplementedError('Splitting files is only implemented as round robin.')
    with open_files(output_filepaths, 'w') as files:
        # We write each line to a different file in a round robin fashion
        for i, line in enumerate(yield_lines(input_filepath)):
            files[i % len(output_filepaths)].write(line + '\n')
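# Usage sketch (hypothetical filenames; open_files and yield_lines are helpers
# from access.utils.helpers). With three outputs, lines 0, 3, 6, ... go to the
# first file, lines 1, 4, 7, ... to the second, and so on:
#
#     split_file('corpus.complex', ['shard0.txt', 'shard1.txt', 'shard2.txt'],
#                round_robin=True)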
import re
from collections import defaultdict


def parse_all_hypotheses(out_filepath):
    hypotheses_dict = defaultdict(list)
    for line in yield_lines(out_filepath):
        match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
        if match:
            sample_id, hypothesis = match.groups()
            hypotheses_dict[int(sample_id)].append(hypothesis)
    # Sort back into the original sample order
    return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]
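# The regex above matches fairseq-generate hypothesis lines of the form
# "H-<sample_id>\t<score>\t<text>". A sketch with made-up scores and sentences:
#
#     H-0	-0.23	the cat sat on the mat .
#     H-1	-0.41	it was raining .
#
# Given a file with those two lines, parse_all_hypotheses(out_filepath) would
# return [['the cat sat on the mat .'], ['it was raining .']].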
import numpy as np  # np.inf serves as the "no limit" default below


def get_word2rank(vocab_size=np.inf):
    # prepare_fasttext_embeddings() and FASTTEXT_EMBEDDINGS_PATH are provided by
    # the access.resources package (they download and point to fastText vectors)
    prepare_fasttext_embeddings()
    # TODO: Decrease vocab size or load from smaller file
    word2rank = {}
    line_generator = yield_lines(FASTTEXT_EMBEDDINGS_PATH)
    next(line_generator)  # Skip the first line (header)
    for i, line in enumerate(line_generator):
        if (i + 1) > vocab_size:
            break
        word = line.split(' ')[0]
        word2rank[word] = i
    return word2rank
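# Usage sketch (the first call downloads the fastText embeddings, which can be
# slow; fastText vectors are sorted by frequency, so the 0-based rank is small
# for common words):
#
#     word2rank = get_word2rank(vocab_size=100000)
#     rank = word2rank.get('the', len(word2rank))  # unknown words fall back to the vocab size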
def transform(input_text,
              length_ratio=0.95,
              levenshtein_ratio=0.75,
              word_rank_ratio=0.75,
              sentencepiece_vocab_size=10000):
    # Read from input
    input_lines = input_text.split('\n')
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in input_lines], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': length_ratio},
        'LevenshteinPreprocessor': {'target_ratio': levenshtein_ratio},
        'WordRankRatioPreprocessor': {'target_ratio': word_rank_ratio},
        'SentencePiecePreprocessor': {'vocab_size': sentencepiece_vocab_size},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=1)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
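# Usage sketch (prepare_models() downloads the pretrained ACCESS model on first
# use; the exact output wording depends on the model, so this is illustrative):
#
#     simplified_lines = transform('The incident resulted in the hospitalization of several people.')
#     print(simplified_lines[0])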
import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute

if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    for line in yield_lines(pred_filepath):
        print(line)
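# Example invocation (illustrative; the script expects one complex sentence per
# line on stdin and prints one simplified sentence per line):
#
#     $ echo "The incident resulted in the hospitalization of several people." | python generate.py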