def prepare_turkcorpus():
    dataset = 'turkcorpus'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        # Import here to avoid circular imports
        from access.feature_extraction import get_levenshtein_similarity
        prepare_turkcorpus_lower()
        url = 'https://github.com/cocoxu/simplification.git'
        output_dir = Path(tempfile.mkdtemp())
        git_clone(url, output_dir)
        print('Processing...')
        # Only rename files and put them in local directory architecture
        turkcorpus_truecased_dir = output_dir / 'data/turkcorpus/truecased'
        for (old_phase, new_phase) in [('test', 'test'), ('tune', 'valid')]:
            # (1) read the .tsv for which each line is tab separated:
            #     `idx, complex_sentence, *turk_sentences = line.split('\t')`
            # (2) replace lrb and rrb, tokenize
            # (3) Turk sentences are shuffled for each sample so need to realign them with turkcorpus lower
            tsv_filepath = turkcorpus_truecased_dir / f'{old_phase}.8turkers.organized.tsv'
            output_complex_filepath = get_data_filepath(dataset, new_phase, 'complex')
            output_ref_filepaths = [get_data_filepath(dataset, new_phase, 'simple.turk', i) for i in range(8)]
            # These files will be used to reorder the shuffled ref sentences
            ordered_ref_filepaths = [
                get_data_filepath('turkcorpus_lower', new_phase, 'simple.turk', i) for i in range(8)
            ]
            with write_lines_in_parallel([output_complex_filepath] + output_ref_filepaths) as files:
                input_filepaths = [tsv_filepath] + ordered_ref_filepaths
                for tsv_line, *ordered_ref_sentences in yield_lines_in_parallel(input_filepaths):
                    sample_id, complex_sentence, *shuffled_ref_sentences = [
                        word_tokenize(normalize_quotes(replace_lrb_rrb(s))) for s in tsv_line.split('\t')
                    ]
                    reordered_sentences = []
                    for ordered_ref_sentence in ordered_ref_sentences:
                        # Find the position of the ref_sentence in the shuffled sentences
                        similarities = [
                            get_levenshtein_similarity(ordered_ref_sentence.replace(' ', ''),
                                                       shuffled_ref_sentence.lower().replace(' ', ''))
                            for shuffled_ref_sentence in shuffled_ref_sentences
                        ]
                        idx = np.argmax(similarities)
                        # A few sentences have differing punctuation marks
                        assert similarities[idx] > 0.98, \
                            f'{ordered_ref_sentence} != {shuffled_ref_sentences[idx].lower()} {similarities[idx]:.2f}'
                        reordered_sentences.append(shuffled_ref_sentences.pop(idx))
                    assert len(shuffled_ref_sentences) == 0
                    assert len(reordered_sentences) == 8
                    files.write([complex_sentence] + reordered_sentences)
    return dataset
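# A minimal standalone sketch of the realignment step above (illustration only,
# not part of the original script): for each ordered reference sentence, pick the
# shuffled Turk reference with the highest character-level similarity. difflib's
# SequenceMatcher stands in here for
# access.feature_extraction.get_levenshtein_similarity.
from difflib import SequenceMatcher


def char_similarity(a, b):
    # Compare with whitespace removed, mirroring the .replace(' ', '') calls above
    return SequenceMatcher(None, a.replace(' ', ''), b.replace(' ', '')).ratio()


ordered_ref = 'the cat sat on the mat .'
shuffled_refs = ['A dog barked loudly .', 'The cat sat on the mat .', 'Birds fly south in winter .']
similarities = [char_similarity(ordered_ref, ref.lower()) for ref in shuffled_refs]
best_idx = max(range(len(similarities)), key=similarities.__getitem__)
print(best_idx, shuffled_refs[best_idx])  # -> 1 The cat sat on the mat .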
def transform(input_text,
              LengthRatioPreprocessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    input_lines = input_text.split("\n")
    # Read from input
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in input_lines], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': LengthRatioPreprocessor},
        'LevenshteinPreprocessor': {'target_ratio': LevenshteinPreprocessor},
        'WordRankRatioPreprocessor': {'target_ratio': WordRankRatioPreprocessor},
        'SentencePiecePreprocessor': {'vocab_size': SentencePiecePreprocessor},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=1)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
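# Example usage of transform() (a sketch; assumes the ACCESS package is
# installed, the helper imports shown in the generate.py script below are in
# scope, and prepare_models() can download the pretrained model on first call):
simplified_lines = transform('The incumbent argued that the legislation was unconstitutional .')
print('\n'.join(simplified_lines))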
import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute

if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    # Print the simplified sentences to stdout
    for line in yield_lines(pred_filepath):
        print(line)
import sys

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute

if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin (sys.stdin replaces the earlier fileinput.input() call)
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in sys.stdin], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.25},
        'LevenshteinPreprocessor': {'target_ratio': 0.25},
        'WordRankRatioPreprocessor': {'target_ratio': 0.25},
        # 'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
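    # The snippet above stops after the preprocessor configuration. A sketch of the
    # remaining steps, mirroring the generate.py script above (the beam size of 8 is
    # an assumption; the original continuation is not shown here):
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    for line in yield_lines(pred_filepath):
        print(line)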