def tail(self, parameters, overwrite=False):
    """Take the last n lines from file(s)"""
    outfiles = [os.path.join(self.output_dir, fname)
                for fname in parameters['outputs']]
    infiles = [os.path.join(self.output_dir, fname)
               for fname in parameters['inputs']]
    if len(outfiles) != len(infiles):
        raise ConfigurationError(
            "Number of input and output files should match in tail")
    if not overwrite and all(os.path.isfile(outfile) for outfile in outfiles):
        logger.info("Output files exist, skipping step")
        return
    n = parameters['n']
    for infile, outfile in zip(infiles, outfiles):
        logger.info("Processing file %s", infile)
        with file_open(infile, 'r') as inf, file_open(outfile, 'w') as outf:
            # Keep a sliding window of the last n lines
            tmp = []
            for line in tqdm(inf):
                tmp.append(line)
                if len(tmp) > n:
                    tmp.pop(0)
            for line in tmp:
                outf.write(line)

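# Illustrative sketch only: a 'parameters' dict that the tail step could
# consume, using the keys read above ('inputs', 'outputs', 'n'). File names
# are hypothetical, not taken from the source.
EXAMPLE_TAIL_PARAMS = {
    'inputs': ['data.en.gz', 'data.fi.gz'],
    'outputs': ['tail.en.gz', 'tail.fi.gz'],
    'n': 1000,  # keep the last 1000 lines of each file
}
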
def filter_data(self, parameters, overwrite=False):
    """Write sentences to file if they pass given filters"""
    src_out = os.path.join(self.output_dir, parameters['src_output'])
    tgt_out = os.path.join(self.output_dir, parameters['tgt_output'])
    if not overwrite and os.path.isfile(src_out) and os.path.isfile(tgt_out):
        logger.info("Output files exist, skipping step")
        return
    fixed_params = self.fix_filter_file_paths(parameters['filters'])
    filter_pipe = pipeline.FilterPipeline.from_config(fixed_params)
    filterfalse = parameters.get('filterfalse', False)
    pairs_gen = tqdm(self.get_pairs(
        parameters['src_input'], parameters['tgt_input']))
    if filterfalse:
        pairs = filter_pipe.filterfalse(pairs_gen)
    else:
        pairs = filter_pipe.filter(pairs_gen)
    limit = parameters.get('limit')
    with file_open(src_out, 'w') as source_file, \
            file_open(tgt_out, 'w') as target_file:
        for idx, pair in enumerate(pairs):
            source_file.write(pair[0] + '\n')
            target_file.write(pair[1] + '\n')
            source_file.flush()
            target_file.flush()
            if limit and idx >= limit - 1:
                break
        if not limit:
            # idx is zero-based, so idx + 1 pairs were written
            removed = pairs_gen.n - (idx + 1)
            logger.info("Filtered out {} / {} = {:.2f}% lines".format(
                removed, pairs_gen.n, 100 * removed / pairs_gen.n))

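# Illustrative sketch only: a 'parameters' dict for filter_data built from the
# keys read above. File names, the filter name, and its options are
# hypothetical; the 'filters' entry is whatever FilterPipeline.from_config
# accepts in this project.
EXAMPLE_FILTER_PARAMS = {
    'src_input': 'input.en.gz',
    'tgt_input': 'input.fi.gz',
    'src_output': 'filtered.en.gz',
    'tgt_output': 'filtered.fi.gz',
    'filters': [{'LengthFilter': {'min_length': 1, 'max_length': 100}}],
    'filterfalse': False,  # True would keep the pairs that fail the filters
    'limit': None,         # optional cap on the number of written pairs
}
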
def slice(self, parameters, overwrite=False):
    """Take slice from file(s)"""
    outfiles = [os.path.join(self.output_dir, fname)
                for fname in parameters['outputs']]
    infiles = [os.path.join(self.output_dir, fname)
               for fname in parameters['inputs']]
    if len(outfiles) != len(infiles):
        raise ConfigurationError(
            "Number of input and output files should match in slice")
    if not overwrite and all(os.path.isfile(outfile) for outfile in outfiles):
        logger.info("Output files exist, skipping step")
        return
    start = parameters.get('start', 0)
    stop = parameters.get('stop')
    step = parameters.get('step', 1)
    for infile, outfile in zip(infiles, outfiles):
        logger.info("Processing file %s", infile)
        with file_open(infile, 'r') as inf, file_open(outfile, 'w') as outf:
            for line in tqdm(itertools.islice(inf, start, stop, step)):
                outf.write(line)

def sort_files(self, parameters, overwrite=False):
    """Sort file(s) by values read from another file"""
    outfiles = [os.path.join(self.output_dir, fname)
                for fname in parameters['outputs']]
    infiles = [os.path.join(self.output_dir, fname)
               for fname in parameters['inputs']]
    if len(outfiles) != len(infiles):
        raise ConfigurationError(
            "Number of input and output files should match in sort")
    if not overwrite and all(os.path.isfile(outfile) for outfile in outfiles):
        logger.info("Output files exist, skipping step")
        return
    valuefile = os.path.join(self.output_dir, parameters['values'])
    reverse = parameters.get('reverse', False)
    key = parameters.get('key')
    typeconv = parameters.get('type')
    if typeconv is not None:
        typeconv = {'float': float, 'int': int, 'str': str}[typeconv]
    combine = parameters.get('combine_operator')
    with file_open(valuefile, 'r') as fobj:
        logger.info("Reading values from %s", valuefile)
        values = [x for x in tqdm(
            self._read_values(fobj, key=key, conv=typeconv, combine=combine))]
    order = list(np.argsort(values))
    if reverse:
        order.reverse()
    for infile, outfile in zip(infiles, outfiles):
        logger.info("Sorting file %s", infile)
        with file_open(infile, 'r') as fobj:
            lines = [line.rstrip() for line in tqdm(fobj)]
        with file_open(outfile, 'w') as fobj:
            for idx in tqdm(order):
                fobj.write(lines[idx] + '\n')

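# Illustrative sketch only: a 'parameters' dict for sort_files using the keys
# read above. File names are hypothetical, and the exact semantics of 'key'
# and 'combine_operator' depend on self._read_values, which is not shown here.
EXAMPLE_SORT_PARAMS = {
    'inputs': ['sents.en.gz', 'sents.fi.gz'],
    'outputs': ['sorted.en.gz', 'sorted.fi.gz'],
    'values': 'scores.jsonl.gz',  # one value per line, matching the inputs
    'reverse': True,              # highest values first
    'key': None,
    'type': 'float',              # converted with the mapping defined above
    'combine_operator': None,
}
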
def pair_generator(source_file_name, target_file_name,
                   src_tokenizer=None, tgt_tokenizer=None):
    """Yield and optionally tokenize sentence pairs from given files"""
    src_tokenize = tokenization.get_tokenize(src_tokenizer)
    tgt_tokenize = tokenization.get_tokenize(tgt_tokenizer)
    with file_open(source_file_name) as source_file, \
            file_open(target_file_name) as target_file:
        for src_line in source_file:
            tgt_line = target_file.readline()
            yield (src_tokenize(src_line.rstrip()),
                   tgt_tokenize(tgt_line.rstrip()))

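# Minimal usage sketch (assumed, not from the source): iterate over sentence
# pairs from two hypothetical files. Passing None for the tokenizers uses
# whatever default tokenization.get_tokenize returns for None.
def print_first_pairs(src_path='example.en.gz', tgt_path='example.fi.gz', n=3):
    """Print the first n sentence pairs from the given files"""
    for idx, (src, tgt) in enumerate(pair_generator(src_path, tgt_path)):
        print(src, '|||', tgt)
        if idx + 1 >= n:
            break
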
def remove_duplicates(self, parameters, overwrite=False):
    """Remove duplicates from parallel lines in files"""
    outfiles = [os.path.join(self.output_dir, fname)
                for fname in parameters['outputs']]
    infiles = [os.path.join(self.output_dir, fname)
               for fname in parameters['inputs']]
    if len(outfiles) != len(infiles):
        raise ConfigurationError(
            "Number of input and output files should match in "
            "remove_duplicates")
    if not overwrite and all(os.path.isfile(outfile) for outfile in outfiles):
        logger.info("Output files exist, skipping step")
        return
    hashname = parameters.get('hash', 'xx_64')
    if hashname and not hasattr(pyhash, hashname):
        raise ConfigurationError(
            "Algorithm '{}' not available from pyhash".format(hashname))
    hashfunc = getattr(pyhash, hashname)() if hashname else lambda x: x
    key_indices = parameters.get('compare', 'all')
    key_indices = list(range(len(infiles))) if key_indices == 'all' \
        else sorted(key_indices)
    if not isinstance(key_indices, list) or \
            not all(isinstance(x, int) and 0 <= x < len(infiles)
                    for x in key_indices):
        raise ConfigurationError(
            "The compare parameter for remove_duplicates has to be 'all' or "
            "a list of input file indices")
    infs = [file_open(infile) for infile in infiles]
    outfs = [file_open(outfile, 'w') for outfile in outfiles]
    counter = collections.Counter()
    removed_entries = 0
    total = 0
    for lines in tqdm(zip(*infs)):
        total += 1
        # Hash the concatenation of the key columns; write only the first
        # occurrence of each key
        key = hashfunc(''.join(lines[idx] for idx in key_indices))
        counter[key] += 1
        if counter[key] > 1:
            removed_entries += 1
            continue
        for idx, line in enumerate(lines):
            outfs[idx].write(line)
    removed_types = sum(1 for c in counter.values() if c > 1)
    logger.info(
        "Removed {} / {} = {:.2f}% duplicate lines (duplicate types: {})".format(
            removed_entries, total, 100 * removed_entries / total,
            removed_types))
    for idx in range(len(infiles)):
        infs[idx].close()
        outfs[idx].close()

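# Illustrative sketch only: a 'parameters' dict for remove_duplicates with the
# keys read above. File names are hypothetical; 'compare' may also be a list
# of input indices (e.g. [0]) to deduplicate on a subset of the files.
EXAMPLE_DEDUP_PARAMS = {
    'inputs': ['data.en.gz', 'data.fi.gz'],
    'outputs': ['dedup.en.gz', 'dedup.fi.gz'],
    'hash': 'xx_64',   # any pyhash algorithm name; a falsy value disables hashing
    'compare': 'all',
}
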
def concatenate(self, parameters, overwrite=False):
    """Concatenate files"""
    outfile = os.path.join(self.output_dir, parameters['output'])
    if not overwrite and os.path.isfile(outfile):
        logger.info("Output file exists, skipping step")
        return
    with file_open(outfile, 'w') as outf:
        for infile in parameters['inputs']:
            logger.info("opening %s", os.path.join(self.output_dir, infile))
            with file_open(os.path.join(self.output_dir, infile)) as inf:
                for line in tqdm(inf):
                    outf.write(line.rstrip() + '\n')

def split(self, parameters, overwrite=False):
    """Split parallel files into two subsets"""
    outfiles = [os.path.join(self.output_dir, fname)
                for fname in parameters['outputs']]
    outfiles_2 = [os.path.join(self.output_dir, fname)
                  for fname in parameters['outputs_2']] \
        if 'outputs_2' in parameters else []
    infiles = [os.path.join(self.output_dir, fname)
               for fname in parameters['inputs']]
    if len(outfiles) != len(infiles) or \
            (outfiles_2 and len(outfiles_2) != len(infiles)):
        raise ConfigurationError(
            "Number of input and output files should match in split")
    if not overwrite and all(os.path.isfile(outfile)
                             for outfile in outfiles + outfiles_2):
        logger.info("Output files exist, skipping step")
        return
    divisor = parameters['divisor']
    threshold = parameters.get('threshold', 1)
    hashname = parameters.get('hash', 'xx_64')
    hashseed = parameters.get('seed', 0)
    if not hashname:
        hashname = 'xx_64'
    if not hasattr(pyhash, hashname):
        raise ConfigurationError(
            "Algorithm '{}' not available from pyhash".format(hashname))
    hashfunc = getattr(pyhash, hashname)(seed=hashseed)
    key_indices = parameters.get('compare', 'all')
    key_indices = list(range(len(infiles))) if key_indices == 'all' \
        else sorted(key_indices)
    if not isinstance(key_indices, list) or \
            not all(isinstance(x, int) and 0 <= x < len(infiles)
                    for x in key_indices):
        raise ConfigurationError(
            "The compare parameter for split has to be 'all' or "
            "a list of input file indices")
    infs = [file_open(infile) for infile in infiles]
    outfs = [file_open(outfile, 'w') for outfile in outfiles]
    outfs_2 = [file_open(outfile, 'w') for outfile in outfiles_2]
    hits = 0
    total = 0
    for lines in tqdm(zip(*infs)):
        total += 1
        key = hashfunc(''.join(lines[idx] for idx in key_indices))
        if key % divisor < threshold:
            hits += 1
            for idx, line in enumerate(lines):
                outfs[idx].write(line)
        elif outfs_2:
            for idx, line in enumerate(lines):
                outfs_2[idx].write(line)
    logger.info(
        "Split {} lines to {} ({:.2f}%) and {} ({:.2f}%) lines".format(
            total, hits, 100 * hits / total, total - hits,
            100 * (total - hits) / total))
    for idx in range(len(infiles)):
        infs[idx].close()
        outfs[idx].close()
        if outfs_2:
            outfs_2[idx].close()

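# Illustrative sketch only: a 'parameters' dict for split with the keys read
# above. With divisor 10 and threshold 1, lines whose hashed key satisfies
# key % 10 < 1 (roughly one tenth) go to 'outputs' and the rest to
# 'outputs_2'. File names are hypothetical.
EXAMPLE_SPLIT_PARAMS = {
    'inputs': ['data.en.gz', 'data.fi.gz'],
    'outputs': ['test.en.gz', 'test.fi.gz'],
    'outputs_2': ['train.en.gz', 'train.fi.gz'],
    'divisor': 10,
    'threshold': 1,
    'hash': 'xx_64',
    'seed': 0,
    'compare': 'all',
}
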
def test_collect_links(self):
    ap = AlignmentParser(file_open(self.align_path))
    attrs, src_set, trg_set, src_doc, trg_doc = ap.collect_links()
    self.assertEqual(attrs, [{'id': 'SL1', 'xtargets': 's1;s1'},
                             {'id': 'SL2', 'xtargets': ';s2'}])
    self.assertEqual(src_set, {'s1'})
    self.assertEqual(trg_set, {'s1', 's2'})
    self.assertEqual(
        src_doc, 'en/Doyle_Arthur_Conan-Hound_of_the_Baskervilles.xml.gz')
    self.assertEqual(
        trg_doc, 'fi/Doyle_Arthur_Conan-Hound_of_the_Baskervilles.xml.gz')
    attrs, src_set, trg_set, src_doc, trg_doc = ap.collect_links()
    self.assertEqual(attrs, [{'id': 'SL1', 'xtargets': 's21;'},
                             {'id': 'SL2', 'xtargets': 's0 s1;s2 s3'}])
    self.assertEqual(src_set, {'s21', 's0', 's1'})
    self.assertEqual(trg_set, {'s2', 's3'})
    self.assertEqual(src_doc, 'en/2.xml.gz')
    self.assertEqual(trg_doc, 'fi/2.xml.gz')
    attrs, src_set, trg_set, src_doc, trg_doc = ap.collect_links()
    self.assertEqual(attrs, [])
    self.assertEqual(src_set, set())
    self.assertEqual(trg_set, set())
    self.assertEqual(src_doc, None)
    self.assertEqual(trg_doc, None)
    ap.bp.close_document()

def test_get_annotations(self):
    bp = BlockParser(file_open(self.books_path))
    sp = SentenceParser(file_open(self.books_path))
    for i in range(19):
        blocks = bp.get_complete_blocks()
    self.assertEqual(sp.get_annotations(blocks[0]), '|NN|w1.1|source|NN|NN')
    bp.close_document()
    sp.document.close()
    bp = BlockParser(file_open(self.books_path))
    sp = SentenceParser(file_open(self.books_path), anno_attrs=['pos'])
    for i in range(19):
        blocks = bp.get_complete_blocks()
    self.assertEqual(sp.get_annotations(blocks[0]), '|NN')
    bp.close_document()
    sp.document.close()

def test_tag_in_parents(self):
    bp = BlockParser(file_open(self.books_path))
    for i in range(22):
        blocks = bp.get_complete_blocks()
    self.assertTrue(bp.tag_in_parents('chunk', blocks[0]))
    self.assertTrue(bp.tag_in_parents('s', blocks[0]))
    bp.close_document()

def _get_total_lines(fname):
    """Return number of lines in file"""
    with file_open(fname) as fobj:
        total = -1
        for total, _ in tqdm(enumerate(fobj)):
            pass
    return total + 1

def test_get_raw_tag(self):
    bp = BlockParser(file_open(self.os_path), data_tag='w')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].get_raw_tag(),
                     '<time id="T1S" value="00:00:05,897" />')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].get_raw_tag(), '<w id="1.1">-</w>')
    bp.close_document()

def test_parsing_books_raw(self):
    bp = BlockParser(file_open(self.books_raw_path), data_tag='s')
    for i in range(5):
        blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 's')
    self.assertEqual(blocks[0].attributes['id'], 's3')
    self.assertEqual(blocks[0].data, 'Victor Hugo')
    bp.close_document()

def test_store_sentences(self):
    sp = SentenceParser(file_open(self.books_path), preprocessing='xml')
    sp.store_sentences({'s1'})
    self.assertEqual(sp.sentences['s1'][0],
                     'Source : Project GutenbergTranslation')
    sp = SentenceParser(file_open(self.books_raw_path), preprocessing='raw')
    sp.store_sentences({'s1'})
    self.assertEqual(
        sp.sentences['s1'][0],
        'Source: Project GutenbergTranslation: Isabel F. '
        'HapgoodAudiobook available here')
    sp = SentenceParser(file_open(self.os_path), preprocessing='xml')
    sp.store_sentences({'1'})
    self.assertEqual(sp.sentences['1'][0], "- How 'd you score that ?")
    sp = SentenceParser(file_open(self.os_raw_path), preprocessing='raw')
    sp.store_sentences({'1'})
    self.assertEqual(sp.sentences['1'][0], "- How'd you score that?")

def train_ngram(self, parameters, overwrite=False):
    """Train an n-gram language model"""
    model_out = os.path.join(self.output_dir, parameters['model'])
    if not overwrite and os.path.isfile(model_out):
        logger.info("Output file exists, skipping step")
        return
    data_name = parameters['data']
    seg_name = data_name + '.seg.gz'
    tokenizer = lm.LMTokenizer(**parameters['parameters'])
    with file_open(os.path.join(self.output_dir, data_name), 'r') as infile, \
            file_open(os.path.join(self.output_dir, seg_name), 'w') as outfile:
        for line in tqdm(infile):
            tokens = tokenizer.tokenize(line.strip())
            outfile.write(' '.join(tokens) + '\n')
    lm.train(os.path.join(self.output_dir, seg_name), model_out,
             **parameters['parameters'])

def test_parsing_os_raw(self):
    bp = BlockParser(file_open(self.os_raw_path), data_tag='s')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'time')
    self.assertEqual(blocks[0].parent.name, 's')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 's')
    self.assertEqual(blocks[0].data.strip(), '- How\'d you score that?')
    self.assertEqual(blocks[0].parent.name, 'document')
    bp.close_document()

def load_dataframe(data_file):
    """Load normalized scores dataframe from a JSON lines file"""
    data = []
    with file_open(data_file) as dfile:
        for line in dfile:
            try:
                data.append(json.loads(line))
            except json.decoder.JSONDecodeError as err:
                logger.error(line)
                raise err
    return pd.DataFrame(json_normalize(data))

def test_parsing_books(self):
    bp = BlockParser(file_open(self.books_path), data_tag='w')
    for i in range(22):
        blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'w')
    self.assertEqual(blocks[0].data, 'Project')
    self.assertEqual(blocks[0].attributes['tree'], 'NP')
    self.assertEqual(blocks[0].parent.name, 'chunk')
    self.assertEqual(blocks[0].parent.parent.name, 's')
    self.assertEqual(blocks[0].parent.parent.attributes['id'], 's1')
    bp.close_document()

def test_read_sentence(self):
    sp = SentenceParser(file_open(self.books_raw_path), preprocessing='raw')
    sp.store_sentences({'s1', 's2'})
    self.assertEqual(
        sp.read_sentence(['s2'])[0], ['Hunchback of Notre-Dame'])
    self.assertEqual(
        sp.read_sentence(['s1', 's2'])[0],
        ['Source: Project GutenbergTranslation: Isabel F. '
         'HapgoodAudiobook available here',
         'Hunchback of Notre-Dame'])

def test_get_complete_blocks(self):
    bp = BlockParser(file_open(self.xml_path), data_tag='stamp')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'stamp')
    self.assertEqual(blocks[0].data, '123')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'child1')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'stamp')
    self.assertEqual(blocks[0].data, '321')
    self.assertEqual(blocks[1].name, 'child2')
    bp.close_document()

def test_parsing_os(self):
    bp = BlockParser(file_open(self.os_path), data_tag='w')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'time')
    self.assertEqual(blocks[0].parent.name, 's')
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'w')
    self.assertEqual(blocks[0].parent.name, 's')
    for i in range(8):
        blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].name, 'w')
    self.assertEqual(blocks[0].parent.attributes['id'], '2')
    bp.close_document()

def get_subset(self, parameters, overwrite=False):
    """Get a random subset of parallel data

    Keeps the order of lines, unless shuffle_target is True in
    parameters, in which case the target lines will be in a random
    order.

    """
    src_in = os.path.join(self.output_dir, parameters['src_input'])
    tgt_in = os.path.join(self.output_dir, parameters['tgt_input'])
    src_out = os.path.join(self.output_dir, parameters['src_output'])
    tgt_out = os.path.join(self.output_dir, parameters['tgt_output'])
    if not overwrite and os.path.isfile(src_out) and os.path.isfile(tgt_out):
        logger.info("Output files exist, skipping step")
        return
    random.seed(parameters.get('seed', None))
    size = parameters['size']
    shuffle_target = parameters.get('shuffle_target', False)
    total = self._get_total_lines(src_in)
    logger.info("Sampling subset of %s lines from total %s lines", size, total)
    if shuffle_target:
        sample = random.sample(range(total), size)
        with file_open(src_in) as inf, file_open(src_out, 'w') as outf:
            for line in self._yield_subset(inf, sample):
                outf.write(line)
        # Draw a separate sample for the target side and shuffle it
        sample = random.sample(range(total), size)
        with file_open(tgt_in) as inf:
            lines = [line for line in self._yield_subset(inf, sample)]
        random.shuffle(lines)
        with file_open(tgt_out, 'w') as outf:
            for line in lines:
                outf.write(line)
    else:
        sample = random.sample(range(total), size)
        with file_open(src_in) as inf, file_open(src_out, 'w') as outf:
            for line in self._yield_subset(inf, sample):
                outf.write(line)
        with file_open(tgt_in) as inf, file_open(tgt_out, 'w') as outf:
            for line in self._yield_subset(inf, sample):
                outf.write(line)

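# Illustrative sketch only: a 'parameters' dict for get_subset with the keys
# read above. File names are hypothetical; setting shuffle_target to True
# samples and shuffles the target side independently, which breaks the
# parallel alignment on purpose.
EXAMPLE_SUBSET_PARAMS = {
    'src_input': 'data.en.gz',
    'tgt_input': 'data.fi.gz',
    'src_output': 'subset.en.gz',
    'tgt_output': 'subset.fi.gz',
    'size': 10000,
    'seed': 1,
    'shuffle_target': False,
}
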
def load_dataframe_in_chunks(data_file, chunksize):
    """Yield normalized scores dataframes from a chunked JSON lines file

    Use instead of load_dataframe if the data is too large to fit in memory.

    """
    with file_open(data_file) as dfile:
        for num, chunk in enumerate(grouper(dfile, chunksize)):
            data = []
            for line in chunk:
                try:
                    data.append(json.loads(line))
                except json.decoder.JSONDecodeError as err:
                    logger.error(line)
                    raise err
            logger.info("Processing chunk %s with %s lines", num, len(data))
            yield pd.DataFrame(json_normalize(data))

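# Minimal usage sketch (assumed, not from the source): process a large score
# file chunk by chunk instead of loading it all at once. The file name and
# chunk size are hypothetical.
def count_rows_in_chunks(path='scores.jsonl.gz', chunksize=100000):
    """Count rows in a JSON lines file without loading it fully into memory"""
    total_rows = 0
    for chunk_df in load_dataframe_in_chunks(path, chunksize):
        total_rows += len(chunk_df)
    return total_rows
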
def write_probs(self, input_fname, output_fname, true_label=None,
                standardize=True, chunksize=None):
    """Write classification probabilities to output file"""
    if chunksize:
        dfs_tbc = load_dataframe_in_chunks(input_fname, chunksize)
    else:
        dfs_tbc = [load_dataframe(input_fname)]
    logger.info("Classifier labels: %s", self.classifier.classes_)
    with file_open(output_fname, 'w') as output:
        for df_tbc in dfs_tbc:
            df = self.standardize(df_tbc) if standardize else df_tbc
            probas = self.classifier.predict_proba(df[self.features])
            if true_label:
                true_labels = df_tbc[true_label]
                logger.info('roc_auc: %s',
                            roc_auc_score(true_labels, probas[:, 1]))
            for proba in probas[:, 1]:
                output.write('{0:.10f}\n'.format(proba))

def write_preds(self, input_fname, output_fname, true_label=None,
                standardize=True, chunksize=None):
    """Write predicted class labels to output file"""
    if chunksize:
        dfs_tbc = load_dataframe_in_chunks(input_fname, chunksize)
    else:
        dfs_tbc = [load_dataframe(input_fname)]
    logger.info("Classifier labels: %s", self.classifier.classes_)
    with file_open(output_fname, 'w') as output:
        for df_tbc in dfs_tbc:
            df = self.standardize(df_tbc) if standardize else df_tbc
            labels = self.classifier.predict(df[self.features])
            if true_label:
                true_labels = df_tbc[true_label]
                logger.info('accuracy: %s', accuracy_score(true_labels, labels))
                logger.info('confusion matrix:\n%s',
                            confusion_matrix(true_labels, labels))
            for label in labels:
                output.write('{}\n'.format(label))

def test_initialize_block_parser(self):
    bp = BlockParser(file_open(self.xml_path))
    bp.close_document()

def test_parse_line(self):
    bp = BlockParser(file_open(self.xml_path))
    line = bp.document.readline()
    bp.parse_line(line)
    bp.close_document()

def test_parse_document(self):
    bp = BlockParser(file_open(self.xml_path))
    blocks = bp.get_complete_blocks()
    while blocks:
        blocks = bp.get_complete_blocks()
    bp.close_document()

def test_parsing_alignment(self):
    bp = BlockParser(file_open(self.align_path))
    blocks = bp.get_complete_blocks()
    self.assertEqual(blocks[0].parent.name, 'linkGrp')
    self.assertEqual(blocks[0].attributes['xtargets'], 's1;s1')
    bp.close_document()