Example #1
 def tail(self, parameters, overwrite=False):
     """Take the last n lines from file(s)"""
     outfiles = [
         os.path.join(self.output_dir, fname)
         for fname in parameters['outputs']
     ]
     infiles = [
         os.path.join(self.output_dir, fname)
         for fname in parameters['inputs']
     ]
     if len(outfiles) != len(infiles):
         raise ConfigurationError(
             "Number of input and output files should match in head")
     if not overwrite and all(
             os.path.isfile(outfile) for outfile in outfiles):
         logger.info("Output files exists, skipping step")
         return
     n = parameters['n']
     for infile, outfile in zip(infiles, outfiles):
         logger.info("Processing file %s", infile)
         with file_open(infile, 'r') as inf, file_open(outfile,
                                                       'w') as outf:
             tmp = []
             for line in tqdm(inf):
                 tmp.append(line)
                 if len(tmp) > n:
                     tmp.pop(0)
             for line in tmp:
                 outf.write(line)
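
The loop above keeps the last n lines in a manually trimmed list; as a minimal sketch of the same sliding-window idea, collections.deque with maxlen does the trimming automatically (io.StringIO stands in for a real file here):

import collections
import io

# A deque with maxlen keeps only the most recent n items, the same
# behaviour as appending to tmp and popping from the front above.
n = 2
infile = io.StringIO("a\nb\nc\nd\n")
last_lines = collections.deque(infile, maxlen=n)
print(''.join(last_lines))  # prints the last two lines, "c" and "d"
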
Example #2
 def filter_data(self, parameters, overwrite=False):
     """Write sentences to file if they pass given filters"""
     src_out = os.path.join(self.output_dir, parameters['src_output'])
     tgt_out = os.path.join(self.output_dir, parameters['tgt_output'])
     if not overwrite and os.path.isfile(src_out) and os.path.isfile(tgt_out):
         logger.info("Output files exists, skipping step")
         return
     fixed_params = self.fix_filter_file_paths(parameters['filters'])
     filter_pipe = pipeline.FilterPipeline.from_config(fixed_params)
     filterfalse = parameters.get('filterfalse', False)
     pairs_gen = tqdm(self.get_pairs(
         parameters['src_input'], parameters['tgt_input']))
     if filterfalse:
         pairs = filter_pipe.filterfalse(pairs_gen)
     else:
         pairs = filter_pipe.filter(pairs_gen)
     limit = parameters.get('limit')
     with file_open(src_out, 'w') as source_file, \
             file_open(tgt_out, 'w') as target_file:
         for idx, pair in enumerate(pairs):
             source_file.write(pair[0]+'\n')
             target_file.write(pair[1]+'\n')
             source_file.flush()
             target_file.flush()
             if limit and idx >= limit - 1:
                 break
     if not limit:
         # idx is zero-based, so idx + 1 pairs were written
         removed = pairs_gen.n - (idx + 1)
         logger.info("Filtered out {} / {} = {:.2f}% lines".format(
             removed, pairs_gen.n, 100 * removed / pairs_gen.n))
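
FilterPipeline.filter and filterfalse are this project's own pipeline methods; as a rough stand-in showing the same accept/reject split with a toy predicate, the built-in filter and itertools.filterfalse behave analogously:

import itertools

# Toy predicate: keep only pairs whose sides are both non-empty.
def accept(pair):
    return all(len(side) > 0 for side in pair)

pairs = [('hello', 'hei'), ('', 'tyhja'), ('world', 'maailma')]
kept = list(filter(accept, pairs))                     # pairs that pass
rejected = list(itertools.filterfalse(accept, pairs))  # pairs that fail
print(kept)      # [('hello', 'hei'), ('world', 'maailma')]
print(rejected)  # [('', 'tyhja')]
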
Example #3
 def slice(self, parameters, overwrite=False):
     """Take slice from file(s)"""
     outfiles = [
         os.path.join(self.output_dir, fname)
         for fname in parameters['outputs']
     ]
     infiles = [
         os.path.join(self.output_dir, fname)
         for fname in parameters['inputs']
     ]
     if len(outfiles) != len(infiles):
         raise ConfigurationError(
             "Number of input and output files should match in head")
     if not overwrite and all(
             os.path.isfile(outfile) for outfile in outfiles):
         logger.info("Output files exists, skipping step")
         return
     start = parameters.get('start', 0)
     stop = parameters.get('stop')
     step = parameters.get('step', 1)
     for infile, outfile in zip(infiles, outfiles):
         logger.info("Processing file %s", infile)
         with file_open(infile, 'r') as inf, file_open(outfile,
                                                       'w') as outf:
             for line in tqdm(itertools.islice(inf, start, stop, step)):
                 outf.write(line)
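
A minimal, self-contained sketch of the itertools.islice call used above, with start=2, stop=8, step=2 over an in-memory file:

import io
import itertools

infile = io.StringIO('\n'.join(str(i) for i in range(10)) + '\n')
# islice reads lazily and yields only lines 2, 4 and 6 (0-based indices).
for line in itertools.islice(infile, 2, 8, 2):
    print(line.rstrip())  # 2, 4, 6
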
Example #4
 def sort_files(self, parameters, overwrite=False):
     """Sort file(s) by values read from other file"""
     outfiles = [os.path.join(self.output_dir, fname) for fname in parameters['outputs']]
     infiles = [os.path.join(self.output_dir, fname) for fname in parameters['inputs']]
     if len(outfiles) != len(infiles):
         raise ConfigurationError("Number of input and output files should match in sort")
     if not overwrite and all(os.path.isfile(outfile) for outfile in outfiles):
         logger.info("Output files exists, skipping step")
         return
     valuefile = os.path.join(self.output_dir, parameters['values'])
     reverse = parameters.get('reverse', False)
     key = parameters.get('key')
     typeconv = parameters.get('type')
     if typeconv is not None:
         typeconv = {'float': float, 'int': int, 'str': str}[typeconv]
     combine = parameters.get('combine_operator')
     with file_open(valuefile, 'r') as fobj:
         logger.info("Reading values from %s", valuefile)
         values = [x for x in tqdm(
             self._read_values(fobj, key=key, conv=typeconv, combine=combine))]
         order = list(np.argsort(values))
         if reverse:
             order.reverse()
     for infile, outfile in zip(infiles, outfiles):
         logger.info("Sorting file %s", infile)
         with file_open(infile, 'r') as fobj:
             lines = [line.rstrip() for line in tqdm(fobj)]
         with file_open(outfile, 'w') as fobj:
             for idx in tqdm(order):
                 fobj.write(lines[idx] + '\n')
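
The sort order comes from np.argsort over the score values; a small sketch of that index-based reordering, including the reverse=True case:

import numpy as np

values = [0.7, 0.1, 0.9]
lines = ['medium', 'low', 'high']
order = list(np.argsort(values))      # indices that sort the values: [1, 0, 2]
print([lines[idx] for idx in order])  # ['low', 'medium', 'high']
order.reverse()                       # descending order, as with reverse=True
print([lines[idx] for idx in order])  # ['high', 'medium', 'low']
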
Example #5
 def pair_generator(source_file_name, target_file_name,
                    src_tokenizer=None, tgt_tokenizer=None):
     """Yield and optionally tokenize sentence pairs from given files"""
     src_tokenize = tokenization.get_tokenize(src_tokenizer)
     tgt_tokenize = tokenization.get_tokenize(tgt_tokenizer)
     with file_open(source_file_name) as source_file, \
             file_open(target_file_name) as target_file:
         for src_line in source_file:
             tgt_line = target_file.readline()
             yield (src_tokenize(src_line.rstrip()), tgt_tokenize(tgt_line.rstrip()))
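
pair_generator pairs lines by calling readline on the target file while iterating the source file, so readline returns empty strings once the target is exhausted; a simpler sketch with zip (which instead stops at the shorter file) shows the basic line-by-line pairing:

import io

src = io.StringIO("source one\nsource two\n")
tgt = io.StringIO("target one\ntarget two\n")
# rstrip drops the trailing newline, matching what pair_generator yields
# when no tokenizer is given.
for src_line, tgt_line in zip(src, tgt):
    print((src_line.rstrip(), tgt_line.rstrip()))
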
Example #6
 def remove_duplicates(self, parameters, overwrite=False):
     """Remove duplicates from parallel lines in files"""
     outfiles = [
         os.path.join(self.output_dir, fname)
         for fname in parameters['outputs']
     ]
     infiles = [
         os.path.join(self.output_dir, fname)
         for fname in parameters['inputs']
     ]
     if len(outfiles) != len(infiles):
         raise ConfigurationError(
             "Number of input and output files should match in remove_duplicates"
         )
     if not overwrite and all(
             os.path.isfile(outfile) for outfile in outfiles):
         logger.info("Output files exists, skipping step")
         return
     hashname = parameters.get('hash', 'xx_64')
     if hashname and not hasattr(pyhash, hashname):
         raise ConfigurationError(
             "Algorithm '{}' not available from from pyhash".format(
                 hashname))
     hashfunc = getattr(pyhash, hashname)() if hashname else lambda x: x
     key_indices = parameters.get('compare', 'all')
     key_indices = list(range(len(infiles))) if key_indices == 'all' \
         else sorted(key_indices)
     if not isinstance(key_indices, list) or \
        not all(isinstance(x, int) and 0 <= x < len(infiles) for x in key_indices):
         raise ConfigurationError(
             "The compare parameter for remove_duplicates has to be 'all' or "
             "a list of input file indices")
     infs = [file_open(infile) for infile in infiles]
     outfs = [file_open(outfile, 'w') for outfile in outfiles]
     counter = collections.Counter()
     removed_entries = 0
     total = 0
     for lines in tqdm(zip(*infs)):
         total += 1
         key = hashfunc(''.join(lines[idx] for idx in key_indices))
         counter[key] += 1
         if counter[key] > 1:
             removed_entries += 1
             continue
         for idx, line in enumerate(lines):
             outfs[idx].write(line)
     removed_types = sum(1 for c in counter.values() if c > 1)
     logger.info(
         "Removed {} / {} = {:.2f}% duplicate lines (duplicate types: {})".
         format(removed_entries, total, 100 * removed_entries / total,
                removed_types))
     for idx in range(len(infiles)):
         infs[idx].close()
         outfs[idx].close()
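
pyhash is an optional dependency; as a sketch of the same hash-keyed duplicate counting on in-memory data, hashlib can stand in for the hash function:

import collections
import hashlib

# Toy parallel data: two "files" as lists of lines; the third pair is a duplicate.
src = ['a\n', 'b\n', 'a\n']
tgt = ['x\n', 'y\n', 'x\n']

counter = collections.Counter()
kept = []
for lines in zip(src, tgt):
    # Any stable hash of the joined key works; hashlib replaces pyhash here.
    key = hashlib.md5(''.join(lines).encode('utf-8')).hexdigest()
    counter[key] += 1
    if counter[key] == 1:
        kept.append(lines)
print(kept)  # [('a\n', 'x\n'), ('b\n', 'y\n')]
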
Example #7
 def concatenate(self, parameters, overwrite=False):
     """Concatenate files"""
     outfile = os.path.join(self.output_dir, parameters['output'])
     if not overwrite and os.path.isfile(outfile):
         logger.info("Output file exists, skipping step")
         return
     with file_open(outfile, 'w') as outf:
         for infile in parameters['inputs']:
             logger.info("opening %s", os.path.join(self.output_dir, infile))
             with file_open(os.path.join(self.output_dir, infile)) as inf:
                 for line in tqdm(inf):
                     outf.write(line.rstrip() + '\n')
Example #8
 def split(self, parameters, overwrite=False):
     """Split parallel files to two subsets"""
     outfiles = [os.path.join(self.output_dir, fname) for fname in parameters['outputs']]
     outfiles_2 = [os.path.join(self.output_dir, fname) for fname in parameters['outputs_2']] \
         if 'outputs_2' in parameters else []
     infiles = [os.path.join(self.output_dir, fname) for fname in parameters['inputs']]
     if len(outfiles) != len(infiles) or (outfiles_2 and len(outfiles_2) != len(infiles)):
         raise ConfigurationError(
             "Number of input and output files should match in split")
     if not overwrite and all(os.path.isfile(outfile) for outfile in outfiles + outfiles_2):
         logger.info("Output files exists, skipping step")
         return
     divisor = parameters['divisor']
     threshold = parameters.get('threshold', 1)
     hashname = parameters.get('hash', 'xx_64')
     hashseed = parameters.get('seed', 0)
     if not hashname:
         hashname = 'xx_64'
     if not hasattr(pyhash, hashname):
         raise ConfigurationError(
             "Algorithm '{}' not available from from pyhash".format(hashname))
     hashfunc = getattr(pyhash, hashname)(seed=hashseed)
     key_indices = parameters.get('compare', 'all')
     key_indices = list(range(len(infiles))) if key_indices == 'all' \
         else sorted(key_indices)
     if not isinstance(key_indices, list) or \
        not all(isinstance(x, int) and 0 <= x < len(infiles) for x in key_indices):
         raise ConfigurationError(
             "The compare parameter for split has to be 'all' or "
             "a list of input file indices")
     infs = [file_open(infile) for infile in infiles]
     outfs = [file_open(outfile, 'w') for outfile in outfiles]
     outfs_2 = [file_open(outfile, 'w') for outfile in outfiles_2]
     hits = 0
     total = 0
     for lines in tqdm(zip(*infs)):
         total += 1
         key = hashfunc(''.join(lines[idx] for idx in key_indices))
         if key % divisor < threshold:
             hits += 1
             for idx, line in enumerate(lines):
                 outfs[idx].write(line)
         elif outfs_2:
             for idx, line in enumerate(lines):
                 outfs_2[idx].write(line)
     logger.info(
         "Split {} lines to {} ({:.2f}%) and {} ({:.2f}%) lines".format(
             total, hits, 100 * hits / total, total - hits, 100 * (total - hits) / total))
     for idx in range(len(infiles)):
         infs[idx].close()
         outfs[idx].close()
         if outfs_2:
             outfs_2[idx].close()
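
The split is deterministic: each tuple of parallel lines is hashed and routed to the first output when hash % divisor < threshold. A small sketch with hashlib standing in for pyhash:

import hashlib

def bucket(line, divisor=10, threshold=1):
    """Deterministic split: the same line always lands in the same bucket."""
    # hashlib replaces pyhash here; any stable integer hash works.
    key = int(hashlib.md5(line.encode('utf-8')).hexdigest(), 16)
    return key % divisor < threshold

lines = ['one\n', 'two\n', 'three\n', 'four\n']
first = [line for line in lines if bucket(line)]
second = [line for line in lines if not bucket(line)]
print(len(first), len(second))  # roughly a 1/10 vs 9/10 split on larger data
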
Example #9
 def test_collect_links(self):
     ap = AlignmentParser(file_open(self.align_path))
     attrs, src_set, trg_set, src_doc, trg_doc = ap.collect_links()
     self.assertEqual(attrs, [{
         'id': 'SL1',
         'xtargets': 's1;s1'
     }, {
         'id': 'SL2',
         'xtargets': ';s2'
     }])
     self.assertEqual(src_set, {'s1'})
     self.assertEqual(trg_set, {'s1', 's2'})
     self.assertEqual(
         src_doc, 'en/Doyle_Arthur_Conan-Hound_of_the_Baskervilles.xml.gz')
     self.assertEqual(
         trg_doc, 'fi/Doyle_Arthur_Conan-Hound_of_the_Baskervilles.xml.gz')
     attrs, src_set, trg_set, src_doc, trg_doc = ap.collect_links()
     self.assertEqual(attrs, [{
         'id': 'SL1',
         'xtargets': 's21;'
     }, {
         'id': 'SL2',
         'xtargets': 's0 s1;s2 s3'
     }])
     self.assertEqual(src_set, {'s21', 's0', 's1'})
     self.assertEqual(trg_set, {'s2', 's3'})
     self.assertEqual(src_doc, 'en/2.xml.gz')
     self.assertEqual(trg_doc, 'fi/2.xml.gz')
     attrs, src_set, trg_set, src_doc, trg_doc = ap.collect_links()
     self.assertEqual(attrs, [])
     self.assertEqual(src_set, set())
     self.assertEqual(trg_set, set())
     self.assertEqual(src_doc, None)
     self.assertEqual(trg_doc, None)
     ap.bp.close_document()
Example #10
 def test_get_annotations(self):
     bp = BlockParser(file_open(self.books_path))
     sp = SentenceParser(file_open(self.books_path))
     for i in range(19):
         blocks = bp.get_complete_blocks()
     self.assertEqual(sp.get_annotations(blocks[0]),
                      '|NN|w1.1|source|NN|NN')
     bp.close_document()
     sp.document.close()
     bp = BlockParser(file_open(self.books_path))
     sp = SentenceParser(file_open(self.books_path), anno_attrs=['pos'])
     for i in range(19):
         blocks = bp.get_complete_blocks()
     self.assertEqual(sp.get_annotations(blocks[0]), '|NN')
     bp.close_document()
     sp.document.close()
Example #11
 def test_tag_in_parents(self):
     bp = BlockParser(file_open(self.books_path))
     for i in range(22):
         blocks = bp.get_complete_blocks()
     self.assertTrue(bp.tag_in_parents('chunk', blocks[0]))
     self.assertTrue(bp.tag_in_parents('s', blocks[0]))
     bp.close_document()
Example #12
 def _get_total_lines(fname):
     """Return number of lines in file"""
     with file_open(fname) as fobj:
         total = -1
         for total, _ in tqdm(enumerate(fobj)):
             pass
     return total + 1
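
A self-contained sketch of the same counting idiom on an in-memory file (enumerate leaves total at the last zero-based index, hence the +1):

import io

fobj = io.StringIO("one\ntwo\nthree\n")
total = -1
for total, _ in enumerate(fobj):
    pass
print(total + 1)  # 3; equivalently, sum(1 for _ in fobj) on a fresh handle
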
Example #13
 def test_get_raw_tag(self):
     bp = BlockParser(file_open(self.os_path), data_tag='w')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].get_raw_tag(),
             '<time id="T1S" value="00:00:05,897" />')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].get_raw_tag(), '<w id="1.1">-</w>')
     bp.close_document()
Example #14
 def test_parsing_books_raw(self):
     bp = BlockParser(file_open(self.books_raw_path), data_tag='s')
     for i in range(5):
         blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 's')
     self.assertEqual(blocks[0].attributes['id'], 's3')
     self.assertEqual(blocks[0].data, 'Victor Hugo')
     bp.close_document()
Example #15
 def test_store_sentences(self):
     sp = SentenceParser(file_open(self.books_path), preprocessing='xml')
     sp.store_sentences({'s1'})
     self.assertEqual(sp.sentences['s1'][0],
                      'Source : Project GutenbergTranslation')
     sp = SentenceParser(file_open(self.books_raw_path),
                         preprocessing='raw')
     sp.store_sentences({'s1'})
     self.assertEqual(
         sp.sentences['s1'][0],
         'Source: Project GutenbergTranslation: Isabel F. '
         'HapgoodAudiobook available here')
     sp = SentenceParser(file_open(self.os_path), preprocessing='xml')
     sp.store_sentences({'1'})
     self.assertEqual(sp.sentences['1'][0], "- How 'd you score that ?")
     sp = SentenceParser(file_open(self.os_raw_path), preprocessing='raw')
     sp.store_sentences({'1'})
     self.assertEqual(sp.sentences['1'][0], "- How'd you score that?")
Example #16
 def train_ngram(self, parameters, overwrite=False):
     """Train an n-gram language model"""
     model_out = os.path.join(self.output_dir, parameters['model'])
     if not overwrite and os.path.isfile(model_out):
         logger.info("Output file exists, skipping step")
         return
     data_name = parameters['data']
     seg_name = data_name + '.seg.gz'
     tokenizer = lm.LMTokenizer(**parameters['parameters'])
     with file_open(os.path.join(self.output_dir, data_name), 'r') as \
             infile, \
             file_open(os.path.join(self.output_dir, seg_name), 'w') as \
             outfile:
         for line in tqdm(infile):
             tokens = tokenizer.tokenize(line.strip())
             outfile.write(' '.join(tokens) + '\n')
     lm.train(os.path.join(self.output_dir, seg_name), model_out,
              **parameters['parameters'])
Example #17
 def test_parsing_os_raw(self):
     bp = BlockParser(file_open(self.os_raw_path), data_tag='s')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'time')
     self.assertEqual(blocks[0].parent.name, 's')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 's')
     self.assertEqual(blocks[0].data.strip(), '- How\'d you score that?')
     self.assertEqual(blocks[0].parent.name, 'document')
     bp.close_document()
Example #18
def load_dataframe(data_file):
    """Load normalized scores dataframe from a JSON lines file"""
    data = []
    with file_open(data_file) as dfile:
        for line in dfile:
            try:
                data.append(json.loads(line))
            except json.decoder.JSONDecodeError as err:
                logger.error(line)
                raise err
    return pd.DataFrame(json_normalize(data))
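
A minimal sketch of the JSON-lines loading step, assuming a recent pandas where json_normalize is available as pd.json_normalize; the nested field names below are made up for illustration only:

import io
import json

import pandas as pd

# Two JSON lines with nested score dicts (hypothetical field names).
jsonl = io.StringIO('{"LengthRatioFilter": {"word": 1.5}}\n'
                    '{"LengthRatioFilter": {"word": 0.8}}\n')
data = [json.loads(line) for line in jsonl]
# json_normalize flattens the nested dicts into dotted column names.
df = pd.json_normalize(data)
print(df.columns.tolist())  # ['LengthRatioFilter.word']
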
Example #19
 def test_parsing_books(self):
     bp = BlockParser(file_open(self.books_path), data_tag='w')
     for i in range(22):
         blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'w')
     self.assertEqual(blocks[0].data, 'Project')
     self.assertEqual(blocks[0].attributes['tree'], 'NP')
     self.assertEqual(blocks[0].parent.name, 'chunk')
     self.assertEqual(blocks[0].parent.parent.name, 's')
     self.assertEqual(blocks[0].parent.parent.attributes['id'], 's1')
     bp.close_document()
Example #20
 def test_read_sentence(self):
     sp = SentenceParser(file_open(self.books_raw_path),
                         preprocessing='raw')
     sp.store_sentences({'s1', 's2'})
     self.assertEqual(
         sp.read_sentence(['s2'])[0], ['Hunchback of Notre-Dame'])
     self.assertEqual(
         sp.read_sentence(['s1', 's2'])[0], [
             'Source: Project GutenbergTranslation: Isabel F. '
             'HapgoodAudiobook available here', 'Hunchback of '
             'Notre-Dame'
         ])
Example #21
 def test_get_complete_blocks(self):
     bp = BlockParser(file_open(self.xml_path), data_tag='stamp')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'stamp')
     self.assertEqual(blocks[0].data, '123')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'child1')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'stamp')
     self.assertEqual(blocks[0].data, '321')
     self.assertEqual(blocks[1].name, 'child2')
     bp.close_document()
Example #22
 def test_parsing_os(self):
     bp = BlockParser(file_open(self.os_path), data_tag='w')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'time')
     self.assertEqual(blocks[0].parent.name, 's')
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'w')
     self.assertEqual(blocks[0].parent.name, 's')
     for i in range(8):
         blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].name, 'w')
     self.assertEqual(blocks[0].parent.attributes['id'], '2')
     bp.close_document()
Example #23
    def get_subset(self, parameters, overwrite=False):
        """Get random subset of parallel data

        Keeps the order of lines, unless if shuffle_target is True in
        parameters, in which case the target lines will be in a random
        order.

        """
        src_in = os.path.join(self.output_dir, parameters['src_input'])
        tgt_in = os.path.join(self.output_dir, parameters['tgt_input'])
        src_out = os.path.join(self.output_dir, parameters['src_output'])
        tgt_out = os.path.join(self.output_dir, parameters['tgt_output'])
        if not overwrite and os.path.isfile(src_out) and os.path.isfile(
                tgt_out):
            logger.info("Output files exists, skipping step")
            return
        random.seed(parameters.get('seed', None))
        size = parameters['size']
        shuffle_target = parameters.get('shuffle_target', False)
        total = self._get_total_lines(src_in)
        logger.info("Sampling subset of %s lines from total %s lines", size,
                    total)
        if shuffle_target:
            sample = random.sample(range(total), size)
            with file_open(src_in) as inf, \
                 file_open(src_out, 'w') as outf:
                for line in self._yield_subset(inf, sample):
                    outf.write(line)
            sample = random.sample(range(total), size)
            with file_open(tgt_in) as inf:
                lines = [line for line in self._yield_subset(inf, sample)]
            random.shuffle(lines)
            with file_open(tgt_out, 'w') as outf:
                for line in lines:
                    outf.write(line)
        else:
            sample = random.sample(range(total), size)
            with file_open(src_in) as inf, \
                 file_open(src_out, 'w') as outf:
                for line in self._yield_subset(inf, sample):
                    outf.write(line)
            with file_open(tgt_in) as inf, \
                 file_open(tgt_out, 'w') as outf:
                for line in self._yield_subset(inf, sample):
                    outf.write(line)
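
The subset is chosen as a random sample of line indices, which the _yield_subset helper (not shown in this listing) then uses while streaming the file once; a small self-contained sketch of that idea:

import random

random.seed(0)
total, size = 10, 3
# Random line indices to keep; the real code streams the file once and
# writes only the lines whose index is in the sample.
sample = set(random.sample(range(total), size))
lines = ['line {}\n'.format(i) for i in range(total)]
subset = [line for idx, line in enumerate(lines) if idx in sample]
print(subset)
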
Example #24
def load_dataframe_in_chunks(data_file, chunksize):
    """Yield normalized scores dataframes from a chunked JSON lines file

    Use instead of load_dataframe if the data is too large to fit in memory.

    """
    with file_open(data_file) as dfile:
        for num, chunk in enumerate(grouper(dfile, chunksize)):
            data = []
            for line in chunk:
                try:
                    data.append(json.loads(line))
                except json.decoder.JSONDecodeError as err:
                    logger.error(line)
                    raise err
            logger.info("Processing chunk %s with %s lines", num, len(data))
            yield pd.DataFrame(json_normalize(data))
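
The grouper helper used above is not shown in this listing; a rough chunking sketch with itertools.islice (a hypothetical grouper_sketch with no fill values) illustrates the per-chunk processing:

import itertools

def grouper_sketch(iterable, n):
    """Yield lists of up to n items; a rough stand-in for the grouper helper."""
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, n))
        if not chunk:
            return
        yield chunk

for num, chunk in enumerate(grouper_sketch(range(7), 3)):
    print(num, chunk)  # 0 [0, 1, 2] / 1 [3, 4, 5] / 2 [6]
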
Example #25
 def write_probs(self,
                 input_fname,
                 output_fname,
                 true_label=None,
                 standardize=True,
                 chunksize=None):
     """Write classification probabilities to output file"""
     if chunksize:
         dfs_tbc = load_dataframe_in_chunks(input_fname, chunksize)
     else:
         dfs_tbc = [load_dataframe(input_fname)]
     logger.info("Classifier labels: %s", self.classifier.classes_)
     with file_open(output_fname, 'w') as output:
         for df_tbc in dfs_tbc:
             df = self.standardize(df_tbc) if standardize else df_tbc
             probas = self.classifier.predict_proba(df[self.features])
             if true_label:
                 true_labels = df_tbc[true_label]
                 logger.info('roc_auc: %s',
                             roc_auc_score(true_labels, probas[:, 1]))
             for proba in probas[:, 1]:
                 output.write('{0:.10f}\n'.format(proba))
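
self.classifier is assumed here to expose the scikit-learn interface (classes_, predict_proba); a toy sketch with LogisticRegression shows where the written probabilities come from:

import numpy as np
from sklearn.linear_model import LogisticRegression

# Tiny toy classifier trained on four one-feature examples.
X = np.array([[0.1], [0.2], [0.8], [0.9]])
y = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X, y)
probas = clf.predict_proba(X)
# Column 1 holds the probability of the positive class, which is what
# write_probs writes out, one value per line.
print(['{0:.10f}'.format(p) for p in probas[:, 1]])
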
Example #26
 def write_preds(self,
                 input_fname,
                 output_fname,
                 true_label=None,
                 standardize=True,
                 chunksize=None):
     """Write predicted class labels to output file"""
     if chunksize:
         dfs_tbc = load_dataframe_in_chunks(input_fname, chunksize)
     else:
         dfs_tbc = [load_dataframe(input_fname)]
     logger.info("Classifier labels: %s", self.classifier.classes_)
     with file_open(output_fname, 'w') as output:
         for df_tbc in dfs_tbc:
             df = self.standardize(df_tbc) if standardize else df_tbc
             labels = self.classifier.predict(df[self.features])
             if true_label:
                 true_labels = df_tbc[true_label]
                 logger.info('accuracy: %s',
                             accuracy_score(true_labels, labels))
                 logger.info('confusion matrix:\n%s',
                             confusion_matrix(true_labels, labels))
             for label in labels:
                 output.write('{}\n'.format(label))
Example #27
 def test_initialize_block_parser(self):
     bp = BlockParser(file_open(self.xml_path))
     bp.close_document()
Example #28
 def test_parse_line(self):
     bp = BlockParser(file_open(self.xml_path))
     line = bp.document.readline()
     bp.parse_line(line)
     bp.close_document()
Example #29
 def test_parse_document(self):
     bp = BlockParser(file_open(self.xml_path))
     blocks = bp.get_complete_blocks()
     while blocks:
         blocks = bp.get_complete_blocks()
     bp.close_document()
Example #30
 def test_parsing_alignment(self):
     bp = BlockParser(file_open(self.align_path))
     blocks = bp.get_complete_blocks()
     self.assertEqual(blocks[0].parent.name, 'linkGrp')
     self.assertEqual(blocks[0].attributes['xtargets'], 's1;s1')
     bp.close_document()