Example #1
    def _binarize_file_chunk(
        binarizer: Binarizer,
        filename: str,
        offset_start: int,
        offset_end: int,
        output_prefix: str,
        dataset_impl: str,
        vocab_size=None,
    ) -> tp.Tuple[tp.Any, BinarizeSummary]:  # (dataset builder, BinarizeSummary)
        """
        Creates a dataset builder and appends binarized items to it. This function does
        not finalize the builder; this is useful if you want to do other things with your
        bin file, like appending/merging other files.
        """
        bin_file = indexed_dataset.data_file_path(output_prefix)
        ds = indexed_dataset.make_builder(
            bin_file,
            impl=dataset_impl,
            vocab_size=vocab_size,
        )
        summary = BinarizeSummary()

        with Chunker(
            PathManager.get_local_path(filename), offset_start, offset_end
        ) as line_iterator:
            for line in line_iterator:
                ds.add_item(binarizer.binarize_line(line, summary))

        return ds, summary
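
The builder returned above is intentionally not finalized, so the caller is expected to finish it afterwards (or first merge in the .bin files produced by other chunks). A minimal sketch of that follow-up step, assuming fairseq's indexed_dataset helpers; output_prefix and other_prefix are illustrative names:

    # Sketch only: finish the builder produced by _binarize_file_chunk.
    ds, summary = _binarize_file_chunk(
        binarizer, filename, offset_start, offset_end, output_prefix, dataset_impl
    )
    # optionally merge the .bin file written by another worker into this one
    # ds.merge_file_(other_prefix)
    # write the companion .idx file and close the builder
    ds.finalize(indexed_dataset.index_file_path(output_prefix))
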
Example #2
 def _add_file_to_dictionary_single_worker(
     filename,
     tokenize,
     eos_word,
     start_offset,
     end_offset,
 ):
     counter = Counter()
     with Chunker(filename, start_offset, end_offset) as line_iterator:
         for line in line_iterator:
             for word in tokenize(line):
                 counter.update([word])
             counter.update([eos_word])
     return counter
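
This worker is meant to be run once per byte range of the input file. A small driver, sketched here with an illustrative name (count_words_in_parallel) rather than taken from fairseq, can use find_offsets to split the file into chunks, run one worker per chunk, and merge the per-chunk Counters:

    from collections import Counter
    from multiprocessing import Pool

    from fairseq.file_chunker_utils import find_offsets

    def count_words_in_parallel(filename, tokenize, eos_word, num_workers):
        # find_offsets returns num_workers + 1 byte offsets, so consecutive
        # pairs delimit the chunks handled by each worker
        offsets = find_offsets(filename, num_workers)
        chunks = list(zip(offsets, offsets[1:]))

        with Pool(num_workers) as pool:
            partial_counters = pool.starmap(
                _add_file_to_dictionary_single_worker,
                [(filename, tokenize, eos_word, start, end) for start, end in chunks],
            )

        # merge the per-chunk word counts into a single Counter
        merged = Counter()
        for partial in partial_counters:
            merged.update(partial)
        return merged
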
Example #3
    def test_readchunks(self):
        from fairseq.file_chunker_utils import Chunker, find_offsets

        offsets = find_offsets(self._tmpfile, self._num_splits)
        for start, end in zip(offsets, offsets[1:]):
            with Chunker(self._tmpfile, start, end) as lines:
                all_lines = list(lines)
                num_lines = self._num_lines / self._num_splits
                self.assertAlmostEqual(
                    len(all_lines), num_lines, delta=1
                )  # because we split on bytes, a chunk may end up with one more or one fewer line
                self.assertListEqual(
                    all_lines,
                    [self._line_content for _ in range(len(all_lines))])
Example #4
    def binarize_alignments(filename,
                            alignment_parser,
                            consumer,
                            offset=0,
                            end=-1) -> Dict[str, int]:
        nseq = 0

        with Chunker(PathManager.get_local_path(filename), offset,
                     end) as line_iterator:
            for line in line_iterator:
                ids = alignment_parser(line)
                nseq += 1
                consumer(ids)
        return {"nseq": nseq}
Example #5
    def binarize(
        filename,
        dict,
        consumer,
        tokenize=tokenize_line,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
    ) -> Dict[str, int]:
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with Chunker(PathManager.get_local_path(filename), offset,
                     end) as line_iterator:
            for line in line_iterator:
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }
Example #6
    def _consume_file(
        filename: str,
        binarizer: Binarizer,
        consumer: tp.Callable[[torch.IntTensor], None],
        offset_start: int,
        offset_end: int,
    ) -> tp.Dict[str, int]:
        summary = BinarizeSummary()

        with Chunker(
            PathManager.get_local_path(filename), offset_start, offset_end
        ) as line_iterator:
            for line in line_iterator:
                consumer(binarizer.binarize_line(line, summary))

        return {
            "nseq": summary.num_seq,
            "nunk": summary.num_replaced,
            "ntok": summary.num_tok,
            "replaced": summary.replaced,
        }