Esempio n. 1
0
    def _example_dict_iter(self, line, index):
        """
        Build the example dictionary for a single raw line of text.

        The line is split on whitespace, cut down to the first
        `self.line_truncate` tokens when that setting is truthy, and
        then passed to `TextDataset.extract_text_features`.

        Args:
            line (str): one raw line of the corpus.
            index: value stored under the "indices" key of the result.

        Returns:
            dict: `self.side` mapped to the extracted words and
            "indices" mapped to `index`; when features were extracted,
            also one "<side>_feat_<j>" entry per feature.
        """
        tokens = line.split()
        if self.line_truncate:
            tokens = tokens[:self.line_truncate]

        words, feats, n_feats = TextDataset.extract_text_features(tokens)
        example = {self.side: words, "indices": index}

        # Nothing more to add when the line carries no features.
        if not feats:
            return example

        # Every example is expected to have the same number of features;
        # `aeq` is handed the stored count and the count for this line.
        aeq(self.n_feats, n_feats)

        key_prefix = self.side + "_feat_"
        for position, feat in enumerate(feats):
            example[key_prefix + str(position)] = feat

        return example
Esempio n. 2
0
    def num_feats(self):
        """
        Peek at the next corpus line to find out its feature count.

        One line is read from the current position of `self.corpus`,
        tokenized (and truncated to `self.line_truncate` tokens when
        that setting is truthy), and given to
        `TextDataset.extract_text_features`. The stream is then moved
        back to where it was before the read.

        Returns:
            the feature count reported for that line, which is also
            stored in `self.n_feats`.
        """
        corpus = self.corpus
        start = corpus.tell()

        tokens = corpus.readline().split()
        if self.line_truncate:
            tokens = tokens[:self.line_truncate]
        _words, _feats, count = TextDataset.extract_text_features(tokens)
        self.n_feats = count

        # Undo the read so later consumers start from the same place.
        corpus.seek(start)

        return self.n_feats
Esempio n. 3
0
def get_num_features(src_data_type, corpus_file, side):
    """
    Count the features on one side of a corpus from its first line.

    Args:
        src_data_type (str): ['text'|'img'|'audio']
        corpus_file (str): path of the file to inspect.
        side (str): 'src' or 'tgt'

    Returns:
        0 when `side` is 'src' and the source data is not text (the
        file is not opened in that case); otherwise the feature count
        that `TextDataset.extract_text_features` reports for the first
        line of `corpus_file`.
    """
    assert side in ("src", "tgt")
    assert src_data_type in ('text', 'img', 'audio'), \
        "Data type not implemented"

    # The file is only read for the target side or a text source.
    reads_text = side != 'src' or src_data_type == 'text'
    if not reads_text:
        return 0  # no features for non-text

    with codecs.open(corpus_file, "r", "utf-8") as f:
        first_line = f.readline()
        tokens = first_line.strip().split()
        _words, _feats, n_feats = TextDataset.extract_text_features(tokens)
    return n_feats