Exemple #1
0
class Featurizer:
    """Read labelled words from a line stream and collect them in a DataSet.

    Each accepted line is turned into a ``Sample``, added to
    ``self.dataset`` and given a ``features`` dict by
    :meth:`featurize_sample`.

    Args:
        max_sample_per_class: forwarded unchanged to ``DataSet``.
        max_lines: maximum number of lines to consume from the stream;
            ``0`` (the default) disables the limit.
        skip_duplicates: forwarded unchanged to ``DataSet``.
        tolower: if true, words are lowercased before a sample is built.
        include_none_labels: if true, lines for which the label extractor
            returns ``None`` are kept with the label ``"OTHER"``; otherwise
            such lines are rejected.
        label_extractor: forwarded unchanged to ``LabelExtractor``.
    """

    def __init__(self, max_sample_per_class,
                 max_lines=0, skip_duplicates=True,
                 tolower=False, include_none_labels=False,
                 label_extractor=None):
        self.dataset = DataSet(
            max_sample_per_class=max_sample_per_class,
            skip_duplicates=skip_duplicates,
        )
        self.label_extractor = LabelExtractor(label_extractor)
        self._include_none_labels = include_none_labels
        self._max_lines = max_lines
        # Number of continue_reading() calls so far, i.e. lines looked at.
        self._line_cnt = 0
        self._tolower = tolower

    def featurize_stream(self, stream):
        """Consume lines from ``stream`` until a stop condition is met.

        Reading stops as soon as :meth:`continue_reading` returns a falsy
        value. Lines rejected with ``InvalidInput`` are skipped; they still
        count towards ``max_lines`` because the counter is advanced before
        the line is parsed.
        """
        for line in stream:
            # Truthiness test rather than ``is False`` so that an override
            # returning None/0 also stops the loop.
            if not self.continue_reading():
                break
            try:
                sample = self.extract_sample_from_line(line)
            except InvalidInput:
                # NOTE(review): extract_sample_from_line raises InvalidLine
                # and InvalidTag; presumably both derive from InvalidInput --
                # confirm where the exceptions are declared.
                continue
            self.dataset.add_sample(sample)
            self.featurize_sample(sample)

    def continue_reading(self):
        """Advance the line counter and report whether reading may go on.

        This method has a side effect: every call increments the internal
        line counter. It returns ``False`` once more than ``max_lines``
        calls have been made (when ``max_lines`` is positive) and otherwise
        returns ``not self.dataset.full``.
        """
        self._line_cnt += 1
        if self._max_lines > 0 and self._line_cnt > self._max_lines:
            return False
        return not self.dataset.full

    def extract_sample_from_line(self, line):
        """Parse one tab-separated ``word<TAB>tag[...]`` line into a Sample.

        Only the first two fields are used; any further fields are ignored.

        Raises:
            InvalidLine: the line is blank, contains ``UNKNOWN`` or ``??``
                anywhere, or has fewer than two tab-separated fields.
            InvalidTag: the label extractor returned ``None`` for the tag
                and ``include_none_labels`` is false.
        """
        # TODO this is WebCorpus specific, it should be in a separate class
        stripped = line.strip()
        if not stripped or 'UNKNOWN' in line or '??' in line:
            raise InvalidLine()
        fields = stripped.split('\t')
        if len(fields) < 2:
            raise InvalidLine("Not enough fields.")
        word, tag = fields[:2]
        label = self.label_extractor(tag)
        if label is None:
            if self._include_none_labels:
                label = "OTHER"
            else:
                raise InvalidTag()
        if self._tolower:
            word = word.lower()
        return Sample(word, label)

    def featurize_sample(self, sample):
        """Attach the feature dict to ``sample`` in place.

        The only feature is ``'word'``, set to ``sample.sample``.
        """
        sample.features = {'word': sample.sample}

    def get_samples(self):
        """Return ``self.dataset.samples``."""
        return self.dataset.samples

    @property
    def X(self):
        """Pass-through for ``self.dataset.X``."""
        return self.dataset.X

    @property
    def y(self):
        """Pass-through for ``self.dataset.y``."""
        return self.dataset.y