class Featurizer:
    """Turn tab-separated text lines into featurized samples.

    Each accepted line becomes a ``Sample`` that is handed to the
    underlying ``DataSet`` and then given a ``features`` mapping.
    """

    def __init__(self, max_sample_per_class, max_lines=0,
                 skip_duplicates=True, tolower=False,
                 include_none_labels=False, label_extractor=None):
        """Set up the backing dataset, the label extractor and the options.

        Args:
            max_sample_per_class: forwarded unchanged to ``DataSet``.
            max_lines: when greater than zero, upper bound on the number
                of lines consumed by ``featurize_stream``; zero or less
                disables the bound.
            skip_duplicates: forwarded unchanged to ``DataSet``.
            tolower: lowercase the word field of every sample.
            include_none_labels: keep samples for which the label
                extractor returns ``None``, labelling them ``"OTHER"``.
            label_extractor: forwarded unchanged to ``LabelExtractor``.
        """
        self._tolower = tolower
        self._include_none_labels = include_none_labels
        self._max_lines = max_lines
        self._line_cnt = 0
        self.dataset = DataSet(
            max_sample_per_class=max_sample_per_class,
            skip_duplicates=skip_duplicates,
        )
        self.label_extractor = LabelExtractor(label_extractor)

    def featurize_stream(self, stream):
        """Consume lines from ``stream`` until ``continue_reading`` says stop.

        Lines for which ``extract_sample_from_line`` raises
        ``InvalidInput`` are skipped.
        NOTE(review): presumably ``InvalidLine`` and ``InvalidTag`` derive
        from ``InvalidInput`` -- confirm against their definitions.
        """
        for raw_line in stream:
            if self.continue_reading() is False:
                break
            try:
                parsed = self.extract_sample_from_line(raw_line)
            except InvalidInput:
                continue
            else:
                # Same order as before: register the sample first,
                # attach its features afterwards.
                self.dataset.add_sample(parsed)
                self.featurize_sample(parsed)

    def continue_reading(self):
        """Count one more line and report whether reading may go on.

        Returns ``False`` once the line counter exceeds a positive
        ``max_lines``; otherwise the negation of ``self.dataset.full``.
        """
        self._line_cnt = self._line_cnt + 1
        limit = self._max_lines
        if limit > 0 and self._line_cnt > limit:
            return False
        return not self.dataset.full

    def extract_sample_from_line(self, line):
        """Parse one tab-separated line into a ``Sample``.

        The first two fields are taken as the word and the tag.

        Raises:
            InvalidLine: the line is blank, contains ``'UNKNOWN'`` or
                ``'??'``, or has fewer than two tab-separated fields.
            InvalidTag: the label extractor returned ``None`` and
                ``include_none_labels`` is off.
        """
        # TODO this is WebCorpus specific, it should be in a separate class
        stripped = line.strip()
        if not stripped or 'UNKNOWN' in line or '??' in line:
            raise InvalidLine()
        fields = stripped.split('\t')
        if len(fields) < 2:
            raise InvalidLine("Not enough fields.")
        word, tag = fields[:2]

        label = self.label_extractor(tag)
        if label is None:
            if not self._include_none_labels:
                raise InvalidTag()
            label = "OTHER"

        word = word.lower() if self._tolower else word
        return Sample(word, label)

    def featurize_sample(self, sample):
        """Attach a single-entry feature mapping (the word) to ``sample``."""
        sample.features = dict(word=sample.sample)

    def get_samples(self):
        """Return ``self.dataset.samples``."""
        return self.dataset.samples

    @property
    def X(self):
        """Pass-through to ``self.dataset.X``."""
        return self.dataset.X

    @property
    def y(self):
        """Pass-through to ``self.dataset.y``."""
        return self.dataset.y