def _create_space(cls, fname, **kwargs):
    """Create the space from a file of json

    Every item read from ``fname`` is expected to provide the keys
    ``'text'`` and ``'klass'``. The items are shuffled, a ``TextModel`` is
    fitted on at most the first 128000 of them, and one ``LinearSVC`` is
    trained per distinct ``'klass'`` value (that label against a
    sub-sample of the remaining ones).

    :param fname: Path to the file containing the json
    :type fname: str
    :param kwargs: Keywords pass to TextModel
    :return: the fitted text model, the coefficients and intercepts
             produced by ``linearSVC_array``, and the sorted labels
    :rtype: tuple
    """
    import random
    from .utils import linearSVC_array
    from collections import Counter
    try:
        from tqdm import tqdm
    except ImportError:
        # Progress reporting is optional; degrade to a pass-through.
        def tqdm(iterable, **_ignored):
            return iterable

    data = list(tweet_iterator(fname))
    random.shuffle(data)
    tm = TextModel(**kwargs).fit([item['text'] for item in data[:128000]])
    tm._num_terms = tm.model.num_terms

    # Labels in ascending order with their frequencies; this plays the role
    # of np.unique(..., return_counts=True) without requiring numpy.
    frequencies = sorted(Counter(item['klass'] for item in data).items(),
                         key=lambda pair: pair[0])
    klass = [label for label, _ in frequencies]
    nele = [count for _, count in frequencies]
    h = {label: position for position, label in enumerate(klass)}

    MODELS = []
    for ident, k in tqdm(enumerate(klass)):
        elepklass = [0] * len(klass)
        # Budget of negatives per other label so that they roughly match
        # the number of positives.
        # NOTE(review): raises ZeroDivisionError when only one label exists.
        cntpklass = int(nele[ident] / (len(klass) - 1))
        D = [(item, 1) for item in data if item['klass'] == k]
        for item in data:
            label = item['klass']
            if label == k:
                continue
            slot = h[label]
            # The comparison is strict, hence each label may contribute up
            # to cntpklass + 1 negatives.
            if elepklass[slot] > cntpklass:
                continue
            elepklass[slot] += 1
            D.append((item, -1))
        vectors = tm.tonp([tm[item['text']] for item, _ in D])
        targets = [target for _, target in D]
        MODELS.append(LinearSVC().fit(vectors, targets))

    coef, intercept = linearSVC_array(MODELS)
    return tm, coef, intercept, klass
class Corpus(BaseTextModel):
    """Text model using only words

    Keeps a vocabulary mapping each token to ``[index, count]``. The
    vocabulary grows (and counts are updated) only while the instance is in
    training mode, i.e., until :py:func:`fit` finishes.

    :param corpus: Optional collection of texts used to fit the model
    :param kwargs: Accepted for interface compatibility; not used here
    """

    def __init__(self, corpus=None, **kwargs):
        # Key holding the text when an item is a dict; taken from the TEXT
        # environment variable, 'text' when it is not set.
        self._text = os.getenv('TEXT', default='text')
        # token -> [index, count]
        self._m = {}
        self._num_terms = 0
        self._training = True
        # Inner model used for tokenizing and for building the final vectors.
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        """Return the text stored in the dict ``text`` under the configured key"""
        return text[self._text]

    def fit(self, c):
        """Build the vocabulary from ``c`` and leave training mode

        :param c: Iterable with the texts
        :return: the instance itself
        """
        for item in c:
            # Called only for its side effect on the vocabulary.
            self.__getitem__(item)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens registered so far"""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize ``text`` with the inner text model

        :param text: A text, a dict containing the text, or a list/tuple of
                     texts (whose tokens are concatenated)
        :return: the tokens as produced by the inner text model
        """
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        return [token
                for piece in text
                for token in self._textModel.tokenize(piece)]

    def transform(self, texts):
        """Convert text into a vector

        :param texts: List of text to be transformed
        :type texts: list
        :return: the result of the inner text model's ``tonp`` applied to
                 the ``[index, count]`` pairs of every text

        Example:

        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> model = Corpus(corpus)
        >>> X = model.transform(corpus)
        """
        return self._textModel.tonp(list(map(self.__getitem__, texts)))

    def __getitem__(self, d):
        """Return the ``[index, count]`` pairs for the tokens of ``d``

        In training mode unseen tokens are registered with count 1 and the
        stored count of known tokens is incremented; the pair returned for a
        known token carries the count it had before the increment. Outside
        training mode unseen tokens are skipped and nothing is modified.
        """
        pairs = []
        for token in self.tokenize(d):
            known = self._m.get(token)
            if known is None:
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[token] = [index, k]
                self._num_terms += 1
            else:
                index, k = known
                if self._training:
                    self._m[token] = [index, k + 1]
            pairs.append([index, k])
        return pairs