Esempio n. 1
0
    def _create_space(cls, fname, **kwargs):
        """Create the space from a file of json

        :param fname: Path to the file containing the json
        :type fname: str
        :param kwargs: Keywords pass to TextModel
        """
        import random
        from .utils import linearSVC_array
        from collections import Counter
        try:
            from tqdm import tqdm
        except ImportError:

            def tqdm(x, **kwargs):
                return x

        data = [x for x in tweet_iterator(fname)]
        random.shuffle(data)
        tm = TextModel(**kwargs).fit([x['text'] for x in data[:128000]])
        tm._num_terms = tm.model.num_terms
        # klass, nele = np.unique([x['klass'] for x in data], return_counts=True)
        _ = [(k, v) for k, v in Counter([x['klass'] for x in data]).items()]
        _.sort(key=lambda x: x[0])
        klass = [x[0] for x in _]
        nele = [x[1] for x in _]
        h = {v: k for k, v in enumerate(klass)}
        MODELS = []
        for ident, k in tqdm(enumerate(klass)):
            elepklass = [0 for __ in klass]
            cnt = nele[ident]
            cntpklass = int(cnt / (len(klass) - 1))
            D = [(x, 1) for x in data if x['klass'] == k]
            for x in data:
                if x['klass'] == k:
                    continue
                if elepklass[h[x['klass']]] > cntpklass:
                    continue
                elepklass[h[x['klass']]] = elepklass[h[x['klass']]] + 1
                D.append((x, -1))
            m = LinearSVC().fit(tm.tonp([tm[x[0]['text']] for x in D]),
                                [x[1] for x in D])
            MODELS.append(m)
        coef, intercept = linearSVC_array(MODELS)
        return tm, coef, intercept, klass
Esempio n. 2
0
class Corpus(BaseTextModel):
    """Text model using only words"""
    def __init__(self, corpus=None, **kwargs):
        self._text = os.getenv('TEXT', default='text')
        self._m = {}
        self._num_terms = 0
        self._training = True
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        return text[self._text]

    def fit(self, c):
        [self.__getitem__(x) for x in c]
        self._training = False
        return self

    @property
    def num_terms(self):
        return self._num_terms

    def tokenize(self, text):
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._textModel.tokenize(_text))
            return tokens
        else:
            return self._textModel.tokenize(text)

    def transform(self, texts):
        """Convert test into a vector

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self._textModel.tonp([self.__getitem__(x) for x in texts])

    def __getitem__(self, d):
        tokens = []
        for t in self.tokenize(d):
            try:
                index, k = self._m[t]
                if self._training:
                    self._m[t] = [index, k + 1]
            except KeyError:
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[t] = [index, k]
                self._num_terms += 1
            tokens.append([index, k])
        return tokens