Ejemplo n.º 1
0
class Corpus(BaseTextModel):
    """Bag-of-words text model built on a word-level :class:`TextModel`."""

    def __init__(self, corpus, **kwargs):
        # Key under which dict-like records store their raw text
        # (configurable through the TEXT environment variable).
        self._text = os.getenv('TEXT', default='text')
        self._m = {}            # token -> [index, frequency]
        self._num_terms = 0     # vocabulary size accumulated so far
        self._training = True   # while True, unseen tokens extend the vocabulary
        self._textModel = TextModel([''], token_list=[-1])
        self.fit(corpus)

    def get_text(self, text):
        """Return the text field of a dict-like record."""
        return text[self._text]

    def fit(self, c):
        """Learn the vocabulary from corpus *c*.

        Returns the tokenized corpus (list of [index, frequency] lists)
        and freezes the vocabulary afterwards.
        """
        tokenized = [self[doc] for doc in c]
        self._training = False
        return tokenized

    @property
    def num_terms(self):
        """Number of distinct terms seen during training."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize a str, dict, or list/tuple of texts into word tokens."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        tokens = []
        for part in text:
            tokens.extend(self._textModel.tokenize(part))
        return tokens

    def __getitem__(self, d):
        """Map document *d* to a list of [index, frequency] pairs.

        While training, unseen tokens are added to the vocabulary and
        counts are incremented; afterwards unknown tokens are skipped.
        Note the pair appended uses the count *before* the increment,
        matching the original accounting.
        """
        pairs = []
        for tok in self.tokenize(d):
            entry = self._m.get(tok)
            if entry is None:
                if not self._training:
                    continue  # unknown token after training: ignore
                index, k = self._num_terms, 1
                self._m[tok] = [index, k]
                self._num_terms += 1
            else:
                index, k = entry
                if self._training:
                    self._m[tok] = [index, k + 1]
            pairs.append([index, k])
        return pairs
Ejemplo n.º 2
0
def test_lang():
    """Spanish pipeline: grouped stopwords, negation and stemming."""
    from b4msa.textmodel import TextModel

    corpus = ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"]
    params = {
        "del_dup": True,
        "emo_option": "group",
        "lc": True,
        "negation": True,
        "num_option": "group",
        "stemming": True,
        "stopwords": "group",
        "del_diac": False,
        "token_list": [-1],
        "url_option": "group",
        "usr_option": "group",
        "lang": "spanish",
    }
    model = TextModel(corpus, **params)
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    a = model.tokenize(text)
    b = [
        '_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
        'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda'
    ]
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
Ejemplo n.º 3
0
def test_negations_italian():
    """Italian negation: 'non' fuses into the following token as 'no_'."""
    from b4msa.textmodel import TextModel

    params = {
        'num_option': 'group',
        'del_diac': False,
        'stopwords': 'delete',
        'negation': True,
        'stemming': True,
        'lc': False,
        'token_list': [-1],
        'usr_option': 'group',
        'del_dup': False,
        'emo_option': 'group',
        'lang': 'italian',
        'url_option': 'delete',
    }
    model = TextModel(["XD"], **params)

    text = """@User Come non condividere; me ne frega niente"""
    a = model.tokenize(text)
    print("Input:", text)
    print("Output:", a)
    b = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert a == b
Ejemplo n.º 4
0
def test_negations():
    """Spanish negation handling on a tweet-like sentence."""
    from b4msa.textmodel import TextModel

    params = {
        'num_option': 'group',
        'del_diac': False,
        'stopwords': 'delete',
        'negation': True,
        'stemming': True,
        'lc': False,
        'token_list': [-1],
        'usr_option': 'group',
        'del_dup': False,
        'emo_option': 'group',
        'lang': 'spanish',
        'url_option': 'delete',
    }
    model = TextModel(["el alma de la fiesta XD"], **params)

    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    a = model.tokenize(text)
    b = [
        '_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech',
        'no_respect'
    ]
    print(a, b)
    assert a == b
Ejemplo n.º 5
0
def test_lang():
    """Spanish pipeline smoke test: tokenization with grouped stopwords."""
    from b4msa.textmodel import TextModel

    corpus = [
        "Hi :) :P XD",
        "excelente dia xc",
        "el alma de la fiesta XD",
    ]
    model = TextModel(corpus,
                      del_dup=True,
                      emo_option="group",
                      lc=True,
                      negation=True,
                      num_option="group",
                      stemming=True,
                      stopwords="group",
                      del_diac=False,
                      token_list=[-1],
                      url_option="group",
                      usr_option="group",
                      lang="spanish")
    text = "El alma de la fiesta :) conociendo la maquinaria @user bebiendo nunca manches que onda"
    print(model.tokenize)
    a = model.tokenize(text)
    b = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
         'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
Ejemplo n.º 6
0
def test_negations_italian():
    """Italian negation handling ('non' -> 'no_' prefix on the next token)."""
    from b4msa.textmodel import TextModel

    model = TextModel(["XD"],
                      num_option='group',
                      del_diac=False,
                      stopwords='delete',
                      negation=True,
                      stemming=True,
                      lc=False,
                      token_list=[-1],
                      usr_option='group',
                      del_dup=False,
                      emo_option='group',
                      lang='italian',
                      url_option='delete')

    text = """@User Come non condividere; me ne frega niente"""
    a = model.tokenize(text)
    print("Input:", text)
    print("Output:", a)
    b = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert a == b
Ejemplo n.º 7
0
    def tokenize(self, text):
        """Split *text* into tokens.

        Dict inputs are first reduced to their text field; lists/tuples
        are tokenized element-wise and the token lists concatenated.

        :param text: text to tokenize
        :type text: dict, str, list or tuple
        """
        if isinstance(text, dict):
            text = self.get_text(text)
        # Plain string: delegate straight to the parent tokenizer.
        if not isinstance(text, (list, tuple)):
            return TextModel.tokenize(self, text)
        tokens = []
        for part in text:
            tokens.extend(TextModel.tokenize(self, part))
        return tokens
Ejemplo n.º 8
0
def test_negations():
    """Spanish negation: 'nunca'/'no' fuse into the following stem."""
    from b4msa.textmodel import TextModel

    model = TextModel(["el alma de la fiesta XD"],
                      num_option='group',
                      del_diac=False,
                      stopwords='delete',
                      negation=True,
                      stemming=True,
                      lc=False,
                      token_list=[-1],
                      usr_option='group',
                      del_dup=False,
                      emo_option='group',
                      lang='spanish',
                      url_option='delete')

    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    a = model.tokenize(text)
    b = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech', 'no_respect']
    print(a, b)
    assert a == b
Ejemplo n.º 9
0
def test_lang():
    """Portuguese-language pipeline smoke test.

    NOTE(review): the original version ended with ``print a`` — Python 2
    print-statement syntax, a SyntaxError under Python 3 — and carried an
    expected-token list copied verbatim from the Spanish example that was
    never asserted.  The stale list and dead commented-out code have been
    removed; this test only verifies that tokenization runs.
    TODO: compute and assert the real expected Portuguese tokens.
    """
    from b4msa.textmodel import TextModel

    text = [
        "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    ]

    model = TextModel(
        text,
        **{
            "del_dup1": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "strip_diac": True,
            "token_list": [-1],
            "url_option": "group",
            "usr_option": "group",
            "lang": "portuguese",
        })
    text = "vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("
    a = model.tokenize(text)
    # Was `print a` (Python 2 syntax); fixed to the print() function.
    print(a)
Ejemplo n.º 10
0
ci

# Load the SemEval-2017 English training set and fit a word-level text model.
D = list(tweet_iterator('../../../datasets/semeval/semeval2017_En_train.json'))
tm = TextModel(token_list=[-1]).fit(D)

# Invert the vocabulary (id -> word) and map each word to its weight
# (presumably IDF — the output filename below suggests so; confirm
# against the TextModel implementation).
id2word = {v:k for k, v in tm.model.word2id.items()}
_ = {id2word[k]:v for k, v in tm.model.wordWeight.items()}

# Word cloud of the per-word weights.
wc = WC().generate_from_frequencies(_)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_idf.png', dpi=300)

# Word cloud of raw term frequencies over the same corpus.
cnt = Counter()
_ = [cnt.update(tm.tokenize(x)) for x in D]
wc = WC().generate_from_frequencies(cnt)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_tf.png', dpi=300)


# Collect per-dataset performance numbers for each algorithm into a 2-D
# array: rows are algorithms, columns are datasets (order taken from the
# first algorithm's entry).
perf = load_model('dataset/performance.gz')
algs = list(perf.keys())
datasets = list(perf[algs[0]][1].keys())
data = []
for alg in algs:
    _ = [perf[alg][1][dataset] for dataset in datasets]
    data.append(_)
data = np.array(data)
Ejemplo n.º 11
0
class Corpus(BaseTextModel):
    """Text model using only words"""

    def __init__(self, corpus=None, **kwargs):
        # Field name holding the text in dict records (env-configurable).
        self._text = os.getenv('TEXT', default='text')
        self._m = {}            # token -> [index, frequency]
        self._num_terms = 0     # vocabulary size accumulated so far
        self._training = True   # while True, unseen tokens extend the vocabulary
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        """Return the text field of a dict-like record."""
        return text[self._text]

    def fit(self, c):
        """Learn the vocabulary from corpus *c* and return ``self``."""
        for doc in c:
            self[doc]  # indexing for its side effect: builds the vocabulary
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct terms seen during training."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize a str, dict, or list/tuple of texts into word tokens."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if not isinstance(text, (list, tuple)):
            return self._textModel.tokenize(text)
        tokens = []
        for part in text:
            tokens.extend(self._textModel.tokenize(part))
        return tokens

    def transform(self, texts):
        """Convert test into a vector

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self._textModel.tonp([self[x] for x in texts])

    def __getitem__(self, d):
        """Map document *d* to a list of [index, frequency] pairs.

        While training, unseen tokens are added to the vocabulary and
        counts incremented; afterwards unknown tokens are skipped.  The
        appended pair carries the count *before* the increment, matching
        the original accounting.
        """
        pairs = []
        for tok in self.tokenize(d):
            entry = self._m.get(tok)
            if entry is None:
                if not self._training:
                    continue  # unknown token after training: ignore
                index, k = self._num_terms, 1
                self._m[tok] = [index, k]
                self._num_terms += 1
            else:
                index, k = entry
                if self._training:
                    self._m[tok] = [index, k + 1]
            pairs.append([index, k])
        return pairs
Ejemplo n.º 12
0
# Stem every word of `text` (stemmer and text are defined earlier in the file).
output = " ".join(stemmer.stem(word) for word in text.split())
output

# Word n-grams (n = 3), tokens joined with '~'.
text = 'I like playing football on Saturday'
words = text.split()
n = 3
n_grams = ["~".join(gram) for gram in zip(*[words[i:] for i in range(n)])]
n_grams

# Character q-grams (q = 4) over the raw string.
text = 'I like playing'
q = 4
q_grams = ["".join(gram) for gram in zip(*[text[i:] for i in range(q)])]
q_grams

# Text model with word tokens and 4-/5-char q-grams, English stemming,
# and grouped @user mentions.
text = 'I like playing football with @mgraffg'
tm = TextModel(token_list=[-1, 5],
               lang='english',
               usr_option=OPTION_GROUP,
               stemming=True)
tm.text_transformations(text)

tm.tokenize(text)