Example #1
def test_counter_json():
    from microtc.utils import Counter
    c1 = Counter()
    c1.update(list(map(str, range(10))))
    print(c1)
    js = c1.tojson()
    print(js)
    c2 = Counter.fromjson(js)
    assert c1.update_calls == c2.update_calls
    print(c2)
    for x, v in c1.items():
        print(x, v, c2[x])
        assert x in c2 and v == c2[x]
Example #2
def test_counter_sub():
    from microtc.utils import Counter

    c1 = Counter()
    c1.update(range(10))
    c2 = Counter()
    c2.update(range(5, 15))
    r = c1 + c2
    re = r - c1
    print(re)
    assert isinstance(re, Counter)
    for k, v in re.items():
        assert c2[k] == v
    for k, v in c2.items():
        assert re[k] == v
    assert re.update_calls == 1
Example #3
def test_tfidf_corpus():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    import os
    import numpy as np
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    for k in tfidf2.word2id.keys():
        assert k in tfidf.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
Example #4
    def available_dates(dates=List, n=int, countries=List, lang=str):
        """Retrieve the first n dates available for all the countries

        :param dates: List of dates
        :param n: Number of days
        :param countries: List of countries
        :param lang: Language
        """

        missing = Counter(countries) if countries != 'nogeo' else None
        rest = []
        dates = dates[::-1]
        while len(dates) and (len(rest) < n or n == -1):
            day = dates.pop()
            flag = True
            pairs = missing.most_common() if missing is not None else [(None, None)]
            for country, _ in pairs:
                try:
                    download_tokens(
                        day,
                        lang=lang,
                        country=country if country is not None else 'nogeo')
                except Exception:
                    flag = False
                    if missing is not None:
                        missing.update([country])
                    break
            if flag:
                rest.append(day)
        return rest
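
The docstring above describes the intent: walk the dates and keep only the days for which tokens can be downloaded for every requested country, using a Counter of past failures so the most problematic country is tried first. Below is a minimal, self-contained sketch of that prioritization idea; the download_tokens stand-in and the sample availability table are hypothetical and exist only to make the sketch runnable (the real download_tokens comes from the surrounding project).

from collections import Counter

# Hypothetical availability table used only for this sketch.
available = {"220101": {"MX", "US"}, "220102": {"MX"}, "220103": {"MX", "US"}}

def download_tokens(day, lang="Es", country="nogeo"):
    # Stand-in for the project's downloader: fail when a country lacks data.
    if country not in available.get(day, set()):
        raise FileNotFoundError(f"{country} not available on {day}")

def available_dates(dates, n, countries, lang):
    missing = Counter(countries)       # countries that fail most are tried first
    rest = []
    dates = dates[::-1]
    while len(dates) and (len(rest) < n or n == -1):
        day = dates.pop()
        flag = True
        for country, _ in missing.most_common():
            try:
                download_tokens(day, lang=lang, country=country)
            except Exception:
                flag = False
                missing.update([country])   # remember the failure for later days
                break
        if flag:
            rest.append(day)
    return rest

print(available_dates(["220103", "220102", "220101"], n=2,
                      countries=["MX", "US"], lang="Es"))
# ['220103', '220101'] -- only the days where every country had data
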
Example #5
def test_counter_add():
    from microtc.utils import Counter

    c1 = Counter()
    c1.update(range(10))
    c2 = Counter()
    c2.update(range(5, 15))
    r = c1 + c2
    print(r)
    assert isinstance(r, Counter)
    for i in range(5):
        assert r[i] == 1
    for i in range(5, 10):
        assert r[i] == 2
    for i in range(10, 15):
        assert r[i] == 1
    assert r.update_calls == 2
Example #6
    def fit(self, X: List[Union[str, dict]]) -> 'BagOfWords':
        """ Train the Bag of words model"""

        from microtc.utils import Counter
        cnt = Counter()
        tokens = self.tokenize.transform([x for x in X])
        [cnt.update(x) for x in tokens]
        self._tfidf = TFIDF.counter(cnt)
        return self
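
The fit method above tokenizes the corpus, accumulates a vocabulary Counter with one update call per document, and hands that Counter to TFIDF.counter. The sketch below replays the same flow directly, using microtc's TextModel as the tokenizer; the sample documents and the token_list setting are assumptions made only for illustration, since the BagOfWords class itself is not shown here.

from microtc.textmodel import TextModel
from microtc.utils import Counter
from microtc.weighting import TFIDF

docs = ["buenos dias", "hola mundo", "adios mundo"]   # made-up sample corpus
tm = TextModel(token_list=[-1])                       # word tokens only (assumption)
cnt = Counter()
for d in docs:
    cnt.update(tm.tokenize(d))    # one update call per document, as in fit
tfidf = TFIDF.counter(cnt)        # same construction used for self._tfidf above
print(tfidf.num_terms, cnt.update_calls)
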
Example #7
def test_counter():
    from microtc.utils import Counter, save_model, load_model
    import os
    c = Counter()
    c.update([1, 2, 3, 1])
    c.update([3])
    assert c[1] == 2
    print(c.update_calls)
    assert c.update_calls == 2
    save_model(c, "t.voc")
    cc = load_model("t.voc")
    os.unlink("t.voc")
    print(cc.update_calls, "**")
    assert cc.update_calls == 2
Example #8
def test_tfidf_corpus2():
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    import os
    import numpy as np
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    tm = TextModel(token_list=[-1, 3])
    docs = [tm.tokenize(d) for d in docs]
    counter = Counter()
    [counter.update(set(x)) for x in docs]
    tfidf = TFIDF(docs, token_min_filter=1)
    tfidf2 = TFIDF.counter(counter, token_min_filter=1)
    id2w2 = {v: k for k, v in tfidf2.word2id.items()}
    for text in docs:
        tokens = tm.tokenize(text)
        fm = {k: v for k, v in tfidf[tokens]}
        for k, v in tfidf2[tokens]:
            assert_almost_equals(fm[tfidf.word2id[id2w2[k]]], v)
Example #9
    def __init__(self,
                 tokenizer: Callable[[Union[str, dict]], Iterable[str]]) -> None:
        self._tokenizer = tokenizer
        self._counter = Counter()