def test_counter_json():
    """A Counter must round-trip through tojson/fromjson without losing
    counts or the update_calls bookkeeping."""
    from microtc.utils import Counter

    original = Counter()
    original.update([str(i) for i in range(10)])
    print(original)
    serialized = original.tojson()
    print(serialized)
    restored = Counter.fromjson(serialized)
    assert original.update_calls == restored.update_calls
    print(restored)
    # Every key/count pair must survive the round trip.
    for key, count in original.items():
        print(key, count, restored[key])
        assert key in restored and count == restored[key]
def test_counter_sub():
    """Subtracting one operand from a Counter sum must recover the other
    operand, including its update_calls count."""
    from microtc.utils import Counter

    first = Counter()
    first.update(range(10))
    second = Counter()
    second.update(range(5, 15))
    total = first + second
    diff = total - first
    print(diff)
    assert isinstance(diff, Counter)
    # diff and second must agree in both directions (same keys, same counts).
    for key in diff:
        assert second[key] == diff[key]
    for key in second:
        assert diff[key] == second[key]
    assert diff.update_calls == 1
def test_tfidf_corpus():
    """TFIDF built from tokenized documents must match TFIDF built from a
    document-frequency Counter (same vocabulary size, doc count, weights)."""
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    # BUG FIX: `join` was called without being imported in this scope.
    from os.path import join
    import os

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(token_list=[-1, 3])
    docs = [text.tokenize(d) for d in docs]
    counter = Counter()
    # Document frequency: each token counted at most once per document.
    for tokens in docs:
        counter.update(set(tokens))
    tfidf = TFIDF(docs)
    tfidf2 = TFIDF.counter(counter)
    assert tfidf.num_terms == tfidf2.num_terms
    assert tfidf._ndocs == tfidf2._ndocs
    # BUG FIX: the original asserted `k in tfidf2.word2id` while iterating
    # tfidf2.word2id itself (tautology); the vocabularies must cross-match.
    for k in tfidf2.word2id.keys():
        assert k in tfidf.word2id
    for k, v in tfidf.word2id.items():
        id2 = tfidf2.word2id[k]
        v = tfidf.wordWeight[v]
        v2 = tfidf2.wordWeight[id2]
        print(v, v2, k)
        assert_almost_equals(v, v2)
def available_dates(dates=List, n=int, countries=List, lang=str):
    """Retrieve the first n dates available for all the countries.

    :param dates: List of dates, most recent last
    :param n: Number of days to collect (-1 means all available)
    :param countries: List of countries, or the string 'nogeo'
    :param lang: Language
    :return: List of days for which download_tokens succeeded for every
        country (countries that ever failed are retried first)
    """
    # BUG FIX: the original used `countries is not 'nogeo'` — an identity
    # comparison against a string literal, which is implementation-dependent
    # and raises SyntaxWarning on modern CPython. Equality is intended.
    missing = Counter(countries) if countries != 'nogeo' else None
    rest = []
    dates = dates[::-1]
    while len(dates) and (len(rest) < n or n == -1):
        day = dates.pop()
        flag = True
        # Try the most-frequently-failing countries first so a missing day
        # is rejected as early as possible. (Renamed from `iter`, which
        # shadowed the builtin.)
        candidates = (missing.most_common() if missing is not None
                      else [[None, None]])
        for country, _ in candidates:
            try:
                download_tokens(
                    day, lang=lang,
                    country=country if country is not None else 'nogeo')
            except Exception:
                # Day unavailable for this country: skip the day and record
                # the failure so this country is probed first next time.
                flag = False
                if missing is not None:
                    missing.update([country])
                break
        if flag:
            rest.append(day)
    return rest
def test_counter_add():
    """Adding two Counters must merge counts element-wise and sum the
    update_calls of both operands."""
    from microtc.utils import Counter

    left = Counter()
    left.update(range(10))
    right = Counter()
    right.update(range(5, 15))
    merged = left + right
    print(merged)
    assert isinstance(merged, Counter)
    # Overlap [5, 10) appears in both operands; the rest appears once.
    expected = {i: 2 if 5 <= i < 10 else 1 for i in range(15)}
    for key, count in expected.items():
        assert merged[key] == count
    assert merged.update_calls == 2
def fit(self, X: List[Union[str, dict]]) -> 'BagOfWords':
    """Train the Bag of Words model.

    :param X: Corpus; each element is a text (or a dict the tokenizer
        understands)
    :return: self, so calls can be chained
    """
    from microtc.utils import Counter
    cnt = Counter()
    # IDIOM FIX: the original copied X with `[x for x in X]` (a plain
    # list() copy) and updated the counter via a side-effect list
    # comprehension that built a throwaway list of None.
    tokens = self.tokenize.transform(list(X))
    for token_seq in tokens:
        cnt.update(token_seq)
    self._tfidf = TFIDF.counter(cnt)
    return self
def test_counter():
    """A Counter must survive a save_model/load_model round trip,
    preserving counts and the update_calls counter."""
    from microtc.utils import Counter, save_model, load_model
    import os

    counter = Counter()
    counter.update([1, 2, 3, 1])
    counter.update([3])
    assert counter[1] == 2
    print(counter.update_calls)
    assert counter.update_calls == 2
    save_model(counter, "t.voc")
    loaded = load_model("t.voc")
    # Remove the temporary file before asserting so it is cleaned up
    # even when the final checks fail.
    os.unlink("t.voc")
    print(loaded.update_calls, "**")
    assert loaded.update_calls == 2
def test_tfidf_corpus2():
    """With token_min_filter=1, TFIDF from documents and TFIDF.counter must
    assign the same weight to every token of every document."""
    from nose.tools import assert_almost_equals
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import Counter
    from microtc.utils import tweet_iterator
    # BUG FIX: `join` was called without being imported in this scope.
    from os.path import join
    import os

    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    tm = TextModel(token_list=[-1, 3])
    docs = [tm.tokenize(d) for d in docs]
    counter = Counter()
    # Document frequency: each token counted at most once per document.
    for tokens in docs:
        counter.update(set(tokens))
    tfidf = TFIDF(docs, token_min_filter=1)
    tfidf2 = TFIDF.counter(counter, token_min_filter=1)
    # Map tfidf2's term ids back to tokens so weights can be compared
    # through tfidf's id space.
    id2w2 = {v: k for k, v in tfidf2.word2id.items()}
    for text in docs:
        # NOTE(review): `text` is already a token list here; tokenize is
        # re-applied as in the original — presumably tokenize is
        # idempotent on token lists. TODO confirm.
        tokens = tm.tokenize(text)
        fm = {k: v for k, v in tfidf[tokens]}
        for k, v in tfidf2[tokens]:
            assert_almost_equals(fm[tfidf.word2id[id2w2[k]]], v)
def __init__(
        self,
        tokenizer: Callable[[Union[str, dict]], Iterable[str]]) -> None:
    """Keep the tokenizer callback and start with an empty term Counter."""
    self._counter = Counter()
    self._tokenizer = tokenizer