Example #1
    def process(self, ind, path):
        tok = Tokenizer()
        tok.tokenize_html(path)
        self._titles.append(tok.title or u'Untitled')
        for token, freq in tok._counter.items():
            # Normalize by unique tokens.
            self.globalht[token].append([ind, freq / len(tok._counter)])
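The method above relies on indexer state that is not shown. A minimal sketch of what the host class might initialize, with only the attribute names taken from the snippet and everything else assumed:

from collections import defaultdict

class Indexer:
    # Hypothetical host class for the process() method above; the real class
    # and its other methods are not shown in the excerpt.
    def __init__(self):
        # token -> list of [doc_index, normalized_frequency] postings
        self.globalht = defaultdict(list)
        # doc_index -> page title, appended in processing order
        self._titles = []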
Example #2
    def __init__(self, ds_name='nlvr2', split='train', task='ispeaker'):
        self.ds_name = ds_name
        self.split = split
        self.data = json.load(
            open(os.path.join(DATA_ROOT, self.ds_name, self.split + ".json"))
        )

        self.tok = Tokenizer()
        self.tok.load(os.path.join(DATA_ROOT, self.ds_name, "vocab.txt"))

        self.feat_mean = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_mean.npy'))
        self.feat_std = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_std.npy'))
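A hedged usage sketch for the constructor above. Only the constructor arguments and attributes come from the snippet; the class name SpeakerDataset and the presence of DATA_ROOT/nlvr2/train.json are assumptions.

# SpeakerDataset is a guessed name for the class that owns the __init__ above.
dataset = SpeakerDataset(ds_name='nlvr2', split='train', task='ispeaker')
print(len(dataset.data), "examples loaded")
print("feature mean shape:", dataset.feat_mean.shape)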
Example #3
    def top(self, query, num):
        tok = Tokenizer()
        tok.tokenize(query)

        acc = Counter()
        for token in tok._counter:
            dict_entry = self.dict_entry(token)
            print(token, dict_entry)
            if not dict_entry:
                continue
            post_entries = self.post_entries(dict_entry[1], dict_entry[2])
            # Scale by the token's query frequency so we don't open posting files
            # multiple times for repeated tokens (e.g., dog cat dog cat dog).
            for docid in post_entries:
                post_entries[docid] *= tok._counter[token]
            acc.update(post_entries)
        return acc.most_common(num), len(acc)
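A hedged sketch of calling top() once the owning index object has been built; engine stands in for that object, whose construction is not shown in the excerpt.

# Query the index for the ten highest-scoring documents.
results, matched = engine.top("dog cat dog", 10)
for docid, score in results:
    print(docid, score)
print(matched, "documents matched")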
Example #4
class TestTokenizerGenAlphaDigit(unittest.TestCase):
    # Tests for Tokenizer.tokenize_gen_alpha_digit
    def setUp(self):
        self.t = Tokenizer()

    def test_empty_string(self):
        s = list(self.t.tokenize_gen_alpha_digit(''))
        self.assertEqual(len(s), 0)
        self.assertEqual(s, [])

    def test_no_spaces(self):
        s = list(self.t.tokenize_gen_alpha_digit('мамамылараму'))
        self.assertEqual(s, [Token('мамамылараму', 'alpha', 0, 12)])

    def test_first_alpha(self):
        s = list(self.t.tokenize_gen_alpha_digit('мама мыла'))
        self.assertEqual(s[0], Token("мама", "alpha", 0, 4))
        self.assertEqual(len(s), 4)
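The assertions above (and in Example #5 below) compare against a Token value whose definition is not included in the excerpt. A minimal stand-in consistent with how the tests construct it; the field names are guesses, only the positional order (text, kind, start offset, end offset) is inferred from the assertions.

from collections import namedtuple

# Field names are assumptions; only the positional order is taken from the tests.
Token = namedtuple('Token', ['text', 'kind', 'start', 'end'])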
Example #5
class TestTokenizerGen(unittest.TestCase):
    def setUp(self):
        self.t = Tokenizer()

    def test_last_nonalpha(self):
        s = list(self.t.tokenize_gen('мамамылараму2'))
        self.assertEqual(len(s), 12)
        self.assertEqual(s[11], Token('мамамылараму', 'alpha', 0, 13))

    def test_first_alpha(self):
        s = list(self.t.tokenize_gen('я иду в кино'))
        self.assertEqual(s[0], Token("я", "alpha", 0, 1))
        self.assertEqual(len(s), 7)

    def test_empty_string(self):
        s = list(self.t.tokenize_gen(''))
        self.assertEqual(len(s), 0)
        self.assertEqual(s, [])

    def test_no_spaces(self):
        s = list(self.t.tokenize_gen('яидувкиноcinema'))
        self.assertEqual(s, [Token('яидувкиноcinema', 'alpha', 0, 15)])
        self.assertEqual(len(s), 1)

    def test_digital_string(self):
        s = list(self.t.tokenize_gen('012345'))
        self.assertEqual(len(s), 1)
        self.assertEqual(s, [Token('012345', 'digit', 0, 6)])

    def test_first_nonalpha(self):
        s = list(self.t.tokenize_gen('!!!!я иду в кино cinema'))
        self.assertEqual(len(s), 10)
        self.assertEqual(s[0], Token('!!!!', 'punct', 0, 4))

    def test_middle_nonalpha(self):
        s = list(self.t.tokenize_gen('я иду в кино00000 111 00000cinema'))
        self.assertEqual(len(s), 13)
        self.assertEqual(s[7], Token('00000', 'digit', 12, 17))
        self.assertEqual(s[9], Token('111', 'digit', 18, 21))
Example #6
import sys
from tok import Tokenizer

tok = Tokenizer()
for review in sys.stdin:
    print(' '.join(tok.tokenize(review)))
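This filter reads raw reviews from standard input and writes one line of space-joined tokens per review to standard output, so it can be used in a shell pipeline, e.g. python tokenize_reviews.py < reviews.txt (the script and file names here are hypothetical).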
Example #7
import json
import os

from tok import Tokenizer  # import path assumed; the original import is not shown

DATASETS = [
    'nlvr2',
    'spotdiff',
    'adobe',
]

ds_root = "../dataset/"
for ds_name in DATASETS:
    print("Processing dataset %s" % ds_name)

    dataset = []
    for split_name in ['train', 'valid']:
        dataset.extend(
            json.load(
                open(os.path.join(ds_root, ds_name, split_name + ".json"))))
        print("Finish Loading split %s" % split_name)
    print("Number of data is %d." % len(dataset))
    sents = sum(map(lambda x: x["sents"], dataset), [])
    print("Number of sents is %d." % len(sents))

    tok = Tokenizer()
    tok.build_vocab(sents, min_occur=3)
    tok.dump(os.path.join(ds_root, ds_name, "vocab.txt"))

    wordXnum = list(tok.occur.items())
    wordXnum = sorted(wordXnum, key=lambda x: x[1], reverse=True)
    N = 50
    print("Top %d Words:" % N)
    for word, num in wordXnum[:N]:
        print("%s: %d" % (word, num))
    print()
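Once vocab.txt has been dumped, it can be read back with the same load call used in Example #2, for instance:

# Reload the vocabulary dumped above (mirrors the tok.load call in Example #2).
tok2 = Tokenizer()
tok2.load(os.path.join(ds_root, 'nlvr2', "vocab.txt"))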
Example #8
        op_code = (w >> 12) & 0o77
        op_addr = w & 0xFFF
        if w & (1 << 18) != 0:  # address is extended
            op_addr |= 0o70000
    else:  # long address command
        op_code = ((w >> 15) & 0o37) + 48
        op_addr = w & 0o77777
    if op_indx == 0:
        return f"{op_names[op_code]} {op_addr:>05o}"
    else:
        return f"{op_names[op_code]} {op_addr:>05o},M{op_indx:o}"


if __name__ == '__main__':
    source = open(input_file).read()
    t = Tokenizer(source)

    PC = 0
    DP = 0
    irom = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 65536)
    dram = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 32768)

    fix_list = []  # tuples(pc, 'name', offset)
    names = {}
    bss_size = 0

    error_count = 0
    while not t.eof:
        if keyword := t.get('IDENT'):
            kwrd = keyword.val.lower()
            line = keyword.line
Example #9
import re
from collections import Counter
import just
import requests
from nearnlp.nearnlp import is_noun, is_verb, singularize
import functools
from tok import Tokenizer
from nltk.corpus import stopwords
from nostalgia.enrichers.google.custom_search import google_custom_search

ENGLISH_STOP = set(stopwords.words("english"))

t = Tokenizer(True)
t.drop("<b>", "remove html")
t.drop("<b/>", "remove html")

# can also use google

interesting_keys = set()
for prefix in ["og:", "twitter:", ""]:
    for key in [
        "title",
        "description",
        "name",
        "manufacturer_name",
        "category_name_singular",
        "long_description",
        "snippet",
    ]:
        interesting_keys.add(prefix + key)
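A hedged sketch of how interesting_keys might be applied to scraped page metadata; the metadata dict below is invented for illustration and is not part of the original script.

# Keep only the metadata fields whose keys are listed in interesting_keys.
metadata = {
    "og:title": "Blue running shoes",
    "og:image": "https://example.com/shoe.jpg",
    "description": "Lightweight trainer",
}
kept = {key: value for key, value in metadata.items() if key in interesting_keys}
print(kept)  # og:title and description survive; og:image is dropped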
Example #10
    def setUp(self):
        self.t = Tokenizer()
Example #11
    def process(self, path):
        tok = Tokenizer()
        tok.tokenize_html(path)
        self._counter.update(tok._counter)
        self.write_tokens(path, tok._counter.keys())