def process(self, ind, path):
    tok = Tokenizer()
    tok.tokenize_html(path)
    self._titles.append(tok.title or u'Untitled')
    for token, freq in tok._counter.items():
        # Normalize by unique tokens.
        self.globalht[token].append([ind, freq / len(tok._counter)])
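# The append above only works for previously unseen tokens if globalht maps
# missing keys to empty lists. A minimal sketch of the assumed setup (the
# Indexer class name and attribute initialization are assumptions, not taken
# from the snippet):
from collections import defaultdict

class Indexer:
    def __init__(self):
        # token -> [[doc_index, normalized_freq], ...]; missing keys start as [].
        self.globalht = defaultdict(list)
        self._titles = []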
def __init__(self, ds_name='nlvr2', split='train', task='ispeaker'):
    self.ds_name = ds_name
    self.split = split
    self.data = json.load(
        open(os.path.join(DATA_ROOT, self.ds_name, self.split + ".json"))
    )
    self.tok = Tokenizer()
    self.tok.load(os.path.join(DATA_ROOT, self.ds_name, "vocab.txt"))
    self.feat_mean = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_mean.npy'))
    self.feat_std = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_std.npy'))
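# The feature statistics loaded above suggest per-feature standardization of
# the image features. A hypothetical helper sketching that usage (the method
# name, argument, and the whitening itself are assumptions; the original
# snippet does not show how feat_mean/feat_std are consumed):
def _normalize_feat(self, feat):
    # Standardize features with the dataset-level mean/std loaded in __init__.
    return (feat - self.feat_mean) / self.feat_std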
def top(self, query, num):
    tok = Tokenizer()
    tok.tokenize(query)
    acc = Counter()
    for token in tok._counter:
        dict_entry = self.dict_entry(token)
        print(token, dict_entry)
        if not dict_entry:
            continue
        post_entries = self.post_entries(dict_entry[1], dict_entry[2])
        # Adjust weights so that we don't open files multiple times in
        # cases of repeated tokens (e.g. "dog cat dog cat dog").
        for docid in post_entries:
            post_entries[docid] *= tok._counter[token]
        acc.update(post_entries)
    return acc.most_common(num), len(acc)
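# A hedged usage sketch for the ranking method above, assuming it lives on an
# index/searcher object (the Index name and constructor are assumptions):
index = Index()
hits, n_matched = index.top("dog cat dog", num=10)
for docid, score in hits:
    print(docid, score)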
class TestTokenizerGenAlphaDigit(unittest.TestCase):
    # class Tokenizer, method tokenize_gen_alpha_digit

    def setUp(self):
        self.t = Tokenizer()

    def test_empty_string(self):
        s = list(self.t.tokenize_gen_alpha_digit(''))
        self.assertEqual(len(s), 0)
        self.assertEqual(s, [])

    def test_no_spaces(self):
        s = list(self.t.tokenize_gen_alpha_digit('мамамылараму'))
        self.assertEqual(s, [Token('мамамылараму', 'alpha', 0, 12)])

    def test_first_alpha(self):
        s = list(self.t.tokenize_gen_alpha_digit('мама мыла'))
        self.assertEqual(s[0], Token("мама", "alpha", 0, 4))
        self.assertEqual(len(s), 4)
class TestTokenizerGen(unittest.TestCase):

    def setUp(self):
        self.t = Tokenizer()

    def test_last_nonalpha(self):
        s = list(self.t.tokenize_gen('мамамылараму2'))
        self.assertEqual(len(s), 12)
        self.assertEqual(s[11], Token('мамамылараму', 'alpha', 0, 13))

    def test_first_alpha(self):
        s = list(self.t.tokenize_gen('я иду в кино'))
        self.assertEqual(s[0], Token("я", "alpha", 0, 1))
        self.assertEqual(len(s), 7)

    def test_empty_string(self):
        s = list(self.t.tokenize_gen(''))
        self.assertEqual(len(s), 0)
        self.assertEqual(s, [])

    def test_no_spaces(self):
        s = list(self.t.tokenize_gen('яидувкиноcinema'))
        self.assertEqual(s, [Token('яидувкиноcinema', 'alpha', 0, 15)])
        self.assertEqual(len(s), 1)

    def test_digital_string(self):
        s = list(self.t.tokenize_gen('012345'))
        self.assertEqual(len(s), 1)
        self.assertEqual(s, [Token('012345', 'digit', 0, 6)])

    def test_first_nonalpha(self):
        s = list(self.t.tokenize_gen('!!!!я иду в кино cinema'))
        self.assertEqual(len(s), 10)
        self.assertEqual(s[0], Token('!!!!', 'punct', 0, 4))

    def test_middle_nonalpha(self):
        s = list(self.t.tokenize_gen('я иду в кино00000 111 00000cinema'))
        self.assertEqual(len(s), 13)
        self.assertEqual(s[7], Token('00000', 'digit', 12, 17))
        self.assertEqual(s[9], Token('111', 'digit', 18, 21))
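# Both test classes above compare generator output against Token(text, kind,
# start, end) values. A minimal sketch of a compatible Token type, assuming
# the real definition is an equivalent four-field value class (the field
# names are assumptions; the tests only rely on positional equality):
from collections import namedtuple

Token = namedtuple('Token', ['text', 'kind', 'start', 'end'])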
import sys

from tok import Tokenizer

tok = Tokenizer()
for review in sys.stdin:
    print(' '.join(tok.tokenize(review)))
import json
import os

DATASETS = [
    'nlvr2',
    'spotdiff',
    'adobe',
]
ds_root = "../dataset/"

for ds_name in DATASETS:
    print("Processing dataset %s" % ds_name)
    dataset = []
    for split_name in ['train', 'valid']:
        dataset.extend(
            json.load(
                open(os.path.join(ds_root, ds_name, split_name + ".json"))))
        print("Finish Loading split %s" % split_name)
    print("Number of data is %d." % len(dataset))
    sents = sum(map(lambda x: x["sents"], dataset), [])
    print("Number of sents is %d." % len(sents))
    tok = Tokenizer()
    tok.build_vocab(sents, min_occur=3)
    tok.dump(os.path.join(ds_root, ds_name, "vocab.txt"))
    wordXnum = list(tok.occur.items())
    wordXnum = sorted(wordXnum, key=lambda x: x[1], reverse=True)
    N = 50
    print("Top %d Words:" % N)
    for word, num in wordXnum[:N]:
        print("%s: %d" % (word, num))
    print()
        op_code = (w >> 12) & 0o77
        op_addr = w & 0xFFF
        if w & (1 << 18) != 0:
            # address is extended
            op_addr |= 0o70000
    else:
        # long address command
        op_code = ((w >> 15) & 0o37) + 48
        op_addr = w & 0o77777
    if op_indx == 0:
        return f"{op_names[op_code]} {op_addr:>05o}"
    else:
        return f"{op_names[op_code]} {op_addr:>05o},M{op_indx:o}"


if __name__ == '__main__':
    source = open(input_file).read()
    t = Tokenizer(source)
    PC = 0
    DP = 0
    irom = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 65536)
    dram = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 32768)
    fix_list = []  # tuples (pc, 'name', offset)
    names = {}
    bss_size = 0
    error_count = 0

    while not t.eof:
        if keyword := t.get('IDENT'):
            kwrd = keyword.val.lower()
            line = keyword.line
import re
from collections import Counter

import just
import requests
from nearnlp.nearnlp import is_noun, is_verb, singularize
import functools
from tok import Tokenizer
from nltk.corpus import stopwords

from nostalgia.enrichers.google.custom_search import google_custom_search

ENGLISH_STOP = set(stopwords.words("english"))

t = Tokenizer(True)
t.drop("<b>", "remove html")
t.drop("<b/>", "remove html")

# can also use qoogle
interesting_keys = set()
for prefix in ["og:", "twitter:", ""]:
    for key in [
        "title",
        "description",
        "name",
        "manufacturer_name",
        "category_name_singular",
        "long_description",
        "snippet",
    ]:
        interesting_keys.add(prefix + key)
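# The nested loops above can be expressed as a single set comprehension; a
# behavior-equivalent sketch (PREFIXES and KEYS are illustrative names only):
PREFIXES = ["og:", "twitter:", ""]
KEYS = [
    "title", "description", "name", "manufacturer_name",
    "category_name_singular", "long_description", "snippet",
]
interesting_keys = {prefix + key for prefix in PREFIXES for key in KEYS}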
def setUp(self):
    self.t = Tokenizer()
def process(self, path):
    tok = Tokenizer()
    tok.tokenize_html(path)
    self._counter.update(tok._counter)
    self.write_tokens(path, tok._counter.keys())