def __init__(self, ds_name='nlvr2', split='train', task='ispeaker'):
    """Load one split of a dataset plus its vocabulary and feature statistics.

    Args:
        ds_name: dataset directory name under DATA_ROOT (e.g. 'nlvr2').
        split: split file to load ('train', 'valid', ...); reads
            DATA_ROOT/<ds_name>/<split>.json.
        task: NOTE(review) -- accepted but never stored or read here;
            presumably consumed by a subclass or caller, confirm.
    """
    self.ds_name = ds_name
    self.split = split
    # Use a context manager so the JSON file handle is closed promptly
    # (the original passed an open() result straight to json.load and
    # leaked the handle).
    with open(os.path.join(DATA_ROOT, self.ds_name, self.split + ".json")) as f:
        self.data = json.load(f)
    self.tok = Tokenizer()
    self.tok.load(os.path.join(DATA_ROOT, self.ds_name, "vocab.txt"))
    # Precomputed feature normalization statistics for this dataset.
    self.feat_mean = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_mean.npy'))
    self.feat_std = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_std.npy'))
# NOTE(review): this chunk is truncated -- it opens mid-list ("DATASETS = ["
# or similar, plus any earlier entries, lies outside this view), so the code
# below is left byte-identical rather than reconstructed.
# What the visible script does: for each dataset, load the train/valid JSON
# splits, flatten all sentences, build and dump a Tokenizer vocabulary
# (min_occur=3), then print the 50 most frequent words.
# NOTE(review): the open() handles passed to json.load are never closed
# (use `with open(...)`), and `sum(map(..., dataset), [])` flattens lists
# quadratically -- itertools.chain.from_iterable would be linear. Left as-is
# because the chunk cannot be safely rewritten while its head is missing.
'nlvr2', 'spotdiff', 'adobe', ] ds_root = "../dataset/" for ds_name in DATASETS: print("Processing dataset %s" % ds_name) dataset = [] for split_name in ['train', 'valid']: dataset.extend( json.load( open(os.path.join(ds_root, ds_name, split_name + ".json")))) print("Finish Loading split %s" % split_name) print("Number of data is %d." % len(dataset)) sents = sum(map(lambda x: x["sents"], dataset), []) print("Number of sents is %d." % len(sents)) tok = Tokenizer() tok.build_vocab(sents, min_occur=3) tok.dump(os.path.join(ds_root, ds_name, "vocab.txt")) wordXnum = list(tok.occur.items()) wordXnum = sorted(wordXnum, key=lambda x: x[1], reverse=True) N = 50 print("Top %d Words:" % N) for word, num in wordXnum[:N]: print("%s: %d" % (word, num)) print()
# NOTE(review): fragment cut at BOTH edges. It begins mid-function -- the
# enclosing `def` for this instruction-formatting code (which decodes a word
# `w` into op_code/op_addr, handles the extended-address bit 18 and the
# long-address form, and formats via `op_names` and an `op_indx` register
# suffix) is outside this chunk. It also ends mid-`while` loop of the
# `__main__` assembler driver (`line = keyword.line` is not the loop's end).
# Because both boundaries are missing, the code is left byte-identical;
# restructuring it would require guessing the absent scaffolding.
op_code = (w >> 12) & 0o77 op_addr = w & 0xFFF if w & (1 << 18) != 0: # address is extended op_addr |= 0o70000 else: # long address command op_code = ((w >> 15) & 0o37) + 48 op_addr = w & 0o77777 if op_indx == 0: return f"{op_names[op_code]} {op_addr:>05o}" else: return f"{op_names[op_code]} {op_addr:>05o},M{op_indx:o}" if __name__ == '__main__': source = open(input_file).read() t = Tokenizer(source) PC = 0 DP = 0 irom = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 65536) dram = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 32768) fix_list = [] # tuples(pc, 'name', offset) names = {} bss_size = 0 error_count = 0 while not t.eof: if keyword := t.get('IDENT'): kwrd = keyword.val.lower() line = keyword.line
import re
import functools
from collections import Counter

import just
import requests
from nearnlp.nearnlp import is_noun, is_verb, singularize
from nltk.corpus import stopwords
from tok import Tokenizer
from nostalgia.enrichers.google.custom_search import google_custom_search

# English stopword list used to filter candidate terms.
ENGLISH_STOP = set(stopwords.words("english"))

# Tokenizer with HTML bold tags stripped before tokenization.
t = Tokenizer(True)
t.drop("<b>", "remove html")
t.drop("<b/>", "remove html")

# can also use qoogle

# Metadata keys worth extracting, combined with the common OpenGraph /
# Twitter-card prefixes (and bare, unprefixed variants).
_PREFIXES = ("og:", "twitter:", "")
_BASE_KEYS = (
    "title",
    "description",
    "name",
    "manufacturer_name",
    "category_name_singular",
    "long_description",
    "snippet",
)
interesting_keys = {prefix + key for prefix in _PREFIXES for key in _BASE_KEYS}
def setUp(self):
    """Create a fresh Tokenizer before each test method runs."""
    self.t = Tokenizer()