Code example #1
File: data.py  Project: pcuenca/VisualRelationships
    def __init__(self, ds_name='nlvr2', split='train', task='ispeaker'):
        self.ds_name = ds_name
        self.split = split
        # Annotations for this split, e.g. <DATA_ROOT>/nlvr2/train.json
        self.data = json.load(
            open(os.path.join(DATA_ROOT, self.ds_name, self.split + ".json"))
        )

        # Tokenizer with the vocabulary pre-built for this dataset
        self.tok = Tokenizer()
        self.tok.load(os.path.join(DATA_ROOT, self.ds_name, "vocab.txt"))

        # Per-dataset feature statistics (mean / std)
        self.feat_mean = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_mean.npy'))
        self.feat_std = np.load(os.path.join(DATA_ROOT, self.ds_name, 'feat_std.npy'))
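
A likely use of the feat_mean / feat_std arrays loaded above is standard feature normalization. A minimal sketch, assuming a hypothetical norm_feat method on the same class (not part of the original file):

    def norm_feat(self, feat):
        # Standardize a feature vector with the statistics loaded in __init__.
        # (Hypothetical method; the real project may apply these differently.)
        return (feat - self.feat_mean) / self.feat_std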
Code example #2
DATASETS = [
    'nlvr2',
    'spotdiff',
    'adobe',
]

ds_root = "../dataset/"
for ds_name in DATASETS:
    print("Processing dataset %s" % ds_name)

    dataset = []
    for split_name in ['train', 'valid']:
        dataset.extend(
            json.load(
                open(os.path.join(ds_root, ds_name, split_name + ".json"))))
        print("Finish Loading split %s" % split_name)
    print("Number of data is %d." % len(dataset))
    # Flatten each example's list of sentences into one flat list
    sents = sum(map(lambda x: x["sents"], dataset), [])
    print("Number of sents is %d." % len(sents))

    tok = Tokenizer()
    tok.build_vocab(sents, min_occur=3)
    tok.dump(os.path.join(ds_root, ds_name, "vocab.txt"))

    wordXnum = list(tok.occur.items())
    wordXnum = sorted(wordXnum, key=lambda x: x[1], reverse=True)
    N = 50
    print("Top %d Words:" % N)
    for word, num in wordXnum[:N]:
        print("%s: %d" % (word, num))
    print()
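
The Tokenizer in these two snippets is project-specific; the excerpts only reveal that it exposes build_vocab(sents, min_occur), dump(path), load(path), and an occur frequency table. A minimal sketch of what such an interface could look like, offered purely as an assumption since the real implementation is not shown:

from collections import Counter

class Tokenizer:
    def __init__(self):
        self.occur = Counter()  # word -> raw frequency across all sentences
        self.vocab = []

    def build_vocab(self, sents, min_occur=3):
        # Count whitespace-split words and keep those seen often enough.
        for sent in sents:
            self.occur.update(sent.lower().split())
        self.vocab = [w for w, n in self.occur.items() if n >= min_occur]

    def dump(self, path):
        # One vocabulary entry per line.
        with open(path, 'w') as f:
            f.write('\n'.join(self.vocab))

    def load(self, path):
        # Inverse of dump().
        with open(path) as f:
            self.vocab = f.read().split('\n')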
Code example #3
File: asm.py  Project: x86128/pymesm
        op_code = (w >> 12) & 0o77
        op_addr = w & 0xFFF
        if w & (1 << 18) != 0:  # address is extended
            op_addr |= 0o70000
    else:  # long address command
        op_code = ((w >> 15) & 0o37) + 48
        op_addr = w & 0o77777
    if op_indx == 0:
        return f"{op_names[op_code]} {op_addr:>05o}"
    else:
        return f"{op_names[op_code]} {op_addr:>05o},M{op_indx:o}"


if __name__ == '__main__':
    source = open(input_file).read()
    t = Tokenizer(source)

    PC = 0
    DP = 0
    irom = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 65536)  # instruction memory, 64-bit words
    dram = array.array('Q', [0xFFFFFFFFFFFFFFFF] * 32768)  # data memory, 64-bit words

    fix_list = []  # tuples(pc, 'name', offset)
    names = {}
    bss_size = 0

    error_count = 0
    while not t.eof:
        if keyword := t.get('IDENT'):
            kwrd = keyword.val.lower()
            line = keyword.line
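
Note: the assignment expression keyword := t.get('IDENT') (the walrus operator) requires Python 3.8 or newer.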
Code example #4
import re
from collections import Counter
import just
import requests
from nearnlp.nearnlp import is_noun, is_verb, singularize
import functools
from tok import Tokenizer
from nltk.corpus import stopwords
from nostalgia.enrichers.google.custom_search import google_custom_search

ENGLISH_STOP = set(stopwords.words("english"))

t = Tokenizer(True)
t.drop("<b>", "remove html")
t.drop("<b/>", "remove html")

# can also use google

interesting_keys = set()
for prefix in ["og:", "twitter:", ""]:
    for key in [
        "title",
        "description",
        "name",
        "manufacturer_name",
        "category_name_singular",
        "long_description",
        "snippet",
    ]:
        interesting_keys.add(prefix + key)
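
The excerpt ends right after interesting_keys is built. A plausible continuation, sketched purely as an assumption (the helper name and the meta dict are hypothetical), is to filter a scraped page's metadata down to those keys:

def extract_interesting(meta):
    # Keep only metadata fields whose name is in interesting_keys,
    # e.g. "og:title", "twitter:description", "title".
    return {k: v for k, v in meta.items() if k in interesting_keys}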
Code example #5
File: test_tok.py  Project: anastasiaberyoza/python3
    def setUp(self):
        self.t = Tokenizer()
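
Only the setUp is shown. A test method following it would exercise the freshly constructed Tokenizer; the sketch below is an assumption, since this project's Tokenizer API is not part of the excerpt:

    def test_tokenize_returns_tokens(self):
        # Hypothetical: assumes the Tokenizer exposes a tokenize() method.
        self.assertEqual(self.t.tokenize("a b"), ["a", "b"])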