def main(train_dir, dev_dir, test_dir):
    """Collect vocabulary statistics from the three splits and persist them.

    All three locations are read into one shared counter; only the read of
    ``train_dir`` is passed ``update_kb=True``.  The counter's ``kb_words``
    are written to ``embedding/kb_words.json`` (each value converted to a
    list), and the four index mappings plus the maximum sentence / word
    lengths are saved to ``embedding/vocabs.json``.

    Args:
        train_dir: training data location, forwarded to ``read_file``.
        dev_dir: development data location, forwarded to ``read_file``.
        test_dir: test data location, forwarded to ``read_file``.
    """
    vocabs_path = "embedding/vocabs.json"
    kb_words_path = "embedding/kb_words.json"

    stats = Counter()
    read_file(train_dir, stats, update_kb=True)
    for split_dir in (dev_dir, test_dir):
        read_file(split_dir, stats)

    # Summary of what was collected.
    print(stats)
    print("longest sentence: %s" % str(stats.longest_sen))
    print("longest word: %s" % stats.longest_word())

    # Values are converted to lists before being handed to json_dump.
    serializable_kb = dict(
        (tag, list(words)) for tag, words in stats.kb_words.items()
    )
    json_dump(serializable_kb, kb_words_path)

    # Built in the original order (word, char, pos, ner); the helpers are
    # opaque here, so their call order is kept as-is.
    word2idx = construct_word_embeddings(stats.word_vocab)
    char2idx = construct_char_embeddings(stats.char_vocab)
    pos2idx = construct_pos_embeddings(stats.pos_tags)
    ner2idx = construct_ner_embeddings(stats.ner_tags)

    index_maps = make_dict(word2idx, char2idx, ner2idx, pos2idx)
    vocabs = ObjectDict(
        index_maps,
        max_sen_len=stats.max_sen_len,
        max_word_len=stats.max_word_len,
    )
    vocabs.save(vocabs_path)
def __init__(self):
    """Initialise every tracked statistic to its empty state.

    Length maxima start at 0, no longest sentence is recorded yet, the four
    vocabulary / tag collections start as empty sets, and ``kb_words`` starts
    as an empty ``ObjectDict``.
    """
    self.max_sen_len = self.max_word_len = 0
    self.longest_sen = None

    # Each collection gets its own fresh set instance.
    for attr_name in ("word_vocab", "char_vocab", "pos_tags", "ner_tags"):
        setattr(self, attr_name, set())

    self.kb_words = ObjectDict()
def __init__(self, words):
    """Wrap the raw items in ``Word`` objects and gather sentence statistics.

    Args:
        words: iterable of raw items; each one is passed to ``Word``.

    Attributes set:
        words: list of ``Word`` objects, in input order.
        max_word_len: largest ``len(word)`` in the sentence, or 0 when the
            sentence is empty.
        word_vocab: set of the words' ``normalized`` values.
        char_vocab: set union of every word's ``char_vocab``.
        pos_tags / ner_tags: sets of the words' ``pos`` / ``ner`` values.
        kb_words: ``ObjectDict`` mapping the last three characters of a
            non-'O' tag to the set of ``normalized`` forms of the
            immediately preceding word, recorded only when that preceding
            word is tagged 'O' and its ``should_ignore()`` is false.
    """
    self.words = [Word(w) for w in words]

    # max() raises ValueError on an empty sequence; an empty sentence is
    # given a maximum word length of 0 instead of crashing.
    word_lengths = [len(w) for w in self.words]
    self.max_word_len = max(word_lengths) if word_lengths else 0

    self.word_vocab = set(w.normalized for w in self.words)
    self.char_vocab = set(
        itertools.chain.from_iterable(w.char_vocab for w in self.words)
    )
    self.pos_tags = set(w.pos for w in self.words)
    self.ner_tags = set(w.ner for w in self.words)

    self.kb_words = ObjectDict()
    # Walk adjacent (previous, current) pairs.
    for pre_w, w in zip(self.words[:-1], self.words[1:]):
        if w.ner != 'O' and pre_w.ner == 'O' and not pre_w.should_ignore():
            ner = w.ner[-3:]
            if ner not in self.kb_words:
                self.kb_words[ner] = set()
            self.kb_words[ner].add(pre_w.normalized)
def testBuildChoices(self):
    # Exercises build_choices against a seeded fake game in three scenarios,
    # run in this order because they share one random.Random instance:
    #   1. shared pool, no imperial requests;
    #   2. shared pool, player 0 holding an 'imperial-choice:SpAya' item;
    #   3. per-player pools (num / akaris given as lists), same item.
    # (A comment rather than a docstring, so test-runner output that echoes
    # docstrings stays unchanged.)
    import logging
    from thb.common import build_choices
    import random
    from thb.characters.baseclasses import get_characters
    from thb import characters
    from game import autoenv

    # Signature under test:
    #   build_choices(g, items, candidates, players, num, akaris, shared)
    logger = logging.getLogger('test')

    def fake_player(uid):
        # Minimal player: an account id plus a 'reveal' hook that only logs.
        return ObjectDict({
            'account': ObjectDict({'userid': uid}),
            'reveal': lambda o, i=uid: logger.info('Reveal to %s: %s', i, o),
        })

    game_stub = ObjectDict({
        'players': BatchList([fake_player(uid) for uid in xrange(8)]),
        'random': random.Random(12341234),
        'SERVER_SIDE': True,
        'CLIENT_SIDE': False,
    })

    autoenv.Game.getgame = staticmethod(lambda: game_stub)

    chars = get_characters('common', '3v3')
    assert chars

    def check_player_coverage(choices, distinct_pools):
        # Every player has an entry, and the entries are backed by the
        # expected number of distinct pool objects.
        eq_(len(choices.items()), len(game_stub.players))
        eq_(len(set([id(pool) for pool in choices.values()])), distinct_pools)
        eq_(set(choices.keys()), set(game_stub.players))

    def check_imperial_sp_aya(choices, imperial):
        # The first imperial entry is player 0's SpAya and is offered to them.
        owner, picked = imperial[0]
        eq_(
            (owner, picked.char_cls),
            (game_stub.players[0], characters.sp_aya.SpAya),
        )
        assert picked in choices[owner]
        return owner

    # Scenario 1: shared pool, nobody holds an imperial item.
    choices, imperial = build_choices(
        game_stub, {}, chars, game_stub.players, 10, 3, True)
    check_player_coverage(choices, 1)
    eq_(imperial, [])

    # Scenario 2: shared pool, player 0 requests SpAya.
    choices, imperial = build_choices(
        game_stub, {0: ['imperial-choice:SpAya', 'foo']},
        chars, game_stub.players, 10, 3, True)
    check_player_coverage(choices, 1)
    owner = check_imperial_sp_aya(choices, imperial)
    eq_(sum([choice.akari for choice in choices[owner]]), 3)

    # Scenario 3: one pool per player, sized by per-player lists.
    choices, imperial = build_choices(
        game_stub, {0: ['imperial-choice:SpAya', 'foo']},
        chars, game_stub.players, [4] * 8, [1] * 8, False)
    check_player_coverage(choices, 8)
    eq_([len(pool) for pool in choices.values()], [4] * 8)
    eq_(
        [len([choice for choice in pool if choice.akari])
         for pool in choices.values()],
        [1] * 8,
    )
    check_imperial_sp_aya(choices, imperial)
def badge_metafunc(clsname, bases, _dict):
    """Record a class body in ``badges`` under the class name.

    NOTE(review): the ``(name, bases, namespace)`` signature looks like a
    metaclass-style callable -- confirm at the call sites.

    Args:
        clsname: key under which the parsed data is stored.
        bases: unused.
        _dict: mapping to register; its ``'__module__'`` entry is removed in
            place first (``KeyError`` if it is missing).

    Returns:
        None.
    """
    _dict.pop('__module__')
    badges[clsname] = ObjectDict.parse(_dict)
def tag_metafunc(clsname, bases, _dict):
    """Record a class body in ``tags`` under the class name.

    NOTE(review): the ``(name, bases, namespace)`` signature looks like a
    metaclass-style callable -- confirm at the call sites.

    Args:
        clsname: key under which the parsed data is stored.
        bases: unused.
        _dict: mapping to register; its ``'__module__'`` entry is removed in
            place first (``KeyError`` if it is missing).

    Returns:
        None.
    """
    _dict.pop('__module__')
    tags[clsname] = ObjectDict.parse(_dict)
def __init__(self, *a):
    """Create empty ``uniform`` and ``attrib`` containers.

    Any positional arguments are accepted and ignored.
    """
    # Imported at call time, as in the original, rather than at module load.
    from utils import ObjectDict

    self.uniform, self.attrib = ObjectDict(), ObjectDict()