def get_vocabulary(fobj):
    """Read text and return a dictionary that encodes the vocabulary."""
    # Plain word-level counting variant, kept commented out for reference:
    # vocab = Counter()
    # for i, line in enumerate(fobj):
    #     for word in line.split():
    #         vocab[word] += 1
    # return vocab
    vocab = Counter()
    for i, line in enumerate(fobj):
        line = tokenizer(line)
        vocab_line = build_vocab(line)
        for item in vocab_line:
            vocab[item[0]] += item[1]
        if (i % 100000) == 0:
            print i
    # with open(args.input + ".vocab.bpe", 'w') as outfile:
    #     outfile.write(json.dumps(vocab))
    return vocab
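# Illustrative usage sketch (not part of the original script). It assumes the
# surrounding module defines the tokenizer/build_vocab helpers, and the corpus
# path below is only an example:
# with open('../data/de/europarl-v7.de-sort.norm', 'r') as corpus:
#     vocab = get_vocabulary(corpus)
#     print len(vocab)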
    huffman_dict[key] = "<unk>"
common_vocab = jsondict['common']
for word in common_vocab.keys():
    huffman_dict[word] = word
print "Built dictionary"

# dictmap: original wmap id -> tokenized surface word of the test set.
dictmap = dict()
with open('../tutorial-ende-wmt15/data/wmap.test15.de', 'r') as infile:
    test_dict_file = infile.readlines()
test_dict = dict()
for i in test_dict_file:
    _tmp = i.replace('\n', '').split()
    name = tokenizer(_tmp[0])
    dictmap[_tmp[-1]] = ' '.join(name)
print "Built key to vocab"

# Read the 100-best hypotheses; fields are separated by ' ||| ' and the first
# field is the sentence index.
nbest = []
i_old = -1
with open('../tutorial-ende-wmt15/hiero/100best.txt') as nbestfile:
    for line in nbestfile:
        line = line.replace('\n', '')
        line = line.split(' ||| ')
        i = int(line[0])
        if i != i_old:
            nbest.append([])
            i_old = i
        tmp = [dictmap[w] for w in line[1].split()]
        if i < MAX_SIZE:
            common_vocab[item[0]] = i
            i += 1
        else:
            rare_vocab[item[0]] = i
    return common_vocab, rare_vocab


# Build the German training vocabulary, or load the cached copy if it exists.
if os.path.exists("../data/de/wmt15.vocab"):
    vocab = json.load(open("../data/de/wmt15.vocab", 'r'))
else:
    inputfiles = ['../data/de/commoncrawl.de-en.de-sort.norm',
                  '../data/de/news-commentary-v10.de-en.de-sort.norm',
                  '../data/de/europarl-v7.de-sort.norm']
    vocab = collections.Counter()
    for filename in inputfiles:
        with open(filename, 'r') as infile:
            for i, line in enumerate(infile):
                line = tokenizer(line)
                vocab_line = build_vocab(line)
                for item in vocab_line:
                    vocab[item[0]] += item[1]
                if (i % 100000) == 0:
                    print i
    with open("../data/de/wmt15.vocab", 'w') as outfile:
        outfile.write(json.dumps(vocab))

# Sort by descending frequency (ties broken alphabetically), split into common
# and rare parts, and write both maps to dict.json.
vocab = sorted(vocab.items(), key=lambda x: (-x[1], x[0]))
print("Total vocabulary size: " + str(len(vocab)))
common_vocab, rare_vocab = find_rare(vocab)
with open('dict.json', 'w') as hman:
    hman.write(json.dumps({"common": common_vocab, "rare": rare_vocab}))
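# Illustrative reload sketch (an assumption, not code from the original file):
# dict.json written above can be read back into the jsondict used by the
# Huffman/encoding steps, with both maps stored as plain word -> id JSON objects:
# jsondict = json.load(open('dict.json', 'r'))
# common_vocab, rare_vocab = jsondict['common'], jsondict['rare']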
print "Built Huffman dictionary" ''' Load and build dictionary ''' tmp_set = set(huffman_dict.keys()) tmp_set2 = set(huffword_2_id.keys()) dictmap = dict() oid_2_huffw = dict() oid_2_huffid = dict() with open('../tutorial-ende-wmt15/data/wmap.test15.de','r') as infile: # with open('../data/news14/wmap.testf.de','r') as infile: for i in infile: _tmp = i.replace('\n','').split() name = tokenizer(_tmp[0]) tmp = [[i] if i in tmp_set else ['<unk>'] for i in name] tmp = [j for l in tmp for j in l] # oid_2_huffw[_tmp[-1]] = tmp oid_2_huffid[_tmp[-1]] = [huffword_2_id[i] if i in tmp_set2 else huffword_2_id['<unk>'] for i in tmp] de_map = open('huff_de.mapper_norm','w') de_map.write(json.dumps(oid_2_huffid)) print "Built key to id" # oid_2_huffid = json.load(open("huff_de.mapper_norm", 'r')) # rev_dict = dict((value, key) for (key, value) in huffword_2_id.iteritems()) # for j in range(1,2738):
    rules.append(line.split())

dictmap = dict()
oid_2_huffw = dict()
oid_2_encodeid = dict()
unks = set()
tmp_set = set(encoded_dict.keys())
tmp_set2 = set(encodeword_2_id.keys())
# Collect test-set words (with the </w> end-of-word marker) that are missing
# from encoded_dict.
with open('../tutorial-ende-wmt15/data/wmap.test15.de', 'r') as infile:
# with open('../data/news14/wmap.testf.de', 'r') as infile:
    for line in infile:
        _tmp = line.replace('\n', '').split()
        name = tokenizer(_tmp[0])
        name = [j + "</w>" for j in name]
        empty = [j for j in name if j not in tmp_set]
        if empty != []:
            unks = unks.union(empty)
print "Found unknown words"

# Add the unknown words to the encoded dictionary using the merge rules.
d = addwords(rules, unks, encodeword_2_id)
encoded_dict.update(d)
print "Added unknown words"

unks = set()
tmp_set = set(encoded_dict.keys())
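# Assumed wmap line format for the files parsed above (inferred from the indexing,
# not checked against the tutorial data): one entry per line with the surface word
# first and the numeric id last, e.g.
#   haus 1234
# so _tmp[0] is the word and _tmp[-1] is its original id.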