Example #1
from collections import Counter

def get_vocabulary(fobj):
    """Read text from an open file object and return a Counter that
    maps each vocabulary item to its frequency.

    Relies on the module-level helpers `tokenizer` (splits a line into
    tokens) and `build_vocab` (returns (token, count) pairs for a line).
    """
    vocab = Counter()
    for i, line in enumerate(fobj):
        line = tokenizer(line)
        vocab_line = build_vocab(line)
        for item in vocab_line:
            vocab[item[0]] += item[1]
        if (i % 100000) == 0:
            print(i)  # progress indicator for large corpora
    # Optionally cache the vocabulary to disk:
    # with open(args.input + ".vocab.bpe", 'w') as outfile:
    #     outfile.write(json.dumps(vocab))

    return vocab
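
A minimal sketch of how get_vocabulary could be driven. The tokenizer and build_vocab stand-ins below are assumptions for illustration (the real helpers are defined elsewhere in the project), and 'corpus.txt' is a placeholder path:

from collections import Counter

def tokenizer(line):               # hypothetical stand-in: whitespace tokenizer
    return line.strip().split()

def build_vocab(tokens):           # hypothetical stand-in: per-line (token, count) pairs
    return Counter(tokens).items()

with open('corpus.txt', 'r') as f:
    vocab = get_vocabulary(f)
print(vocab.most_common(5))        # five most frequent tokens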
Example #2
    huffman_dict[key] = "<unk>"   # rare words collapse to <unk> (loop head truncated in this excerpt)

common_vocab = jsondict['common']

# Common words map to themselves.
for word in common_vocab.keys():
    huffman_dict[word] = word

print("Built dictionary")

# Map each word-map ID to its tokenized surface form.
dictmap = dict()
with open('../tutorial-ende-wmt15/data/wmap.test15.de', 'r') as infile:
    for line in infile:
        _tmp = line.replace('\n', '').split()
        name = tokenizer(_tmp[0])
        dictmap[_tmp[-1]] = ' '.join(name)

print("Built key to vocab")

# Group n-best hypotheses by sentence ID; fields are separated by ' ||| '.
nbest = []
i_old = -1
with open('../tutorial-ende-wmt15/hiero/100best.txt') as nbestfile:
    for line in nbestfile:
        line = line.replace('\n', '')
        line = line.split(' ||| ')
        i = int(line[0])
        if i != i_old:
            nbest.append([])
            i_old = i
        tmp = [dictmap[w] for w in line[1].split()]
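
Each line of the 100-best file follows the Moses-style n-best format, `sentence_id ||| hypothesis ||| ...`. A small sketch of the parse on a made-up line (the fields after the hypothesis are assumptions):

line = '3 ||| 41 7 962 5 ||| lm=-12.3 tm=-4.1 ||| -16.4'   # invented example line
fields = line.split(' ||| ')
sent_id = int(fields[0])       # 3
hyp_ids = fields[1].split()    # ['41', '7', '962', '5']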
Example #3
# Requires: collections, json, os, plus the tokenizer/build_vocab helpers from
# the earlier examples. This fragment is the tail of find_rare, which splits a
# frequency-sorted vocabulary into common and rare words.
        if i < MAX_SIZE:
            common_vocab[item[0]] = i   # the MAX_SIZE most frequent words get unique IDs
            i += 1
        else:
            rare_vocab[item[0]] = i     # every rare word shares the ID MAX_SIZE
    return common_vocab, rare_vocab

if os.path.exists("../data/de/wmt15.vocab"):
    # Reuse the cached vocabulary from an earlier run.
    with open("../data/de/wmt15.vocab", 'r') as infile:
        vocab = json.load(infile)
else:
    inputfiles = ['../data/de/commoncrawl.de-en.de-sort.norm',
                  '../data/de/news-commentary-v10.de-en.de-sort.norm',
                  '../data/de/europarl-v7.de-sort.norm']
    vocab = collections.Counter()
    for filename in inputfiles:
        with open(filename, 'r') as infile:
            for i, line in enumerate(infile):
                line = tokenizer(line)
                vocab_line = build_vocab(line)
                for item in vocab_line:
                    vocab[item[0]] += item[1]
                if (i % 100000) == 0:
                    print(i)  # progress indicator for large corpora
    # Cache the counts so later runs can skip the corpus pass.
    with open("../data/de/wmt15.vocab", 'w') as outfile:
        outfile.write(json.dumps(vocab))

# Sort by descending frequency, then alphabetically to break ties.
vocab = sorted(vocab.items(), key=lambda x: (-x[1], x[0]))
print("Total vocabulary size: " + str(len(vocab)))
common_vocab, rare_vocab = find_rare(vocab)

with open('dict.json', 'w') as hman:
    hman.write(json.dumps({"common": common_vocab, 'rare': rare_vocab}))
print("Built Huffman dictionary")
'''
Load and build dictionary
'''
tmp_set = set(huffman_dict.keys())
tmp_set2 = set(huffword_2_id.keys())

dictmap = dict()
oid_2_huffw = dict()
oid_2_huffid = dict()

# Map each word-map ID to the Huffman IDs of its tokenized surface form,
# substituting <unk> for out-of-vocabulary tokens.
with open('../tutorial-ende-wmt15/data/wmap.test15.de', 'r') as infile:
# with open('../data/news14/wmap.testf.de','r') as infile:
    for line in infile:
        _tmp = line.replace('\n', '').split()
        name = tokenizer(_tmp[0])
        tmp = [[w] if w in tmp_set else ['<unk>'] for w in name]
        tmp = [j for l in tmp for j in l]
        # oid_2_huffw[_tmp[-1]] = tmp
        oid_2_huffid[_tmp[-1]] = [huffword_2_id[w] if w in tmp_set2
                                  else huffword_2_id['<unk>'] for w in tmp]

with open('huff_de.mapper_norm', 'w') as de_map:
    de_map.write(json.dumps(oid_2_huffid))

print("Built key to id")

# oid_2_huffid = json.load(open("huff_de.mapper_norm", 'r'))

# rev_dict = dict((value, key) for (key, value) in huffword_2_id.items())

# `rules` is populated by a loading loop truncated in this excerpt:
# for j in range(1,2738):
#     rules.append(line.split())

dictmap = dict()
oid_2_huffw = dict()
oid_2_encodeid = dict()

unks = set()

tmp_set = set(encoded_dict.keys())
tmp_set2 = set(encodeword_2_id.keys())

# Collect every test-set token, with the "</w>" end-of-word marker
# appended, that is missing from the encoded dictionary.
with open('../tutorial-ende-wmt15/data/wmap.test15.de', 'r') as infile:
    # with open('../data/news14/wmap.testf.de','r') as infile:
    for line in infile:
        _tmp = line.replace('\n', '').split()
        name = tokenizer(_tmp[0])
        name = [j + "</w>" for j in name]
        missing = [j for j in name if j not in tmp_set]
        if missing:
            unks.update(missing)

print("Found unknown words")

# Extend the encoded dictionary with segmentations for the unknown words.
d = addwords(rules, unks, encodeword_2_id)
encoded_dict.update(d)

print("Added unknown words")

unks = set()

tmp_set = set(encoded_dict.keys())