/
glove.py
64 lines (59 loc) · 1.53 KB
/
glove.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 21:21:46 2018
Embedding function
@author: ron
"""
import csv

import numpy as np
import pandas as pd
# Load the embedding table: a space-separated text file whose first column is
# the token (used as the DataFrame index) and whose remaining columns are the
# vector components. QUOTE_NONE keeps the parser from treating quote
# characters that appear as tokens as field delimiters.
path = "/Users/ron/Desktop/samsung/glove.840B.300d.txt"
words = pd.read_table(path, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

# Training data: one utterance per line, tab-separated as
#   <space-separated words> \t <space-separated tags> [\t <integer start flag>]
dataFile = "atis-2.train.w-intent.iob"

utterances = list()  # per utterance: list of embedding rows (one per kept word)
tags = list()        # per utterance: list of tag ids aligned with `utterances`
starts = list()      # per flagged line: the boolean start flag
startid = list()     # per flagged line: index of the latest line flagged as a start

# reserving index 0 for padding
# reserving index 1 for unknown word and tokens
word_vocab_index = 2
tag_vocab_index = 2
word2id = {'<pad>': 0, '<unk>': 1}
tag2id = {'<pad>': 0, '<unk>': 1}
id2word = ['<pad>', '<unk>']
id2tag = ['<pad>', '<unk>']

utt_count = 0
temp_startid = 0

# `with` guarantees the data file is closed even if parsing raises.
with open(dataFile, 'r') as data_handle:
    for line in data_handle:
        # A blank (e.g. trailing) line has no tag field; skip it instead of
        # failing with IndexError on d[1].
        if not line.strip():
            continue

        d = line.split('\t')
        utt = d[0].strip()
        t = d[1].strip()

        if len(d) > 2:
            # Builtin bool: the `np.bool` alias was removed from NumPy.
            start = bool(int(d[2].strip()))
            starts.append(start)
            if start:
                temp_startid = utt_count
            startid.append(temp_startid)

        temp_utt = list()
        temp_tags = list()
        mywords = utt.split()
        mytags = t.split()
        if len(mywords) != len(mytags):
            # Report misaligned lines; only the overlapping prefix is used below.
            print (mywords)
            print (mytags)

        # now add the words and tags to word and tag dictionaries
        # also save the word and tag sequence in training data sets
        # zip() stops at the shorter sequence, so a length mismatch cannot
        # index past the end of either list.
        for word, tag in zip(mywords, mytags):
            try:
                ebw = words.loc[word]
            except KeyError:
                # Word has no embedding: drop both the word and its tag so the
                # two sequences stay aligned. The tag is not registered either.
                continue
            if tag not in tag2id:
                tag2id[tag] = tag_vocab_index
                id2tag.append(tag)
                tag_vocab_index += 1
            temp_utt.append(ebw)
            temp_tags.append(tag2id[tag])

        utterances.append(temp_utt)
        tags.append(temp_tags)
        # Advance the utterance counter so `startid` records real positions;
        # without this every recorded start index would be 0.
        utt_count += 1