-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_conceptnet.py
77 lines (66 loc) · 2.56 KB
/
fetch_conceptnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import requests
from snli import SNLI
from wordembedding import WordEmbedding
from utils import printerr
import cPickle
def query_related_terms(word, use_related=False):
    """Collect English terms that ConceptNet associates with ``word``.

    Fetches every edge touching ``/c/en/<word>`` from the ConceptNet web API
    and gathers the ``label`` of each start/end node that carries a ``term``
    entry and whose ``language`` is ``'en'``.  When ``use_related`` is true,
    the ``/related`` endpoint is queried as well and the last path segment of
    each related node's ``@id`` (underscores replaced by spaces) is added.

    Args:
        word: Term to look up; it is appended verbatim to the request URL.
        use_related: Also include the results of the ``/related`` endpoint.

    Returns:
        A set of term strings that never contains ``word`` itself.

    Raises:
        KeyError: If a response lacks the expected ``'edges'`` /
            ``'related'`` entries, or an edge lacks ``'start'`` / ``'end'``.
        Errors raised by ``requests.get`` or ``.json()`` propagate unchanged.
    """
    ret = set()

    query_prefix = 'http://api.conceptnet.io/query?node=/c/en/'
    obj = requests.get(query_prefix + word).json()
    for rel in obj['edges']:
        # Both endpoints of an edge are treated the same way.
        for side in ('end', 'start'):
            node = rel[side]
            if 'term' in node and node.get('language') == 'en':
                ret.add(node['label'])

    if use_related:
        relation_prefix = 'http://api.conceptnet.io/related/c/en/'
        relation_postfix = '?filter=/c/en'
        obj = requests.get(relation_prefix + word + relation_postfix).json()
        for rel in obj['related']:
            # '@id' is a path; keep its final segment as a plain-text term.
            ret.add(rel['@id'].replace('_', ' ').split('/')[-1])

    # The queried word is not its own related term.
    ret.discard(word)
    return ret
def tree2set(t):
    """Return the set of ConceptNet query strings found in tree ``t``.

    Args:
        t: Object providing ``postorder_traverse(callback)``; every node
            handed to the callback must provide ``get_str4conceptnet()``.

    Returns:
        A fresh set holding ``get_str4conceptnet()`` of every node that
        ``t.postorder_traverse`` visits.
    """
    collected = set()

    def collect(node):
        # Mutating the enclosing set needs no global/nonlocal declaration.
        collected.add(node.get_str4conceptnet())

    t.postorder_traverse(collect)
    return collected
word_embedding = WordEmbedding('./data/wordembedding')
snli = SNLI('./data/')
printerr("Before trim word embedding, " + str(word_embedding.embeddings.size(0)) + " words")
word_embedding.trim_by_counts(snli.word_counts)
printerr("After trim word embedding, " + str(word_embedding.embeddings.size(0)) + " words")
word_embedding.extend_by_counts(snli.train_word_counts)
printerr("After adding training words, " + str(word_embedding.embeddings.size(0)) + " words")
phrases = set()
print 'Gathering phrases in train data...'
for data in snli.train:
phrases = phrases | tree2set(data['p_tree'])
phrases = phrases | tree2set(data['h_tree'])
print 'done'
printerr('Gathering phrases in dev data...')
for data in snli.dev:
phrases = phrases | tree2set(data['p_tree'])
phrases = phrases | tree2set(data['h_tree'])
print 'done'
print 'Gathering phrases in test data...'
for data in snli.test:
phrases = phrases | tree2set(data['p_tree'])
phrases = phrases | tree2set(data['h_tree'])
print 'done'
print 'total num of phrases:', len(phrases)
related_terms = {}
idx = 0
for phrase in phrases:
related_terms[phrase] = query_related_terms(phrase)
idx += 1
print '\rquerying', str(idx)+'/'+str(len(phrases)),
print ' '
with open('./data/dict_concept_related_terms.pickle', 'wb') as f:
print 'saving dict to' + './data/dict_concept_related_terms.pickle'
cPickle.dump(related_terms, f)