forked from jonroberts/nasaMining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
145 lines (107 loc) · 4.61 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from __future__ import unicode_literals
import json
from gensim.models.phrases import Phrases
from textblob import TextBlob
import cPickle as pickle
import argparse
def parse_input(input_json, input_source=None):
print input_json, ': Tokenizing descriptions'
desc = []
doc_id = []
dataset = json.load(open(input_json))['dataset']
for i, ds in enumerate(dataset):
if input_source:
ds['source'] = input_source
text = TextBlob(ds['description'])
for sentence in text.sentences:
desc.append(sentence.tokens)
doc_id.append(i)
return dataset, desc, doc_id
def construct_ngrams(desc, desc_seed=None, phrase_passes=5, phrase_threshold=10, model_output=None):
models = []
desc_seed = desc_seed or []
print 'Constructing ngrams'
for i in range(phrase_passes):
print '\t', i
model = Phrases(desc + desc_seed if i == 0 else desc, threshold=phrase_threshold)
desc = model[desc]
models.append(model)
if model_output:
with open(model_output, 'wb+') as f:
pickle.dump(models, f)
return desc
def extract(ngrams, dataset, doc_id):
# extract keywords
print 'Extracting keywords'
for i, ngram in enumerate(ngrams):
doc = doc_id[i]
if field not in dataset[doc]:
dataset[doc][field] = set()
if doc > 0 and doc % 1000 == 0:
print '\t', doc
for kw in filter(lambda k: '_' in k, ngram):
keyword = kw.replace('_', ' ')
kw_tb = TextBlob(keyword)
# filter out punctuation, etc (make sure that there are two non-punc words)
if len(kw_tb.words) < 2:
continue
# add keywords which are all proper nouns
distinct_tags = set(t[1] for t in kw_tb.tags)
if distinct_tags - {'NNP', 'NNPS'} == {}:
dataset[doc][field].add(kw_tb.lower())
continue
# add noun phrases
for np in kw_tb.lower().noun_phrases:
dataset[doc][field].add(np)
return kw_set_to_list(dataset)
def kw_set_to_list(dataset):
    """Turn each record's keyword set into a cleaned, de-duplicated list.

    Uses the module-level `field` name (set in __main__) as the record
    key. Tokenization splits possessives into a separate " 's" token;
    this strips a trailing possessive, drops a leading one, and
    re-attaches interior ones so the phrases read naturally. The set is
    converted to a list so the records are JSON serializable.
    """
    for record in dataset:
        cleaned = []
        for phrase in list(record[field]):
            # drop a dangling possessive at the end ("mission 's" -> "mission")
            if phrase.endswith(" 's"):
                phrase = phrase[:-3]
            # drop a leading possessive ("'s goal" -> "goal")
            if phrase.startswith("'s "):
                phrase = phrase.replace("'s ", "", 1)
            # re-attach interior possessives ("earth 's crust" -> "earth's crust")
            phrase = phrase.replace(" 's", "'s")
            cleaned.append(phrase)
        # de-duplicate after cleanup, since fixes can merge phrases
        record[field] = list(set(cleaned))
    return dataset
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='SpaceTag keyword extraction')
    parser.add_argument('-i', "--input", type=str, default='data.json', help='path to a data.json format file')
    # FIX: help string had an unbalanced parenthesis.
    parser.add_argument('-s', "--source", type=str, default="", help='data source annotation (e.g. data.nasa.gov/data.json)')
    parser.add_argument('-o', "--output", type=str, default='keywords.json', help='path to output the data with extracted keywords')
    parser.add_argument('-f', "--field", type=str, default='description_ngram_np', help='field in each dataset to store the keywords')
    parser.add_argument('-m', "--model", type=str, default='models.pkl', help='file to save the pickled phrase extraction models')
    parser.add_argument('-p', "--passes", type=int, default=5, help="number of phrase extraction passes to make")
    parser.add_argument('-t', "--threshold", type=int, default=10, help="phrase extraction significance threshold")
    parser.add_argument("--seed", type=str, default=None, help="path to a data.json to seed the phrase extraction statistics with")
    args = parser.parse_args()
    input_json = args.input
    input_source = args.source
    seed_json = args.seed
    output_file = args.output
    model_output = args.model
    # NOTE: `field` must stay a module-level global — extract() and
    # kw_set_to_list() read it directly.
    field = args.field
    phrase_passes = args.passes
    phrase_threshold = args.threshold
    # parse input data
    dataset, desc, doc_id = parse_input(input_json, input_source)
    # optionally parse secondary data to seed the phrase statistics
    desc_seed = []
    if seed_json:
        _, desc_seed, _ = parse_input(seed_json)
    ngrams = construct_ngrams(desc, desc_seed, phrase_passes, phrase_threshold, model_output)
    dataset = extract(ngrams, dataset, doc_id)
    with open(output_file, 'w') as f:
        json.dump(dataset, f)