generateRelevantWords.py
__author__ = 'mrtyormaa'
#
# Imports
#
from nltk.compat import raw_input
from scipy.io import mmread
import operator
import json
from gensim import corpora
import logging
#
# Logging configuration
#
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('text_mining_logger')
#
# Pre-processing: load the dictionary and the TFIDF matrix.
#
# Retrieve the dictionary.
#
logger.info("Loading the dictionary.")
dictionary = corpora.Dictionary.load('files/pubMed-dictionary.dict')
#
# Get the word <-> id mapping from the dictionary.
#
word_to_id = dictionary.token2id
#
# Inverse dictionary (id -> word). This makes later lookups by id easy.
#
id_to_word = {val: key for key, val in word_to_id.items()}
#
# Retrieve the TFIDF
#
logger.info("Loading the TFIDF.")
tfidf_matrix = mmread('files/pubMed-tfidf.mm')
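#
# Note: for a sparse Matrix Market file, mmread returns a scipy COO matrix whose
# parallel arrays .row (document ids), .col (word ids) and .data (TFIDF values)
# are what the lookup functions below iterate over. A small illustrative sketch
# (hypothetical values, not taken from the real matrix):
#
#   tfidf_matrix.row  -> array([0, 0, 1, ...])        # document ids
#   tfidf_matrix.col  -> array([12, 98, 12, ...])     # word ids
#   tfidf_matrix.data -> array([0.4, 0.1, 0.7, ...])  # TFIDF weights
#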
#
# User input
#
print("Enter a word for which you want co-occurance: ")
user_word = raw_input()
print("How many layers of output do you want to generate(0-5): ")
level = int(raw_input())
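#
# Optional guard (not part of the original flow): the recursion below only stops
# descending when lev == level - 1, so a level of 0 never satisfies that check
# and the recursion only ends once visited_words exhausts the candidate words.
# A minimal clamp, if desired:
#
# level = max(1, min(level, 5))
#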
#
# Returns all the documents (matrix row indices) that contain the given word.
#
def fetchAllDocuments(param):
    docs = []
    word_id = word_to_id[param]
    counter = 0
    for col in tfidf_matrix.col:
        if col == word_id:
            docs.append(tfidf_matrix.row[counter])
        counter += 1
    return docs
# ** End of Function **
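#
# An equivalent, typically faster lookup using numpy boolean indexing on the COO
# arrays (a sketch only, assuming tfidf_matrix stays a scipy coo_matrix):
#
# def fetchAllDocuments(param):
#     return list(tfidf_matrix.row[tfidf_matrix.col == word_to_id[param]])
#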
#
# Returns all (word id, TFIDF value) pairs for the given document.
#
def fetchAllWords(docId):
    word_list_per_document = []
    counter = 0
    for row in tfidf_matrix.row:
        if docId == row:
            word_list_per_document.append([tfidf_matrix.col[counter], tfidf_matrix.data[counter]])
        counter = counter + 1
    return word_list_per_document
# ** End of Function **
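#
# The same boolean-indexing idea applies here (a sketch; note it yields tuples
# rather than two-element lists, which the caller treats the same way):
#
# def fetchAllWords(docId):
#     mask = tfidf_matrix.row == docId
#     return list(zip(tfidf_matrix.col[mask], tfidf_matrix.data[mask]))
#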
#
# Recursively generates the tree of relevant words for the given word.
#
def generateRelevantWords(param, lev):
    docs = fetchAllDocuments(param)
    relevant_words = {}
    for doc in docs:
        data = fetchAllWords(doc)
        for dat in data:
            #
            # Ignore words that were already visited, to avoid repetition.
            #
            if id_to_word[dat[0]] in visited_words:
                continue
            if dat[0] not in relevant_words:
                relevant_words[dat[0]] = 0.0
            relevant_words[dat[0]] += dat[1]
    #
    # Base case of the recursion: if no relevant words are found, return an empty list.
    #
    if not relevant_words:
        return []
    #
    # Sort the words in decreasing order of their accumulated TFIDF values.
    # This gives (word id, tfidf) tuples.
    #
    word_tfidf_tuples = sorted(relevant_words.items(), key=operator.itemgetter(1), reverse=True)
    #
    # Limit the relevant words to the top 5 for each word.
    #
    word_tfidf_tuples = word_tfidf_tuples[0:5]
    for word in word_tfidf_tuples:
        visited_words.append(id_to_word[word[0]])
    tree = []
    for word in word_tfidf_tuples:
        current_word = [id_to_word[word[0]]]
        topic = {}
        topic['words'] = current_word
        topic['name'] = 'Topic_0'
        if lev == level - 1:
            tree.append(topic)
        else:
            #
            # Go one level deeper.
            #
            result = generateRelevantWords(id_to_word[word[0]], lev + 1)
            if result:
                topic['children'] = result
            tree.append(topic)
    return tree
# ** End of Function **
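#
# Note: visited_words (defined below) is shared module-level state, so each word
# is pruned globally across the whole tree, not just within a single branch.
#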
json_response = {}
visited_words = []
visited_words.append(user_word)
json_response['name'] = user_word
json_response['words'] = [user_word]
logger.info("Generating the Relevant Words.")
json_response['children'] = generateRelevantWords(user_word, 0)
logger.info("Saving the JSON file.")
with open('/Users/mrtyormaa/Sites/Hierarchie/app/data/pub-med.json', 'w') as outfile:
    json.dump(json_response, outfile)
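#
# For reference, the saved JSON has a nested shape like the following
# (illustrative words only, not real output):
#
# {
#   "name": "cancer",
#   "words": ["cancer"],
#   "children": [
#     {"words": ["tumor"], "name": "Topic_0",
#      "children": [{"words": ["cell"], "name": "Topic_0"}, ...]},
#     ...
#   ]
# }
#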