/
pubmed_text_processor.py
94 lines (75 loc) · 2.76 KB
/
pubmed_text_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import nltk.tokenize
#!/usr/bin/env python
import nltk
##from nltk.corpus import brown
import fileinput
import keywords_extractor as kwe
__author__="ilya"
__date__ ="$23.10.2012 15:53:48$"
# Part-of-speech tags counted as nouns when tallying term frequencies.
# NOTE(review): name keeps the original 'assepted' misspelling because
# sibling functions reference it by this exact name.
assepted_tags = ['NN', 'N', 'NNS', 'NNP']#['NN', 'NNP', 'N', 'NNS']
# Terms the user has rejected; any term found here is pruned from bacteria_words.
blacklisted_words = []
# Terms the user has approved; consulted during interactive pruning.
accepted_words = []
# Maps term -> number of processed texts it appeared in (document frequency).
bacteria_words = dict()
def calculate_frequences(text):
    """Tally noun terms of *text* into the module-level ``bacteria_words`` dict.

    Each distinct token whose POS tag is in ``assepted_tags`` increments its
    document-frequency count by exactly 1 per call, regardless of how many
    times it occurs in *text*.  Finishes by pruning unwanted terms via
    ``remove_notneeded_words(0)`` (non-interactive mode).
    """
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    # A set naturally expresses "count each term at most once per document";
    # the original used a dict with dummy values for this.
    seen = set()
    for word, tag in tags:
        if tag in assepted_tags:
            seen.add(word)
    for word in seen:
        # dict.get with a default replaces the original bare `except:`,
        # which would have swallowed any error, not just a missing key.
        bacteria_words[word] = bacteria_words.get(word, 0) + 1
    remove_notneeded_words(0)
#Removes unneeded words from bacteria_words dictionary
def remove_notneeded_words(ask_for_assistamce_flag):
global bacteria_words
read_accepted_words('accepted_words.txt')
for entity in bacteria_words :
if entity in blacklisted_words :
bacteria_words = removekey(bacteria_words, entity)
#if len(entity) < 4 :
# bacteria_words = removekey(bacteria_words, str(entity) )
else :
if len(entity) < 4 :
bacteria_words = removekey(bacteria_words, str(entity) )
continue
if ask_for_assistamce_flag == 1 :
if entity not in accepted_words :
#Ask user if he wants to add a word into blacklisted_words dictionary
print "A term '" + str(entity) +"' is not in blacklisted words. Should I put it to them (y/n)?"
s = raw_input('--> ')
if s == 'y' :
bacteria_words = removekey(bacteria_words, str(entity) )
blacklisted_words.append(str(entity).lower())
continue
accepted_words.append(str(entity).lower())
def removekey(d, key):
    """Return a shallow copy of dict *d* with *key* removed.

    Does not mutate *d*.  Raises ``KeyError`` if *key* is absent,
    matching plain ``del`` semantics.
    """
    trimmed = dict(d)
    trimmed.pop(key)  # no default supplied -> KeyError on a missing key
    return trimmed
def read_blacklisted_words(filename):
    """Append each line of *filename*, lowercased, to ``blacklisted_words``.

    One word per line is expected.  Uses ``rstrip('\\n')`` instead of the
    original ``line[:len(line)-1]``, which silently chopped the final
    character of a last line that lacked a trailing newline.
    """
    global blacklisted_words
    # Context manager closes the handle; the original fileinput stream was
    # never explicitly closed.
    with open(filename) as f:
        for line in f:
            blacklisted_words.append(line.rstrip('\n').lower())
def read_accepted_words(filename):
    """Append each line of *filename*, lowercased, to ``accepted_words``.

    One word per line is expected.  Uses ``rstrip('\\n')`` instead of the
    original ``line[:len(line)-1]``, which silently chopped the final
    character of a last line that lacked a trailing newline.
    """
    global accepted_words
    # Context manager closes the handle; the original fileinput stream was
    # never explicitly closed.
    with open(filename) as f:
        for line in f:
            accepted_words.append(line.rstrip('\n').lower())
def write_blacklisted_words(filename):
    """Write ``blacklisted_words`` to *filename*, one word per line.

    Overwrites any existing file.  The original leaked the file handle
    (no close), so the data was only flushed at interpreter shutdown;
    ``with`` guarantees flush + close.
    """
    global blacklisted_words
    with open(filename, 'w') as f:
        for item in blacklisted_words:
            f.write(item + '\n')
def write_accepted_words(filename):
    """Write ``accepted_words`` to *filename*, one word per line.

    Overwrites any existing file.  The original leaked the file handle
    (no close), so the data was only flushed at interpreter shutdown;
    ``with`` guarantees flush + close.
    """
    global accepted_words
    with open(filename, 'w') as f:
        for item in accepted_words:
            f.write(item + '\n')
def calculate_index_words(text):
    """Extract keyword/index terms from *text* via the keywords_extractor module.

    Runs the extractor's three-step pipeline: candidate generation,
    co-occurrence matrix construction, then final keyword selection.
    NOTE(review): presumably ``kwe`` keeps pipeline state between these calls,
    so the call order below matters — confirm against keywords_extractor.
    """
    #print text
    kwe.make_keyword_candidates(text)
    kwe.make_cooccurrence_matrix()
    return kwe.make_keywords_list()