/
build_stemmed_corpus.py
41 lines (30 loc) · 1.11 KB
/
build_stemmed_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
'''
builds a stemmed corpus file
'''
import sys
from cancer_reader import CancerReader
def load_terms_file(terms_file):
word_id_map = {}
with open(terms_file, 'r') as terms_file_handle:
for i, term in enumerate(terms_file_handle):
word_id_map[term.strip()] = i
return word_id_map
if __name__ == '__main__':
terms_id_map = load_terms_file(sys.argv[1])
for thread in CancerReader.parse(sys.argv[2], stem = True):
for post in thread.posts:
stemmed_terms = post.content.split()
stemmed_id_terms = [terms_id_map[t] for t in stemmed_terms if t in terms_id_map]
stemmed_id_terms_str = ' '.join(map(str, stemmed_id_terms))
print '%(post_id)d|%(post_time_str)s|%(forum_id)d|%(thread_id)d|%(author_id)d|%(author_name)s|%(first_post_marker)d|%(title)s|%(content)s' %\
{
'post_id': post.post_id,
'post_time_str': post.post_time_str,
'forum_id': post.forum_id,
'thread_id': post.thread_id,
'author_id': post.author_id,
'author_name': post.author_name,
'first_post_marker': post.is_first_post,
'title': post.title,
'content': stemmed_id_terms_str
}