-
Notifications
You must be signed in to change notification settings - Fork 1
/
stemmed_index.py
52 lines (42 loc) · 1.75 KB
/
stemmed_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pickle
from string import punctuation
import os
from collections import defaultdict
import nltk
path = "/Users/fathimakhazana/Documents/IRFinalProject/cacm_stem.txt"
corpusdir = "/Users/fathimakhazana/Documents/IRFinalProject/ParsedFiles/"
def remove_punctuations(content):
    """Clean raw document text for tokenization.

    Three passes: (1) drop a fixed set of bracket/operator characters
    outright, (2) strip all punctuation except '-' from token edges,
    dropping tokens that become empty, (3) delete any remaining
    apostrophes inside tokens. Whitespace is collapsed to single spaces.
    """
    banned = '!}{][)(\><=#"$%&,/*`\''
    filtered = ''.join(ch for ch in content if ch not in banned)

    # Punctuation to strip from token edges; hyphens are kept so that
    # compound terms like "a-test" survive intact.
    edge_chars = punctuation.replace('-', "")

    stripped_tokens = []
    for tok in filtered.split():
        core = tok.strip(edge_chars)
        if core:
            stripped_tokens.append(core)

    cleaned_tokens = []
    for tok in ' '.join(stripped_tokens).split():
        no_apostrophe = tok.replace("'", "")
        if no_apostrophe:
            cleaned_tokens.append(no_apostrophe)

    return ' '.join(cleaned_tokens)
def get_Tokens(index, stemmed_dict):
    """Tokenize one document's cleaned text with NLTK.

    `index` is the 0-based position in the sorted file listing; the
    corpus dict is keyed by 1-based document id, hence the +1 shift.
    """
    return nltk.word_tokenize(stemmed_dict[index + 1])
def create_corpus_dict(stem, num_docs=3204):
    """Build a {doc_id: cleaned_text} mapping for the corpus.

    Args:
        stem: sequence whose positions 1..num_docs hold raw document
            text (position 0 is the residue before the first '#'
            separator in the source file, so it is skipped).
        num_docs: number of documents to process. Defaults to 3204,
            the CACM corpus size the original hard-coded as range(1, 3205).

    Returns:
        dict mapping 1-based document id -> punctuation-stripped text.
    """
    return {doc_id: remove_punctuations(stem[doc_id])
            for doc_id in range(1, num_docs + 1)}
def create_index(stemmed_dict):
    """Build an inverted unigram index over the parsed corpus files.

    Document order is the sorted listing of '.txt' files in `corpusdir`;
    the i-th file is paired with document i+1 in `stemmed_dict` (via
    get_Tokens). Returns a defaultdict mapping each term to a list of
    [filename, term_frequency] postings.
    """
    txt_files = sorted(name for name in os.listdir(corpusdir)
                       if name.endswith('.txt'))
    inverted = defaultdict(list)
    for doc_pos, doc_name in enumerate(txt_files):
        # Per-document term frequencies for this file.
        freq = defaultdict(int)
        for term in get_Tokens(doc_pos, stemmed_dict):
            freq[term] += 1
        for term in freq.keys():
            inverted[term].append([doc_name, freq[term]])
    return inverted
def main():
    """Build the stemmed inverted index and pickle it to disk.

    Reads the '#'-separated stemmed corpus from `path`, cleans each
    document, builds the inverted index over the files in `corpusdir`,
    and serializes the result to 'stemmed_indexer.pickle'.
    """
    # Fix: the original leaked the input file handle
    # (open(path, "r").read() with no close) — use a context manager.
    with open(path, "r") as src:
        stemmed_corpus = src.read().split('#')
    stemmed_dict = create_corpus_dict(stemmed_corpus)
    stemmed_index = create_index(stemmed_dict)
    with open('stemmed_indexer.pickle', 'wb') as f:
        pickle.dump(stemmed_index, f)


if __name__ == "__main__":
    main()