# ngramSupportWriter.py
# Forked from EduardoCarvalho/nltkPhraseDetector.
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer
from itertools import izip, chain
from enviroment_vars import ReportEnviroments
class NGramSupportWriter(object):
    """Load, merge and persist per-topic n-gram lists kept in a text file.

    The on-disk layout is the one produced by ``write_ngrams_in_a_file``:
    a topic name surrounded by blank lines, followed by one n-gram per line.
    """

    def take_ngrams_by_topic_from_file(self,
                                       ngram_directory,
                                       ngram_file):
        """Read ``ngram_file`` under ``ngram_directory`` into a topic mapping.

        Paragraphs returned by the corpus reader are consumed in pairs:
        the even-indexed ones supply topic names and the odd-indexed ones
        supply the n-grams of the preceding topic.

        Returns a dict whose keys are the topic names encoded as UTF-8 and
        whose values are lists of the distinct tokens found in the matching
        n-gram paragraph (list order is unspecified because it comes from a
        set).
        """
        corpus = TaggedCorpusReader(
            ngram_directory,
            ngram_file,
            sent_tokenizer=LineTokenizer(blanklines='discard'),
            encoding='utf-8')
        corpus_paras = corpus.paras()[:]

        # Topic paragraphs are nested two levels deep (presumably
        # paragraph -> sentence -> token), so flatten twice to get the
        # bare topic names.
        # NOTE(review): a topic name that tokenizes into several tokens
        # would misalign topics and n-gram paragraphs -- confirm the file
        # only ever holds single-token topic names.
        topics = corpus_paras[::2]
        for _ in range(2):
            topics = list(chain.from_iterable(topics))

        ngram_paras = corpus_paras[1::2]

        # When a topic occurs more than once, the last occurrence wins.
        return {topic.encode('utf-8'): list(set(chain.from_iterable(para)))
                for topic, para in zip(topics, ngram_paras)}

    def merge_run_time_and_ngrams_from_file(self,
                                            ngrams_by_topic_from_file,
                                            run_time_ngrams_by_topic):
        """Union the stored n-grams with the ones collected at run time.

        Only topics present in ``ngrams_by_topic_from_file`` appear in the
        result; topics that exist solely in ``run_time_ngrams_by_topic``
        are ignored. A topic with no run-time entry keeps just its stored
        n-grams instead of raising ``KeyError``.

        Returns a new dict mapping each topic to a list of distinct
        n-grams (list order is unspecified).
        """
        merged = {}
        for topic, file_ngrams in ngrams_by_topic_from_file.items():
            run_time_ngrams = run_time_ngrams_by_topic.get(topic, ())
            merged[topic] = list(set(file_ngrams).union(run_time_ngrams))
        return merged

    def write_ngrams_in_a_file(self,
                               ngram_directory,
                               ngram_file,
                               ngram_content):
        """Overwrite the n-gram file with the contents of ``ngram_content``.

        The target path is the plain concatenation of ``ngram_directory``
        and ``ngram_file``, so ``ngram_directory`` has to end with a path
        separator. Each topic is written surrounded by blank lines,
        followed by its n-grams, one UTF-8 encoded entry per line.
        """
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(ngram_directory + ngram_file, 'w') as f:
            for topic, ngrams in ngram_content.items():
                f.write('\n\n' + topic + '\n\n')
                for unigram in ngrams:
                    f.write(unigram.encode('utf-8') + '\n')