def fill_data(self, num_of_docs):
    """Accumulate verb -> noun co-occurrence counts from the corpus files.

    Walks the entries of ``self.path_to_corpus`` (in the arbitrary order
    returned by ``os.listdir``), loads each one into a ``Corpus`` and, for
    every word form tagged ``NOUN`` with dependency relation ``ROOT`` whose
    head word form is a ``VERB`` other than "be", increments
    ``self.data[verb_lemma][noun_lemma]``. Lemmas are lower-cased.

    Args:
        num_of_docs: Maximum number of directory entries to process. The
            method returns as soon as this many have been handled; a value
            that is never reached (e.g. negative, or larger than the
            number of entries) means every entry is processed.

    Returns:
        None. Results are accumulated in place in ``self.data``, a mapping
        of ``{verb_lemma: {noun_lemma: count}}``.
    """
    for num, file_name in enumerate(os.listdir(self.path_to_corpus)):
        if num == num_of_docs:
            return

        # Progress trace: index of the file about to be read and the number
        # of distinct head verbs collected so far. Written so that it parses
        # and prints the same text on both Python 2 and Python 3.
        print('%s %s' % (num, len(self.data)))

        corpus = Corpus()
        # os.path.join works whether or not path_to_corpus ends with a
        # path separator (plain string concatenation required one).
        corpus.fill(os.path.join(self.path_to_corpus, file_name))

        for sentence in corpus.sentences:
            wordforms = sentence.wordforms
            for wf_noun in wordforms:
                if wf_noun.upostag != 'NOUN' or wf_noun.deprel != 'ROOT':
                    continue

                # NOTE(review): ``head`` is used directly as an index into
                # ``sentence.wordforms``; confirm against Corpus that the
                # list is indexed the same way ``head`` is numbered.
                wf_verb = wordforms[int(wf_noun.head)]
                main_word = wf_verb.lemma.lower()
                if wf_verb.upostag != 'VERB' or main_word == 'be':
                    continue
                context = wf_noun.lemma.lower()

                # NOTE: an earlier, disabled variant of this extraction paired
                # a VERB head with its ADP dependent instead of a noun; it was
                # commented out in the original and is not reproduced here.

                # Direct dict lookups instead of ``x in d.keys()``, which
                # builds and linearly scans a list on Python 2.
                contexts = self.data.setdefault(main_word, {})
                contexts[context] = contexts.get(context, 0) + 1