Exemple #1
0
    def fill_data(self, num_of_docs):
        """Accumulate verb/noun lemma co-occurrence counts into ``self.data``.

        Walks the entries that ``os.listdir`` returns for
        ``self.path_to_corpus``, loads each one into a fresh ``Corpus`` and
        scans every wordform of every sentence.  A pair is counted when:

        * the wordform has ``upostag == 'NOUN'`` and ``deprel == 'ROOT'``;
        * the wordform found at ``sentence.wordforms[int(noun.head)]`` has
          ``upostag == 'VERB'`` and its lower-cased lemma is not ``'be'``.

        For each such pair ``self.data[verb_lemma][noun_lemma]`` is
        incremented by one (both lemmas lower-cased), creating the nested
        entries on first use.

        Args:
            num_of_docs: Number of directory entries to process before
                stopping.  The check is an equality test against the count
                of entries already handled, so ``0`` processes nothing and
                a value that is never reached (e.g. negative, or larger
                than the directory) processes every entry.

        Returns:
            None.  Results are written into ``self.data`` in place.
        """
        corpus_dir = self.path_to_corpus
        for num, file_name in enumerate(os.listdir(corpus_dir)):
            if num == num_of_docs:
                return
            # Progress line: entries handled so far, distinct verbs collected.
            # A single pre-formatted string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print('%s %s' % (num, len(self.data)))

            corpus = Corpus()
            # os.path.join (not string concatenation) so the path is correct
            # whether or not path_to_corpus ends with a separator.
            corpus.fill(os.path.join(corpus_dir, file_name))

            for sentence in corpus.sentences:
                wordforms = sentence.wordforms
                for wf_noun in wordforms:
                    if wf_noun.upostag != 'NOUN' or wf_noun.deprel != 'ROOT':
                        continue
                    # NOTE(review): ``head`` is used directly as an index
                    # into ``wordforms``; confirm that the list's indexing
                    # matches the head numbering of the corpus format.
                    wf_verb = wordforms[int(wf_noun.head)]
                    if wf_verb.upostag != 'VERB':
                        continue
                    main_word = wf_verb.lemma.lower()
                    if main_word == 'be':
                        continue
                    context = wf_noun.lemma.lower()

                    # Direct dict operations instead of ``in d.keys()``,
                    # which builds and scans a list on Python 2.
                    contexts = self.data.setdefault(main_word, {})
                    contexts[context] = contexts.get(context, 0) + 1