def prepare_set_of_lexicon():
    """Generate the lexicon files derived from the training text.

    Three lexicons are produced and each is written to its own file:

    * the full lexicon built from ``TRAINING_FILE`` -> ``LEX_FILE``
    * the cutoff lexicon -> ``CUTOFF_LEX_FILE``
    * the refined cutoff lexicon -> ``REFINED_CUTOFF_LEX_FILE``

    Both cutoff variants are derived from the full lexicon together with
    the same stop-word file.
    """
    stop_word_path = './files_from_outside/english.stop.txt'

    # Step 1: generate lexicon
    full_lexicon = lex.generate_lexicon_from_textfile(TRAINING_FILE)
    lex.write_lexicon_to_file(LEX_FILE, full_lexicon)

    # The cutoff variants share one shape: derive from the full lexicon and
    # the stop-word file, then write. Order is kept: plain cutoff first,
    # refined cutoff second.
    derived_lexicons = (
        (CUTOFF_LEX_FILE, lex.generate_cutoff_lexicon),
        (REFINED_CUTOFF_LEX_FILE, lex.generate_refined_cutoff_lexicon),
    )
    for target_file, build_lexicon in derived_lexicons:
        derived = build_lexicon(full_lexicon, stop_word_path)
        lex.write_lexicon_to_file(target_file, derived)
# NOTE(review): the statements down to ``output.write('0')`` are the tail of
# a method whose header (and the binding of ``self`` / ``output``) lies
# outside the visible text; they are kept token-for-token and only laid out
# and commented here.

# Every line written below has four tab-separated fields: 0, 0, and two
# symbols. Presumably these are source state, destination state, input
# symbol and output symbol of a textual FSM format -- TODO confirm.

# One arc per (class name, concept) pair: class name -> concept.
for (class_name, concepts) in self.class_and_concept.items():
    for concept in concepts:
        output.write('%d\t%d\t%s\t%s\n' % (0, 0, class_name, concept))

# '<unk>' is mapped to every known concept.
for concept in self.concepts:
    output.write('%d\t%d\t%s\t%s\n' % (0, 0, '<unk>', concept))

# Words that have no associated class are mapped to themselves.
for word in self.word_without_associated_class:
    output.write('%d\t%d\t%s\t%s\n' % (0, 0, word, word))

# Last line of the file: a lone '0' (presumably marks state 0 as final --
# TODO confirm against the FSM toolkit in use).
output.write('0')


if __name__ == '__main__':
    # Build the word-to-concept transducer and write it to disk.
    w2concept = W2Concept(CONCEPT_FILENAME)
    w2concept.set_word_without_associated_class('./w2class/word_without_class.txt')
    w2concept.write_w2concept_transducer('./w2concept/w2concept.fsm')

    # Extend the lexicon stored in LEX_FILE with every symbol the transducer
    # uses: concepts, class names, and the words without a class.
    # NOTE(review): ``.union`` is called on the value returned by
    # ``lex.read_lexicon_file``, so it is presumably a set -- confirm.
    lexicon = lex.read_lexicon_file(LEX_FILE)
    new_lexicon = w2concept.concepts.union(set(w2concept.class_and_concept.keys()))
    new_lexicon = new_lexicon.union(w2concept.word_without_associated_class)
    lexicon = lexicon.union(new_lexicon)

    # The merged lexicon overwrites LEX_FILE; it is passed as a list.
    lex.write_lexicon_to_file(LEX_FILE, list(lexicon))
def update_lexicon(self, filename):
    """Write the lexicon extended with the class names, then rewrite the transducer.

    The union of ``self.lexicon`` and ``self.class_name`` is written to
    *filename* as a list; ``self.lexicon`` itself is not reassigned.
    Afterwards the word-to-class transducer is written again from
    ``self.symbols``.

    Args:
        filename: path of the lexicon file to write.
    """
    lex.write_lexicon_to_file(
        filename,
        list(self.lexicon.union(self.class_name)),
    )
    # NOTE(review): the second argument is an empty string; what that means
    # is decided by write_w2class_transducer, which is not visible here.
    self.write_w2class_transducer(self.symbols, '')
def read_base_lex(filename):
    """Read a lexicon file that holds one entry per line.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one string per line of the file, in file order, each
        stripped of leading and trailing whitespace. Blank lines yield
        empty strings.
    """
    with open(filename) as f:
        return [line.strip() for line in f]


def write_lexicon_to_file(filename, lexicon):
    """Write a symbol table: one ``<symbol>\\t<index>`` line per entry.

    The special symbols ``<epsilon>`` and ``<unk>`` always come first
    (indices 0 and 1); the entries of *lexicon* follow in iteration order,
    numbered consecutively. Empty-string entries are skipped.

    Args:
        filename: path of the file to (over)write.
        lexicon: any iterable of symbols (list, set, tuple, ...). It is not
            modified. For an unordered collection such as a set, the index
            assigned to each symbol follows that collection's iteration
            order.
    """
    special_lex = ['<epsilon>', '<unk>']

    # Filter into a new list instead of calling ``lexicon.remove('')``:
    # this drops every empty string (not just the first), leaves the
    # caller's collection untouched, and works for sets as well as lists
    # (``list + set`` would raise TypeError).
    entries = [entry for entry in lexicon if entry != '']

    with open(filename, 'w') as output:
        for index, symbol in enumerate(special_lex + entries):
            output.write('%s\t%d\n' % (symbol, index))


if __name__ == '__main__':
    # Script entry point: build the lexicon from the training text and
    # write it to LEX_FILE.
    lexicon = lex.generate_lexicon_from_textfile(TRAINING_FILE)
    lex.write_lexicon_to_file(LEX_FILE, lexicon)