Beispiel #1
0
def prepare_set_of_lexicon():
    """Generate the base, cutoff and refined-cutoff lexicons and write each one out.

    The base lexicon is generated from TRAINING_FILE and written to LEX_FILE.
    The cutoff and refined-cutoff lexicons are both generated from that base
    lexicon together with the english.stop.txt file, and are written to
    CUTOFF_LEX_FILE and REFINED_CUTOFF_LEX_FILE respectively.
    """
    stop_file = './files_from_outside/english.stop.txt'

    # Base lexicon, taken directly from the training text.
    base_lexicon = lex.generate_lexicon_from_textfile(TRAINING_FILE)
    lex.write_lexicon_to_file(LEX_FILE, base_lexicon)

    # Cutoff variant of the base lexicon.
    cutoff = lex.generate_cutoff_lexicon(base_lexicon, stop_file)
    lex.write_lexicon_to_file(CUTOFF_LEX_FILE, cutoff)

    # Refined cutoff variant, also generated from the base lexicon.
    refined_cutoff = lex.generate_refined_cutoff_lexicon(base_lexicon, stop_file)
    lex.write_lexicon_to_file(REFINED_CUTOFF_LEX_FILE, refined_cutoff)
            for (class_name, concepts) in self.class_and_concept.items():
                for concept in concepts:
                    output.write('%d\t%d\t%s\t%s\n' % (0, 0, class_name, concept))

            # <unk> to every possible concepts
            for concept in self.concepts:
                output.write('%d\t%d\t%s\t%s\n' % (0, 0, '<unk>', concept))

            # word_without_associated_class to the same word
            for word in self.word_without_associated_class:
                output.write('%d\t%d\t%s\t%s\n' % (0, 0, word, word))

            # write last line
            output.write('0')

if __name__ == '__main__':
    # Build the word-to-concept transducer and write it to disk.
    w2concept = W2Concept(CONCEPT_FILENAME)
    w2concept.set_word_without_associated_class('./w2class/word_without_class.txt')
    w2concept.write_w2concept_transducer('./w2concept/w2concept.fsm')

    # Extend the stored lexicon with every symbol the transducer uses:
    # the concepts, the class names and the words that have no class.
    lexicon = lex.read_lexicon_file(LEX_FILE)
    new_lexicon = w2concept.concepts.union(
        set(w2concept.class_and_concept.keys())
    ).union(
        w2concept.word_without_associated_class
    )
    lexicon = lexicon.union(new_lexicon)
    lex.write_lexicon_to_file(LEX_FILE, list(lexicon))





    def update_lexicon(self, filename):
        """Write the lexicon extended with the class names, then write the w2class transducer.

        The union of ``self.lexicon`` and ``self.class_name`` is written to
        ``filename`` through ``lex.write_lexicon_to_file``; ``self.lexicon``
        itself is not reassigned here.
        """
        merged = self.lexicon.union(self.class_name)
        entries = list(merged)
        lex.write_lexicon_to_file(filename, entries)

        # NOTE(review): the second argument is an empty string in the original; its meaning
        # is defined by write_w2class_transducer, which is not visible here.
        self.write_w2class_transducer(self.symbols, '')
def read_base_lex(filename):
    """Return the lines of ``filename`` as a list, each stripped of surrounding whitespace.

    Blank lines are kept and appear as empty strings in the result.
    """
    with open(filename) as source:
        return [raw_line.strip() for raw_line in source]

def write_lexicon_to_file(filename, lexicon):
    """Write a symbol table with one tab-separated "symbol, index" line per entry.

    The special symbols ``<epsilon>`` and ``<unk>`` are always written first,
    with indices 0 and 1. The entries of ``lexicon`` follow in iteration
    order, numbered consecutively from 2.

    Args:
        filename: Path of the file to create or overwrite.
        lexicon: Iterable of symbol strings; both lists and sets are accepted.
            Empty strings are skipped. The argument itself is not modified.
    """
    special_lex = ['<epsilon>', '<unk>']

    # Build a filtered copy instead of calling lexicon.remove(''): this drops
    # every empty entry (not just the first), works for sets as well as lists,
    # and leaves the caller's collection untouched.
    entries = special_lex + [symbol for symbol in lexicon if symbol != '']

    with open(filename, 'w') as output:
        for index, symbol in enumerate(entries):
            output.write('%s\t%d\n' % (symbol, index))

if __name__ == '__main__':
    # Script entry point: generate a lexicon from TRAINING_FILE and write it to LEX_FILE.
    lexicon = lex.generate_lexicon_from_textfile(TRAINING_FILE)
    lex.write_lexicon_to_file(LEX_FILE, lexicon)