Example no. 1
    def init_language_models(self,
                             language_model=None,
                             pos_language_model=None,
                             edit_language_model=None):
        print "Init language models ..."
        pos = True
        clean_model_dir = os.path.dirname(os.path.realpath(__file__)) +\
            "/../data/lm_corpora"
        if language_model:
            self.lm = language_model
        else:
            print "No language model specified, using default switchboard one"
            lm_corpus_file = open(clean_model_dir +
                                  "/swbd_disf_train_1_clean.text")
            lines = [
                line.strip("\n").split(",")[1] for line in lm_corpus_file
                if "POS," not in line and not line.strip("\n") == ""
            ]
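            # 90/10 split into training and held-out corpora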
            split = int(0.9 * len(lines))
            lm_corpus = "\n".join(lines[:split])
            heldout_lm_corpus = "\n".join(lines[split:])
            lm_corpus_file.close()
            self.lm = KneserNeySmoothingModel(order=3,
                                              discount=0.7,
                                              partial_words=True,
                                              train_corpus=lm_corpus,
                                              heldout_corpus=heldout_lm_corpus,
                                              second_corpus=None)
        if pos_language_model:
            self.pos_lm = pos_language_model
        elif pos:
            print "No pos language model specified, \
            using default switchboard one"

            lm_corpus_file = open(clean_model_dir +
                                  "/swbd_disf_train_1_clean.text")
            lines = [
                line.strip("\n").split(",")[1] for line in lm_corpus_file
                if "POS," in line and not line.strip("\n") == ""
            ]
            split = int(0.9 * len(lines))
            lm_corpus = "\n".join(lines[:split])
            heldout_lm_corpus = "\n".join(lines[split:])
            lm_corpus_file.close()
            self.pos_lm = KneserNeySmoothingModel(
                order=3,
                discount=0.7,
                partial_words=True,
                train_corpus=lm_corpus,
                heldout_corpus=heldout_lm_corpus,
                second_corpus=None)
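
A minimal usage sketch (not part of the original source): it shows how a caller could train its own word model and pass it in through the language_model argument so the default switchboard corpus is skipped. The tagger object, the corpus filename, and the availability of KneserNeySmoothingModel in the calling scope are assumptions; the keyword arguments simply mirror the calls in the method above.

# Usage sketch. Assumptions: `tagger` is an instance exposing
# init_language_models, KneserNeySmoothingModel has been imported from the
# project's language model module, and "my_corpus.text" is a placeholder
# file with one utterance per line.
corpus_file = open("my_corpus.text")
lines = [line.strip("\n") for line in corpus_file if line.strip("\n") != ""]
corpus_file.close()
split = int(0.9 * len(lines))
custom_lm = KneserNeySmoothingModel(order=3,
                                    discount=0.7,
                                    partial_words=True,
                                    train_corpus="\n".join(lines[:split]),
                                    heldout_corpus="\n".join(lines[split:]),
                                    second_corpus=None)
tagger.init_language_models(language_model=custom_lm)
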
    if args.edit_input:
        print "Training edit term Language Model..."
        # stays the same across folds
        edit_lm_corpus_file = open(args.cleanModelDir +
                                   "/swbd_disf_train_1_edit.text")
        edit_lines = [
            line.strip("\n").split(",")[1] for line in edit_lm_corpus_file
            if not "POS," in line and not line.strip("\n") == ""
        ]
        edit_split = int(0.9 * len(edit_lines))
        edit_lm_corpus = "\n".join(edit_lines[:edit_split])
        heldout_edit_lm_corpus = "\n".join(edit_lines[edit_split:])
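        # bigram (order=2) model over the edit-term corpus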
        edit_lm = KneserNeySmoothingModel(
            train_corpus=edit_lm_corpus,
            heldout_corpus=heldout_edit_lm_corpus,
            order=2,
            discount=discount)

    dialogues = sorted(load_data_from_corpus_file(args.corpusFile))
    num_folds = 10
    fold_size = int(len(dialogues) / num_folds)  # 10-fold cross-validation
    print "fold_size", fold_size
    folds = {}
    lm_corpus = {}  # word language model corpus for each fold
    pos_lm_corpus = {}  # POS tag language model corpus for each fold

    #1. From the dialogues get the language model strings in the fold
    #and the ranges for the output file/vectors
    previous_split = 0
    split = fold_size