Example #1
    def initialize_detector(self):
        t1 = time.time()
        try:
            import kenlm
            self.lm = kenlm.Model(self.language_model_path)
            logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.language_model_path, str(time.time() - t1)))
            t1 = time.time()
            self.lm_word = kenlm.Model(self.language_word_model_path)
            logger.debug(
                'Loaded language model: %s, spend: %s s' %
                (self.language_word_model_path, str(time.time() - t1)))
        except ImportError:
            raise ImportError(
                'mypycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please run "pip install kenlm" to install it. '
                'On Windows, install kenlm under Cygwin.')
        t1 = time.time()
        # homophone dictionary
        self.word_similar = readjson(config.word_similar_path)
        # character frequency dict
        self.char_freq = self.load_word_freq_dict(config.char_freq_path)
        self.custom_word_freq = {}

        detector_dict = DetectorDict()
        detector_dict.check_detector_dict_initialized()
        self.jieba_tokenizer = detector_dict.tokenizer
        self.word_freq = detector_dict.word_freq
        logger.debug('Loaded file: %s, spend: %s s' %
                     (config.word_freq_path, str(time.time() - t1)))
        self.initialized_detector = True
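
For reference, these are the core KenLM scoring calls the examples on this page rely on (a minimal sketch; the model path is illustrative):

import kenlm

lm = kenlm.Model('lm.binary')  # accepts ARPA or binary model files
log10_prob = lm.score('this is a sentence', bos=True, eos=True)  # log10 probability
ppl = lm.perplexity('this is a sentence')  # per-word perplexity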
Example #2
    def __init__(self, resources, n=5):

        self.gigaword_char = kenlm.Model(resources["gigaword_char"])
        self.gigaword_word = kenlm.Model(resources["gigaword_word"])
        self.stackoverflow_char = kenlm.Model(resources["stackoverflow_char"])
        self.stackoverflow_word = kenlm.Model(resources["stackoverflow_word"])

        self.N = n
        self.binner = GaussianBinner(100)
Example #3
def main():
    global rrp
    trainArticles = pd.preprocessData("balancedTrainingData.dat",
                                      "balancedTrainingDataLabels.dat")
    perplexity_true = []
    perplexity_fake = []
    perplexity = []

    tri_model = kenlm.Model('fake_pos_2g.binary')
    quad_model = kenlm.Model('fake_pos_3g.binary')
    five_model = kenlm.Model('fake_pos_4g.binary')

    ratioTriQuad = []
    ratioTriFive = []

    for article in trainArticles:
        num_sentences = article.numberOfSentences
        tri_score = 0
        quad_score = 0
        five_score = 0

        for sentence in article.allSentences:
            pos_tags = rrp.tag(sentence.string)
            words, tags = zip(*pos_tags)
            sentence.string = ' '.join(tags)
            tri_score += float(tri_model.perplexity(sentence.string))
            quad_score += float(quad_model.perplexity(sentence.string))
            five_score += float(five_model.perplexity(sentence.string))

        tri_score = float(tri_score) / num_sentences
        quad_score = float(quad_score) / num_sentences
        five_score = float(five_score) / num_sentences

        ratioTriQuad.append(float(quad_score) / tri_score)
        ratioTriFive.append(float(five_score) / tri_score)

        # perplexity.append(score)
        # if article.label == 0:
        #     perplexity_fake.append(score)
        # else:
        #     perplexity_true.append(score)

    fp = open('ratioBiTri_pos_train.txt', 'w')

    for item in ratioTriQuad:
        fp.write(str(item))
        fp.write('\n')

    fp.close()

    fp = open('ratioBiQuad_pos_train.txt', 'w')

    for item in ratioTriFive:
        fp.write(str(item))
        fp.write('\n')

    fp.close()
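
Averaging per-sentence perplexities, as above, is a heuristic; a corpus-level perplexity can instead be derived from the total log10 probability. A sketch, assuming `sentences` is a list of whitespace-tokenized strings:

import kenlm

model = kenlm.Model('fake_pos_2g.binary')
log10_total, word_count = 0.0, 0
for sent in sentences:
    log10_total += model.score(sent, bos=True, eos=True)
    word_count += len(sent.split()) + 1  # +1 for the implicit </s>
corpus_ppl = 10.0 ** (-log10_total / word_count)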
Example #4
    def _load_lms(self, char_lm_dir, word_lm_dir):
        config = kenlm.Config()
        config.show_progress = False
        config.arpa_complain = kenlm.ARPALoadComplain.NONE

        for label in self._labels:
            char_lm_path = Path(char_lm_dir, '{}.arpa'.format(label))
            word_lm_path = Path(word_lm_dir, '{}.arpa'.format(label))
            self._char_lms[label] = kenlm.Model(str(char_lm_path), config)
            self._word_lms[label] = kenlm.Model(str(word_lm_path), config)
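
Loading .arpa files is relatively slow, which is why the config above silences ARPA load complaints; converting the models to KenLM's binary format (as later examples do with build_binary) sidesteps both issues. An illustrative sketch, assuming build_binary is on the PATH:

import subprocess
import kenlm

# one-off conversion with the build_binary tool shipped with KenLM
subprocess.run(['build_binary', 'char.arpa', 'char.binary'], check=True)
model = kenlm.Model('char.binary')  # binary models load much faster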
Example #5
    def __init__(self, data_dir, use_posthoc_correction=True):
        self.use_posthoc_correction = use_posthoc_correction
        self.data_dir = data_dir

        lm_title_abstracts = kenlm.Model(
            os.path.join(data_dir, 'titles_abstracts_lm.binary'))
        lm_authors = kenlm.Model(os.path.join(data_dir, 'authors_lm.binary'))
        lm_venues = kenlm.Model(os.path.join(data_dir, 'venues_lm.binary'))
        self.lms = (lm_title_abstracts, lm_authors, lm_venues)

        with open(os.path.join(data_dir, 'lightgbm_model.pickle'), 'rb') as f:
            self.model = pickle.load(f)
Example #6
    def __init__(
        self,
        cfg: UnpairedAudioTextConfig,
        source_dictionary=None,
        target_dictionary=None,
    ):
        super().__init__(cfg)

        self._target_dictionary = target_dictionary
        self._source_dictionary = source_dictionary
        self.num_symbols = (
            len([s for s in target_dictionary.symbols if not s.startswith("madeup")])
            - target_dictionary.nspecial
        )
        self.sil_id = (
            target_dictionary.index("<SIL>") if "<SIL>" in target_dictionary else -1
        )
        self.kenlm = None
        if cfg.kenlm_path is not None:
            import kenlm

            self.kenlm = kenlm.Model(cfg.kenlm_path)

        self.word_kenlm = None
        if cfg.word_kenlm_path is not None:
            import kenlm

            self.word_kenlm = kenlm.Model(cfg.word_kenlm_path)

        self.uppercase = cfg.uppercase
        self.skipwords = set(cfg.skipwords.split(","))

        def str_postprocess(s):
            s = " ".join(w for w in s.split() if w not in self.skipwords)
            s = s.upper() if self.uppercase else s
            return s

        self.str_postprocess = str_postprocess
        self.compute_lm_score = lambda s: self.kenlm.score(self.str_postprocess(s))

        self.compute_word_score = None
        if cfg.word_decoder_config is not None:
            self.kaldi_decoder = KaldiDecoder(cfg.word_decoder_config, beam=10)

            def compute_word_score(logits, padding):
                res = self.kaldi_decoder.decode(logits, padding)
                for r in res:
                    r = r.result()
                    assert len(r) == 1
                    r = r[0]
                    yield r["score"], r["words"]

            self.compute_word_score = compute_word_score
Example #7
def compute_features_for_the_train_data(train_data_file,
                                        features_save_data_file):
    vp_2gram_model = kenlm.Model(VP_2gram_LM)
    # nvp_2gram_model = kenlm.Model(NVP_2gram_LM)
    vp_3gram_model = kenlm.Model(VP_3gram_LM)
    # nvp_3gram_model = kenlm.Model(NVP_3gram_LM)
    with open(train_data_file, "r") as train_tsv, open(features_save_data_file,
                                                       "w") as features_tsv:
        reader = csv.reader(train_tsv, delimiter='\t')
        writer = csv.writer(features_tsv, delimiter='\t')

        head_row = next(reader)
        new_head_row = head_row[0:-1] + feature_names
        new_head_row.append(head_row[-1])
        writer.writerow(new_head_row)
        start_time = time.time()
        for i, row in enumerate(reader):
            question = row[0]
            answer = row[1]
            response = row[2]
            question = question.strip()
            question = mt.tokenize(question, return_str=True, escape=False)
            question = question.lower()
            answer = answer.strip()
            answer = answer.lower()
            response = response.strip()
            response = mt.tokenize(response, return_str=True, escape=False)
            response = response.lower()
            row[0] = question
            row[1] = answer
            row[2] = response
            count = int(row[-1])
            if i % 1000 == 0:
                print(i, "time :", time.time() - start_time, "secs")
                start_time = time.time()
            # if count == 0:
            # 	p = random.uniform(0, 1)
            # 	if p > 0.05:
            # 		continue
            features = extract_features(question, answer, response,
                                        vp_2gram_model, vp_3gram_model)

            final_row = row[0:-1] + features
            # print(len(row), len(features), len(final_row))
            final_row.append(row[-1])
            # print(question, answer, response)
            # print(final_row[:8])
            writer.writerow(final_row)
Example #8
    def __init__(self,
                 dictionary: StaticDictionary,
                 window=1,
                 lm_file=None,
                 *args,
                 **kwargs):

        super().__init__(*args, **kwargs)
        self.costs = defaultdict(itertools.repeat(float('-inf')).__next__)
        self.dictionary = dictionary
        self.window = window
        if self.window == 0:
            self.find_candidates = self._find_candidates_window_0
        else:
            self.find_candidates = self._find_candidates_window_n
        self.costs[('', '')] = log(1)
        self.costs[('⟬', '⟬')] = log(1)
        self.costs[('⟭', '⟭')] = log(1)

        for c in self.dictionary.alphabet:
            self.costs[(c, c)] = log(1)
        # if self.ser_path.is_file():
        self.load()

        if lm_file:
            self.lm = kenlm.Model(str(expand_path(lm_file)))
            self.beam_size = 4
            self.candidates_count = 4
            self._infer_instance = self._infer_instance_lm
Example #9
def main():
    args = get_parser().parse_args()
    logger.debug(f"Args: {args}")
    
    ref_uid_to_tra = load_tra(args.ref_tra)
    hyp_uid_to_tra = load_tra(args.hyp_tra)
    assert not bool(set(hyp_uid_to_tra.keys()) - set(ref_uid_to_tra.keys()))

    lm = kenlm.Model(args.kenlm_path)
    skipwords = set(args.skipwords.split(","))
    def compute_lm_score(s):
        s = " ".join(w for w in s.split() if w not in skipwords)
        s = s.upper() if args.uppercase else s
        return lm.score(s)

    g2p, g2p_dict = None, None
    if args.phonemize:
        if args.phonemize_lexicon:
            g2p_dict = load_lex(args.phonemize_lexicon)
        else:
            g2p = G2p()

    wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict)
    lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score)
    
    gt_wer = -math.inf
    if args.gt_tra:
        gt_uid_to_tra = load_tra(args.gt_tra)
        gt_wer = compute_wer(gt_uid_to_tra, hyp_uid_to_tra, None, None)

    score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
    logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%")
Example #10
def load_sentence_aggregation(db_names, sa_name, n=None):

    if sa_name == 'random':

        def random_sa_scorer(sas):
            return get_random_scores(len(sas))

        return random_sa_scorer

    if sa_name == 'markov':

        db_names_id = '_'.join(sorted(db_names))

        lm_filename = f'sa_lm_model_{n}_{db_names_id}.arpa'
        lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)

        model = kenlm.Model(lm_filepath)

        def scorer(sas):
            # sas is a list of partitioned triples;
            #    by "partitioned triples" I mean a list of lists of triples,
            #    e.g. [[t1, t2], [t3]]

            scores = []
            for sa in sas:
                pred_text = preprocess_to_sa_model(sa)
                score = model.score(pred_text)
                scores.append(score)

            return scores

        return scorer
Example #11
def main(doc2vec_file, lm_file, comp_pred_file, candidates_file, \
         complex_file, weights, dress_file, diff, output_file):
    ## Get complex sentences
    print("Reading in complex sentences...")
    complex_sents = get_sents(complex_file)
    print(len(complex_sents))
    print(complex_sents[0])

    ## Get dress sentences
    print("Reading in DRESS sentences...")
    dress_sents = get_sents(dress_file)
    print(len(dress_sents))
    print(dress_sents[0])

    ## Gets candidate simplifications
    print("Reading in candidates...")
    sentences = get_simple_sents(candidates_file)
    print(len(sentences))
    print(len(sentences[0]))

    ## Loads language model
    print("Loading kenlm model...")
    lm = kenlm.Model(lm_file)

    ## Get perplexity scores for each candidate sentence
    print("Calculating perplexities...")
    perplexities = get_perplexities(sentences, lm)

    ## Loads sentence complexity predictions
    print("Getting complexity predictions...")
    comp_preds = load_comp_preds(comp_pred_file, sentences)

    ## Loads doc2vec model
    print("Loading doc2vec model...")
    doc2vec = g.Doc2Vec.load(doc2vec_file)

    ## Gets embeddings for test sentences
    print("Getting complex embeddings...")
    start_alpha = 0.01
    infer_epoch = 1000
    complex_embeddings = get_complex_embeddings(complex_sents, doc2vec,
                                                start_alpha, infer_epoch)
    #complex_embeddings = [[0 for i in range(300)] for sent in complex_sents]

    ## Gets embeddings for each sentence
    print("Getting embeddings...")
    embeddings = get_embeddings(sentences, doc2vec, start_alpha, infer_epoch)
    #embeddings = [[[0 for i in range(300)] for sent in sents] for sents in sentences]

    ## Calculate cosine similarities between complex and simple sentences
    print("Calculating similarities...")
    similarities = get_sims(complex_embeddings, embeddings, sentences)
    #similarities = [[1 for i in range(len(s))] for s in sentences]

    print("Rerank sentences...")
    ## Reranks sentences based on average of fluency, relevancy, and simplicity
    top_sentences = rank_candidates(sentences, dress_sents, perplexities,
                                    comp_preds, similarities, weights, diff)

    save_sentences(top_sentences, output_file)
Example #12
def train_ngram_lm(kenlm_path, data_path, output_path, N, dedup_data_path=None):
    """
    Trains a modified Kneser-Ney n-gram KenLM from a text file.
    Creates a .arpa file to store n-grams.
    """
    dedup_data_path = dedup_data_path or data_path

    # create .arpa file of n-grams
    curdir = os.path.abspath(os.path.curdir)

    # deduplicate_cmd = "cat {} | sort -u > {}".format(
    #     os.path.join(curdir, data_path), os.path.join(curdir, dedup_data_path)
    #     )
    # os.system(deduplicate_cmd)

    estimate_cmd = "bin/lmplz -o {} --discount_fallback <{} >{} ".format(
        N, os.path.join(curdir, data_path), os.path.join(curdir, output_path)
        )
    os.system("cd "+os.path.join(kenlm_path, 'build')+" && "+estimate_cmd)
    
    load_kenlm()
    # create language model
    model = kenlm.Model(os.path.join(curdir, output_path))

    return model
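
A hedged alternative to os.system: subprocess.run with check=True raises if lmplz fails, instead of silently leaving a truncated .arpa file behind. A sketch under the same directory layout (function name is illustrative):

import os
import subprocess

def estimate_ngrams(kenlm_path, data_path, output_path, N):
    lmplz = os.path.join(kenlm_path, 'build', 'bin', 'lmplz')
    with open(data_path) as src, open(output_path, 'w') as dst:
        subprocess.run([lmplz, '-o', str(N), '--discount_fallback'],
                       stdin=src, stdout=dst, check=True)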
Example #13
def main():
    trainArticles = pd.preprocessData("nltk_train.txt",
                                      "trainingSetLabels.dat")
    perplexity_true = []
    perplexity_fake = []
    perplexity = []
    model = kenlm.Model('pos4g_fresh.arpa')

    for article in trainArticles:
        num_sentences = article.numberOfSentences
        score = 0
        for sentence in article.allSentences:
            score += float(model.perplexity(sentence.string))

        score = float(score) / num_sentences
        perplexity.append(score)
        if article.label == 0:
            perplexity_fake.append(score)
        else:
            perplexity_true.append(score)

    fp = open('pos_4_train.txt', 'w')

    for item in perplexity:
        fp.write(str(item))
        fp.write('\n')

    fp.close()
Example #14
    def langModelFeat(self, argString, preprocessReq=0):
        '''
        Extracts n-gram language model perplexity features.
        '''
        ngramOrder = 3
        langModel = 0
        # Binary1/0,ngramOrder,LMFilePath(ifBinary1)
        arguments = argString.split(',')
        if (int(arguments[0])):
            # Use given langModel
            langModel = "\"{0}\"".format(arguments[-1])

        ngramOrder = int(arguments[1])

        if preprocessReq:
            # Request all preprocessing functions to be prepared
            if not langModel:
                langModel = self.preprocessor.buildLanguageModel(ngramOrder)
            self.preprocessor.getInputFileName()
            self.preprocessor.getBinariesPath()
            return 1

        sentsFile = self.preprocessor.getInputFileName()
        srilmBinary, kenlm = self.preprocessor.getBinariesPath()

        if not langModel:
            langModel = self.preprocessor.buildLanguageModel(ngramOrder)

        if srilmBinary and not kenlm:
            pplFile = "tempLang{0}{1}.ppl".format(os.path.basename(sentsFile),
                                                  ngramOrder)
            command = "\"{0}ngram\" -order {1} -lm {2} -ppl {3} -debug 1 -unk> {4}".format(
                srilmBinary, ngramOrder, langModel, sentsFile, pplFile)

            subprocess.call(command, shell=True)
            probab = self.extractValues(pplFile,
                                        self.preprocessor.getSentCount())
            os.remove(pplFile)
            return sparse.lil_matrix(probab)
        else:
            try:
                __import__('imp').find_module('kenlm')
                import kenlm
                model = kenlm.Model(langModel)
                probab = []
                for sent in self.preprocessor.getPlainSentences():
                    probab.append([
                        model.score(sent, bos=True, eos=True),
                        model.perplexity(sent)
                    ])
                output = sparse.lil_matrix(probab)
                return output
            except ImportError:
                import pynlpl.lm.lm as pineApple
                arpaLM = pineApple.ARPALanguageModel(langModel)
                probab = []
                for sent in self.preprocessor.gettokenizeSents():
                    probab.append([arpaLM.score(sent)])
                output = sparse.lil_matrix(probab)
                return output
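
A side note on the availability check above: the imp module is deprecated in Python 3. A modern equivalent, as a sketch:

import importlib.util

if importlib.util.find_spec('kenlm') is not None:
    import kenlm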
Example #15
def main():
    args = parse()

    model = kenlm.Model('bigram')

    perplexities = []
    sample_size = 1000

    with open(
            '/u/demorali/corpora/1g-word-lm-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en-00000-of-00100',
            'r') as f:
        for line in f:
            perplexities.append(model.perplexity(line))
            if len(perplexities) == sample_size:
                break

    if args.csv is None:
        print("""Test sentence count: {size}
            mean is {mean}
            max is {max}
            min is {min}
            """.format(mean=mean(perplexities),
                       max=max(perplexities),
                       min=min(perplexities),
                       size=sample_size))
    else:
        with open(args.csv, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(
                [mean(perplexities),
                 max(perplexities),
                 min(perplexities)])
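
This snippet omits its imports; it presumably relies on something like the following (an assumption, since the original does not show them):

import csv
import kenlm
from statistics import mean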
Example #16
 def initialize_detector(self):
     t1 = time.time()
     self.lm = kenlm.Model(self.language_model_path)
     t2 = time.time()
     default_logger.debug('Loaded language model: %s, spend: %s s' %
                          (self.language_model_path, str(t2 - t1)))
     # word-frequency dict
     self.word_freq = self.load_word_freq_dict(self.word_freq_path)
     t3 = time.time()
     default_logger.debug(
         'Loaded word freq file: %s, size: %d, spend: %s s' %
         (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
     # custom confusion set
     self.custom_confusion = self._get_custom_confusion_dict(
         self.custom_confusion_path)
     t4 = time.time()
     default_logger.debug(
         'Loaded confusion file: %s, size: %d, spend: %s s' %
         (self.custom_confusion_path, len(
             self.custom_confusion), str(t4 - t3)))
     # custom segmentation dictionary
     self.custom_word_dict = self.load_word_freq_dict(self.custom_word_path)
     # merge the custom dictionary into the word-frequency dict
     self.word_freq.update(self.custom_word_dict)
     t5 = time.time()
     default_logger.debug(
         'Loaded custom word file: %s, size: %d, spend: %s s' %
         (self.custom_word_path, len(
             self.custom_word_dict), str(t5 - t4)))
     self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                custom_word_freq_dict=self.custom_word_dict,
                                custom_confusion_dict=self.custom_confusion)
     t6 = time.time()
     default_logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
     self.initialized_detector = True
Example #17
def klm_perplexity(string):
    model = kenlm.Model('kenlm/lm/test.arpa')
    # print(string)
    per = model.perplexity(string)
    # print(per)
    return per
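
Reloading the model from disk on every call is expensive. A sketch that loads it once at module level instead (same path as above):

import kenlm

_MODEL = kenlm.Model('kenlm/lm/test.arpa')

def klm_perplexity(string):
    return _MODEL.perplexity(string)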
Example #18
    def initialize_detector(self):
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            raise ImportError(
                'mypycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please run "pip install kenlm" to install it. '
                'On Windows, install kenlm under Cygwin.')

        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(time.time() - t1)))

        # word and character frequency dicts
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        self.char_freq = self.load_char_freq_dict(self.char_freq_path)
        t3 = time.time()
        logger.debug(
            'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
            (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # custom segmentation dictionaries
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # merge the segmentation dictionaries into the custom dictionary
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        # pretrained BERT model
        t6 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t6))
        self.initialized_detector = True
Example #19
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.Model(FLAGS.lmfile)
    else:
        print('No lmfile provided; consider supplying a KenLM ARPA file')

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, reverse_vocab = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")

            output_sent = fix_sent(model, sess, sent)

            print("Candidate: ", output_sent)
Example #20
 def __init__(self,
              lm_path,
              labels,
              blank_index=0,
              k=5,
              alpha=0.3,
              beta=5,
              prune=1e-3):
     """
     Args:
         lm_path (str): The path to the kenlm language model.
         labels (list(str)): A list of the characters.
         blank_index (int): The index of the blank character in the `labels` parameter.
         k (int): The beam width. Will keep the 'k' most likely candidates at each timestep.
         alpha (float): The language model weight. Should usually be between 0 and 1.
         beta (float): The language model compensation term. The higher the 'alpha', the higher the 'beta'.
         prune (float): Only extend prefixes with chars with an emission probability higher than 'prune'.
     """
     super(PrefixBeamSearchLMDecoder, self).__init__(labels, blank_index)
     if lm_path:
         import kenlm
         self.lm = kenlm.Model(lm_path)
         self.lm_weigh = lambda f: 10**(self.lm.score(f))
     else:
         self.lm_weigh = lambda s: 1
     self.k = k
     self.alpha = alpha
     self.beta = beta
     self.prune = prune
Example #21
def main():
    model = kenlm.Model(MODELNAME)
    text = read(FILENAME)
    with open(RESULT, 'w') as o:
        for t in text:
            o.write(t + " " + str(model.score(t, bos=False, eos=False)) +
                    "\n")
Example #22
    def __init__(self, form, config):

        self.structureDict = {
            'sonnet': ('a', 'b', 'b', 'a', '', 'c', 'd', 'd', 'c', '', 'e',
                       'f', 'e', '', 'f', 'e', 'f'),
            'short': ('a', 'b', 'a', 'b', '', 'c', 'd', 'c', 'd'),
            'shorter': ('a', 'b', 'a', 'b'),
            'pantoum': ('a', 'b', 'c', 'd', '', 'b', 'e', 'd', 'f', '', 'e',
                        'g', 'f', 'h', '', 'g', 'a', 'h', 'c'),
            'flat': ('a', 'a', '', 'b', 'b', '', 'c', 'c', '', 'd', 'd'),
        }

        self.form = form

        self.initializeConfig(config)
        self.loadRhymeDictionary()
        self.loadNMFData()

        self.generator = VerseGenerator(self.MODEL_FILE,
                                        self.entropy_threshold)

        self.loadVocabulary()

        self.ngramModel = kenlm.Model(self.NGRAM_FILE)

        if not os.path.exists('log'):
            os.makedirs('log')
        logfile = 'log/poem_' + datetime.now().strftime("%Y%m%d")
        self.log = open(logfile, 'a')
Example #23
def loadResources(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    # Language model built by KenLM: https://github.com/kpu/kenlm
    lm = kenlm.Model(args.model)
    # Load spaCy
    nlp = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    gb = Hunspell("en_GB-large",
                  hunspell_data_dir=basename + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    gb_infl = loadWordFormDict(basename +
                               "/resources/agid-2016.01.19/infl.txt")
    # List of common determiners
    det = {"", "the", "a", "an"}
    # List of common prepositions
    prep = {
        "", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"
    }
    # Save the above in a dictionary:
    res_dict = {
        "lm": lm,
        "nlp": nlp,
        "gb": gb,
        "gb_infl": gb_infl,
        "det": det,
        "prep": prep
    }
    return res_dict
Example #24
    def __init__(self, uid,
                 name,
                 order,
                 path,
                 bos,
                 eos):
        """
        A language model scorer (KenLM only).

        :param uid: unique id (int)
        :param name: prefix for features
        :param order: n-gram order
        :param bos: a Terminal symbol representing the left boundary of the sentence.
        :param eos: a Terminal symbol representing the right boundary of the sentence.
        :param path: path to a kenlm model (ARPA or binary).
        :return:
        """
        super(StatelessLM, self).__init__(uid, name)
        self._order = order
        self._bos = bos
        self._eos = eos
        self._path = path
        self._model = klm.Model(path)
        self._features = (name, '{0}_OOV'.format(name))

        # get the initial state
        self._initial = klm.State()
        self._model.BeginSentenceWrite(self._initial)
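
For context, the stateful KenLM API used above scores a sentence incrementally, carrying the n-gram context in State objects. A self-contained sketch (model path illustrative):

import kenlm

model = kenlm.Model('lm.binary')
state_in, state_out = kenlm.State(), kenlm.State()
model.BeginSentenceWrite(state_in)  # seed the state with the <s> context
total = 0.0
for word in 'this is a test'.split():
    total += model.BaseScore(state_in, word, state_out)  # log10 probability
    state_in, state_out = state_out, state_in  # swap states for the next word
total += model.BaseScore(state_in, '</s>', state_out)  # close the sentence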
Example #25
    def train_ngram_lm(self, kenlm_path, data_path, output_path, n_gram):
        """
        Trains a modified Kneser-Ney n-gram KenLM from a text file.
        Creates a .arpa file to store n-grams.
        """
        import kenlm
        import subprocess

        # create .arpa and .bin file of n-grams
        curdir = os.path.abspath(os.path.curdir)
        cd_command = "cd " + os.path.join(kenlm_path, 'build')
        command_1 = "bin/lmplz -o {} <{} >{}".format(
            str(n_gram), os.path.join(curdir, data_path),
            output_path + ".arpa")
        command_2 = "bin/build_binary -s {} {}".format(
            output_path + ".arpa", output_path + ".bin")

        # run in the foreground; a trailing '&' would background the shell
        # command and let build_binary race ahead of lmplz
        subprocess.getstatusoutput(cd_command + " && " +
                                   command_1)  # call without logging output
        subprocess.getstatusoutput(cd_command + " && " +
                                   command_2)  # call without logging output

        # create language model
        assert os.path.exists(output_path + ".bin")  # caught by a try..except outside
        model = kenlm.Model(output_path + ".bin")

        return model
Example #26
    def train_ngram_lm(self, kenlm_path, data_path, output_path, n_gram):
        """
        Trains a modified Kneser-Ney n-gram KenLM from a text file.
        Creates a .arpa file to store n-grams.
        """
        import kenlm
        import subprocess

        # create .arpa and .bin file of n-grams
        curdir = os.path.abspath(os.path.curdir)
        cd_command = "cd " + os.path.join(kenlm_path, 'build')
        command_1 = "bin/lmplz -o {} <{} >{} --discount_fallback &".format(
            str(n_gram), os.path.join(curdir, data_path), output_path)
        command_2 = "bin/build_binary -s {} {} &".format(
            output_path, output_path + ".bin")

        while True:
            subprocess.getstatusoutput(
                cd_command + " && " + command_1)  # call without logging output
            subprocess.getstatusoutput(
                cd_command + " && " + command_2)  # call without logging output
            if os.path.exists(output_path + ".bin"):
                break

        # create language model
        model = kenlm.Model(output_path + ".bin")

        return model
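
The polling loop above exists only because the shell commands are backgrounded with a trailing '&'; run in the foreground, a single pass suffices. A sketch using the same variables as the method body:

command_1 = "bin/lmplz -o {} <{} >{} --discount_fallback".format(
    str(n_gram), os.path.join(curdir, data_path), output_path)
command_2 = "bin/build_binary -s {} {}".format(output_path, output_path + ".bin")
subprocess.getstatusoutput(cd_command + " && " + command_1)
subprocess.getstatusoutput(cd_command + " && " + command_2)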
Example #27
    def __init__(self,
                 acc_path='pushkin_cvetaeva_new.bin',
                 ppl_path='poets_5_kenlm.binary',
                 pos_label='__label__positive'):
        resource_package = __name__

        yelp_acc_path = acc_path
        yelp_ppl_path = ppl_path
        #yelp_ref0_path = 'yelp.refs.0'
        #yelp_ref1_path = 'yelp.refs.1'

        yelp_acc_file = pkg_resources.resource_stream(resource_package,
                                                      yelp_acc_path)
        yelp_ppl_file = pkg_resources.resource_stream(resource_package,
                                                      yelp_ppl_path)
        #yelp_ref0_file = pkg_resources.resource_stream(resource_package, yelp_ref0_path)
        #yelp_ref1_file = pkg_resources.resource_stream(resource_package, yelp_ref1_path)
        '''
        self.yelp_ref = []
        with open(yelp_ref0_file.name, 'r') as fin:
            self.yelp_ref.append(fin.readlines())
        with open(yelp_ref1_file.name, 'r') as fin:
            self.yelp_ref.append(fin.readlines())
        '''
        self.classifier_yelp = fasttext.load_model(yelp_acc_file.name)
        self.yelp_ppl_model = kenlm.Model(yelp_ppl_file.name)
        # name of positive label in fasttext classifier
        self.pos_label = pos_label
Example #28
def generate_fir_sentence(topn=5, expend=3):
    fir, chars, head = user_input()
    candidate = []
    tmp = [[head[i]] for i in range(0, 4)]  # initialize
    for string in fir:
        tmp[0] = [ch for ch in string]
        candidate.append(copy.deepcopy(tmp))

    language_model = kenlm.Model("first.poem.lm")
    model.eval()

    for i in range(2, chars + 1):
        tmp = candidate[:]
        candidate = []
        for sen in tmp:
            if len(sen[0]) >= i:  # no need to produce
                candidate.append(sen)
            else:
                state = torch.zeros((1, feature_size), requires_grad=True)
                input_var = sentence_to_onehot(sen, chars)  # 20 * 1
                for k in range(1, i + 1):
                    out, state = model(input_var, state, 1, k)  # predict
                # state = torch.zeros((1, feature_size), requires_grad=True)
                # out, state = model(input_var, state, i+1, j)

                poss = out.data.reshape(-1).numpy().tolist()  # according to dl model
                get_top = []
                for _id, p in enumerate(poss):
                    get_top.append((_id, p))  # (id, possibility)
                get_top = sorted(get_top, key=lambda x: x[1], reverse=True)
                time = 0  # select the top `expend` candidates
                pt = 0
                while time < expend:
                    ch = id2char(get_top[pt][0])  # id to char
                    tmpflag = True
                    for each in sen:  # avoid duplicate
                        if ch in each:
                            tmpflag = False
                            break
                    if not tmpflag:
                        pt += 1
                        continue
                    sen[0].append(ch)
                    time += 1
                    pt += 1
                    candidate.append(copy.deepcopy(sen))
                    sen[0].pop()

    tmp = candidate[:]
    candidate = []
    for lines in tmp:
        if judge_fir_tonal_pattern(''.join(lines[0]), 5) >= 0:
            candidate.append(lines)
    score = []  # score after whole sentence
    for lines in candidate:
        score.append((lines, language_model.score(" ".join(lines[0]))))  # score the last sentence
    score = sorted(score, key=lambda x: x[1], reverse=True)
    score = score[0: min(topn, len(score))]
    candidate = [lines[0] for lines in score]
    return candidate
Example #29
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    # import Language Model
    lm_model = kenlm.Model(args.lm_path)
    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.cuda()
            input_length = input_length.cuda()
            nbest_hyps = model.recognize(input, input_length, char_list,
                                         lm_model, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
Example #30
def load_discourse_planning(db_names, dp_name, n=None):

    if dp_name == 'random':
        return random_dp_scorer

    if dp_name == 'markov':

        db_names_id = '_'.join(sorted(db_names))

        lm_filename = f'dp_lm_model_{n}_{db_names_id}.arpa'
        lm_filepath = os.path.join(PRETRAINED_DIR, lm_filename)

        model = kenlm.Model(lm_filepath)

        def scorer(triples_list):

            scores = []
            for triples in triples_list:
                pred_text = preprocess_to_dp_model(triples)
                score = model.score(pred_text)
                scores.append(score)

            return scores

        return scorer
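
A hypothetical usage sketch (the database name, n-gram order, and triples are illustrative, and preprocess_to_dp_model is assumed to accept (subject, predicate, object) tuples):

dp_scorer = load_discourse_planning(['webnlg'], 'markov', n=3)
plans = [[('Alan_Bean', 'birthPlace', 'Wheeler_Texas')],
         [('Alan_Bean', 'occupation', 'astronaut')]]
print(dp_scorer(plans))  # one log10 LM score per candidate plan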