def initialize_detector(self):
    """Load the language models and dictionaries used by the detector.

    Loads two KenLM models (``self.language_model_path`` into ``self.lm`` and
    ``self.language_word_model_path`` into ``self.lm_word``), then the
    similar-word table, the character frequency table and the tokenizer /
    word frequencies exposed by ``DetectorDict``. Finally marks the detector
    as initialized.

    Raises:
        ImportError: if the ``kenlm`` package cannot be imported. Failures
            while reading a model file are no longer converted to
            ``ImportError``; they propagate as raised by ``kenlm``.
    """
    t1 = time.time()
    try:
        import kenlm
    except ImportError as err:
        # Only a failed import means the dependency is missing. A bad model
        # path must not be reported as "please pip install kenlm".
        raise ImportError(
            'mypycorrector dependencies are not fully installed, '
            'they are required for statistical language model.'
            'Please use "pip install kenlm" to install it.'
            'if you are Win, Please install kenlm in cgwin.') from err

    self.lm = kenlm.Model(self.language_model_path)
    logger.debug('Loaded language model: %s, spend: %s s' %
                 (self.language_model_path, str(time.time() - t1)))

    t1 = time.time()
    self.lm_word = kenlm.Model(self.language_word_model_path)
    logger.debug(
        'Loaded language model: %s, spend: %s s' %
        (self.language_word_model_path, str(time.time() - t1)))

    t1 = time.time()
    # Homophones / similar-sounding words.
    self.word_similar = readjson(config.word_similar_path)
    # Frequency table loaded from the character frequency file.
    self.char_freq = self.load_word_freq_dict(config.char_freq_path)
    self.custom_word_freq = {}

    # Tokenizer and word frequencies come from DetectorDict.
    detector_dict = DetectorDict()
    detector_dict.check_detector_dict_initialized()
    self.jieba_tokenizer = detector_dict.tokenizer
    self.word_freq = detector_dict.word_freq
    logger.debug('Loaded file: %s, spend: %s s' %
                 (config.word_freq_path, str(time.time() - t1)))
    self.initialized_detector = True
def __init__(self, resources, n=5):
    """Load the four KenLM models named in *resources*.

    Args:
        resources: mapping with the keys ``gigaword_char``,
            ``gigaword_word``, ``stackoverflow_char`` and
            ``stackoverflow_word``; each value is passed to ``kenlm.Model``.
        n: stored unchanged as ``self.N``.
    """
    # Same load order as the attribute names: char before word per corpus.
    for corpus in ("gigaword", "stackoverflow"):
        for unit in ("char", "word"):
            key = "{}_{}".format(corpus, unit)
            setattr(self, key, kenlm.Model(resources[key]))
    self.N = n
    self.binner = GaussianBinner(100)
def main():
    """Write per-article POS-tag perplexity ratios for the training set.

    Each sentence of each training article is replaced by its POS-tag
    sequence (from the module-level ``rrp`` tagger) and scored with three
    KenLM models (``fake_pos_2g``, ``fake_pos_3g``, ``fake_pos_4g``). Per
    article, the summed perplexities are divided by
    ``article.numberOfSentences``; the 3g/2g ratio goes to
    ``ratioBiTri_pos_train.txt`` and the 4g/2g ratio to
    ``ratioBiQuad_pos_train.txt``, one value per line.
    """
    def write_values(path, values):
        # Context manager so the file is closed even if a write fails.
        with open(path, 'w') as fp:
            fp.writelines(str(value) + '\n' for value in values)

    train_articles = pd.preprocessData("balancedTrainingData.dat",
                                       "balancedTrainingDataLabels.dat")

    # Local names follow the n-gram order of the model files.
    bigram_model = kenlm.Model('fake_pos_2g.binary')
    trigram_model = kenlm.Model('fake_pos_3g.binary')
    fourgram_model = kenlm.Model('fake_pos_4g.binary')

    ratio_bi_tri = []
    ratio_bi_quad = []

    for article in train_articles:
        num_sentences = article.numberOfSentences
        bigram_total = 0
        trigram_total = 0
        fourgram_total = 0
        for sentence in article.allSentences:
            pos_tags = rrp.tag(sentence.string)
            _words, tags = zip(*pos_tags)
            # The sentence text is overwritten with its tag sequence, as before.
            sentence.string = ' '.join(tags)
            bigram_total += float(bigram_model.perplexity(sentence.string))
            trigram_total += float(trigram_model.perplexity(sentence.string))
            fourgram_total += float(fourgram_model.perplexity(sentence.string))

        bigram_avg = float(bigram_total) / num_sentences
        trigram_avg = float(trigram_total) / num_sentences
        fourgram_avg = float(fourgram_total) / num_sentences

        ratio_bi_tri.append(float(trigram_avg) / bigram_avg)
        ratio_bi_quad.append(float(fourgram_avg) / bigram_avg)

    write_values('ratioBiTri_pos_train.txt', ratio_bi_tri)
    write_values('ratioBiQuad_pos_train.txt', ratio_bi_quad)
def _load_lms(self, char_lm_dir, word_lm_dir):
    """Load one character-level and one word-level ARPA model per label.

    For every label in ``self._labels`` the file ``<label>.arpa`` is read
    from *char_lm_dir* and *word_lm_dir* and stored in ``self._char_lms``
    and ``self._word_lms`` under that label.
    """
    # One shared config: no progress bar, no ARPA-format complaints.
    lm_config = kenlm.Config()
    lm_config.show_progress = False
    lm_config.arpa_complain = kenlm.ARPALoadComplain.NONE

    for label in self._labels:
        arpa_name = '{}.arpa'.format(label)
        char_model = kenlm.Model(str(Path(char_lm_dir, arpa_name)), lm_config)
        self._char_lms[label] = char_model
        word_model = kenlm.Model(str(Path(word_lm_dir, arpa_name)), lm_config)
        self._word_lms[label] = word_model
def __init__(self, data_dir, use_posthoc_correction=True):
    """Load the three KenLM models and the pickled model from *data_dir*.

    Args:
        data_dir: directory holding ``titles_abstracts_lm.binary``,
            ``authors_lm.binary``, ``venues_lm.binary`` and
            ``lightgbm_model.pickle``.
        use_posthoc_correction: stored unchanged on the instance.
    """
    self.use_posthoc_correction = use_posthoc_correction
    self.data_dir = data_dir

    # Tuple order: (titles/abstracts, authors, venues).
    lm_files = (
        'titles_abstracts_lm.binary',
        'authors_lm.binary',
        'venues_lm.binary',
    )
    self.lms = tuple(
        kenlm.Model(os.path.join(data_dir, lm_file)) for lm_file in lm_files
    )

    # NOTE: pickle.load executes arbitrary code; data_dir must be trusted.
    model_path = os.path.join(data_dir, 'lightgbm_model.pickle')
    with open(model_path, 'rb') as f:
        self.model = pickle.load(f)
def __init__(
    self,
    cfg: UnpairedAudioTextConfig,
    source_dictionary=None,
    target_dictionary=None,
):
    """Set up dictionaries, optional KenLM models and scoring callables.

    Args:
        cfg: task configuration; the fields read here are ``kenlm_path``,
            ``word_kenlm_path``, ``uppercase``, ``skipwords`` and
            ``word_decoder_config``.
        source_dictionary: stored as ``self._source_dictionary``.
        target_dictionary: stored as ``self._target_dictionary``; its
            ``symbols``, ``nspecial`` and ``index`` are used to derive
            ``num_symbols`` and ``sil_id``.
    """
    super().__init__(cfg)

    self._target_dictionary = target_dictionary
    self._source_dictionary = source_dictionary

    # Symbols not starting with "madeup", minus the special symbols.
    real_symbol_count = sum(
        1 for sym in target_dictionary.symbols if not sym.startswith("madeup")
    )
    self.num_symbols = real_symbol_count - target_dictionary.nspecial

    if "<SIL>" in target_dictionary:
        self.sil_id = target_dictionary.index("<SIL>")
    else:
        self.sil_id = -1

    # kenlm is imported lazily, only when a model path is configured.
    self.kenlm = None
    if cfg.kenlm_path is not None:
        import kenlm

        self.kenlm = kenlm.Model(cfg.kenlm_path)

    self.word_kenlm = None
    if cfg.word_kenlm_path is not None:
        import kenlm

        self.word_kenlm = kenlm.Model(cfg.word_kenlm_path)

    self.uppercase = cfg.uppercase
    self.skipwords = set(cfg.skipwords.split(","))

    def str_postprocess(s):
        # Drop skip-words, then optionally upper-case the remainder.
        kept = " ".join(w for w in s.split() if w not in self.skipwords)
        if self.uppercase:
            return kept.upper()
        return kept

    def compute_lm_score(s):
        # Reads self.kenlm at call time, so it fails if no model was loaded.
        return self.kenlm.score(self.str_postprocess(s))

    self.str_postprocess = str_postprocess
    self.compute_lm_score = compute_lm_score

    self.compute_word_score = None
    if cfg.word_decoder_config is not None:
        self.kaldi_decoder = KaldiDecoder(cfg.word_decoder_config, beam=10)

        def compute_word_score(logits, padding):
            # Yields one (score, words) pair per decoded item.
            for pending in self.kaldi_decoder.decode(logits, padding):
                hyps = pending.result()
                assert len(hyps) == 1
                best = hyps[0]
                yield best["score"], best["words"]

        self.compute_word_score = compute_word_score
def compute_features_for_the_train_data(train_data_file, features_save_data_file):
    """Add language-model features to a TSV of (question, answer, response) rows.

    Reads *train_data_file* (tab-separated, with a header row), normalizes the
    first three columns, computes features with ``extract_features`` and
    writes the result to *features_save_data_file*. In the output the feature
    columns are inserted before the last column, and the header gains
    ``feature_names`` in the same position.

    Args:
        train_data_file: path of the input TSV; the last column of every data
            row must parse as an integer.
        features_save_data_file: path of the TSV to write.
    """
    vp_2gram_model = kenlm.Model(VP_2gram_LM)
    vp_3gram_model = kenlm.Model(VP_3gram_LM)

    # newline='' is what the csv module requires for correct line-ending
    # handling on every platform.
    with open(train_data_file, "r", newline="") as train_tsv, \
            open(features_save_data_file, "w", newline="") as features_tsv:
        reader = csv.reader(train_tsv, delimiter='\t')
        writer = csv.writer(features_tsv, delimiter='\t')

        head_row = next(reader)
        writer.writerow(head_row[:-1] + feature_names + [head_row[-1]])

        start_time = time.time()
        for i, row in enumerate(reader):
            # Question and response are tokenized; the answer is only
            # stripped and lower-cased.
            question = mt.tokenize(row[0].strip(), return_str=True, escape=False).lower()
            answer = row[1].strip().lower()
            response = mt.tokenize(row[2].strip(), return_str=True, escape=False).lower()
            row[0] = question
            row[1] = answer
            row[2] = response

            # Kept only as validation: a non-integer last column still
            # raises ValueError, as in the original.
            int(row[-1])

            if i % 1000 == 0:
                print(i, "time :", time.time() - start_time, "secs")
                start_time = time.time()

            features = extract_features(question, answer, response,
                                        vp_2gram_model, vp_3gram_model)
            writer.writerow(row[:-1] + features + [row[-1]])
def __init__(self, dictionary: StaticDictionary, window=1, lm_file=None, *args, **kwargs):
    """Initialise edit costs, candidate search and the optional language model.

    Args:
        dictionary: provides the ``alphabet`` whose characters get an
            identity cost.
        window: ``0`` selects ``_find_candidates_window_0``, any other value
            ``_find_candidates_window_n``.
        lm_file: when truthy, a KenLM model is loaded from this path and
            inference switches to ``_infer_instance_lm``.
        *args, **kwargs: forwarded to the parent constructor.
    """
    super().__init__(*args, **kwargs)

    # Unknown pairs default to -inf.
    self.costs = defaultdict(itertools.repeat(float('-inf')).__next__)
    self.dictionary = dictionary
    self.window = window
    self.find_candidates = (
        self._find_candidates_window_0
        if self.window == 0
        else self._find_candidates_window_n
    )

    # Identity pairs cost log(1): the empty string, the two bracket
    # symbols, then every alphabet character.
    for symbol in ('', '⟬', '⟭'):
        self.costs[(symbol, symbol)] = log(1)
    for char in self.dictionary.alphabet:
        self.costs[(char, char)] = log(1)

    # load() runs unconditionally (the file-existence guard is disabled).
    self.load()

    if lm_file:
        self.lm = kenlm.Model(str(expand_path(lm_file)))
        self.beam_size = 4
        self.candidates_count = 4
        self._infer_instance = self._infer_instance_lm
def main():
    """Score a hypothesis transcript file by LM perplexity and error rate.

    Loads reference and hypothesis transcripts, computes the error rate
    between them (optionally through a phonemizer), the LM perplexity of the
    hypotheses, optionally an error rate against ground truth, and logs
    ``log(lm_ppl) * max(wer, min_vt_uer)`` together with the components.
    """
    args = get_parser().parse_args()
    logger.debug(f"Args: {args}")

    ref_uid_to_tra = load_tra(args.ref_tra)
    hyp_uid_to_tra = load_tra(args.hyp_tra)
    # Every hypothesis id must also exist in the reference.
    assert set(hyp_uid_to_tra.keys()) <= set(ref_uid_to_tra.keys())

    lm = kenlm.Model(args.kenlm_path)
    skipwords = set(args.skipwords.split(","))

    def compute_lm_score(s):
        kept = " ".join(w for w in s.split() if w not in skipwords)
        if args.uppercase:
            kept = kept.upper()
        return lm.score(kept)

    # A lexicon takes precedence over the G2p model when phonemizing.
    g2p = None
    g2p_dict = None
    if args.phonemize and args.phonemize_lexicon:
        g2p_dict = load_lex(args.phonemize_lexicon)
    elif args.phonemize:
        g2p = G2p()

    wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict)
    lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score)

    # Stays -inf when no ground-truth transcripts were given.
    gt_wer = -math.inf
    if args.gt_tra:
        gt_wer = compute_wer(load_tra(args.gt_tra), hyp_uid_to_tra, None, None)

    score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
    logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%")
def load_sentence_aggregation(db_names, sa_name, n=None):
    """Return a scorer for sentence-aggregation candidates.

    Args:
        db_names: dataset names; sorted and joined to build the LM filename.
        sa_name: ``'random'`` or ``'markov'``.
        n: value interpolated into the LM filename for ``'markov'``.

    Returns:
        A callable taking a list of candidates and returning one score per
        candidate, or ``None`` when *sa_name* is neither supported value.
    """
    if sa_name == 'random':

        def random_sa_scorer(sas):
            return get_random_scores(len(sas))

        return random_sa_scorer

    if sa_name != 'markov':
        return None

    lm_filename = 'sa_lm_model_{}_{}.arpa'.format(n, '_'.join(sorted(db_names)))
    model = kenlm.Model(os.path.join(PRETRAINED_DIR, lm_filename))

    def scorer(sas):
        # sas is a list of partitioned triples, i.e. a list of lists of
        # triples, e.g. [[t1, t2], [t3]].
        return [model.score(preprocess_to_sa_model(sa)) for sa in sas]

    return scorer
def main(doc2vec_file, lm_file, comp_pred_file, candidates_file,
         complex_file, weights, dress_file, diff, output_file):
    """Rerank candidate simplifications and save the top sentences.

    Candidates are scored by language-model perplexity, by the loaded
    complexity predictions and by doc2vec similarity to the complex source
    sentence; ``rank_candidates`` combines these with *weights* and the
    result is written to *output_file*.

    Args:
        doc2vec_file: path of the doc2vec model to load.
        lm_file: path of the KenLM model.
        comp_pred_file: path of the sentence-complexity predictions.
        candidates_file: path of the candidate simplifications.
        complex_file: path of the complex (source) sentences.
        weights: passed through to ``rank_candidates``.
        dress_file: path of the DRESS sentences.
        diff: passed through to ``rank_candidates``.
        output_file: destination for the selected sentences.
    """
    ## Get complex sentences
    print("Reading in complex sentences...")
    complex_sents = get_sents(complex_file)
    print(len(complex_sents))
    print(complex_sents[0])

    ## Get dress sentences
    print("Reading in DRESS sentences...")
    dress_sents = get_sents(dress_file)
    # Report the DRESS sentences just read (previously this repeated the
    # complex-sentence statistics).
    print(len(dress_sents))
    print(dress_sents[0])

    ## Gets candidate simplifications
    print("Reading in candidates...")
    sentences = get_simple_sents(candidates_file)
    print(len(sentences))
    print(len(sentences[0]))

    ## Loads language model
    print("Loading kenlm model...")
    lm = kenlm.Model(lm_file)

    ## Get perplexity scores for each candidate sentence
    print("Calculating perplexities...")
    perplexities = get_perplexities(sentences, lm)

    ## Loads sentence complexity predictions
    print("Getting complexity predictions...")
    comp_preds = load_comp_preds(comp_pred_file, sentences)

    ## Loads doc2vec model
    print("Loading doc2vec model...")
    doc2vec = g.Doc2Vec.load(doc2vec_file)

    ## Gets embeddings for test sentences
    print("Getting complex embeddings...")
    start_alpha = 0.01
    infer_epoch = 1000
    complex_embeddings = get_complex_embeddings(complex_sents, doc2vec,
                                                start_alpha, infer_epoch)

    ## Gets embeddings for each sentence
    print("Getting embeddings...")
    embeddings = get_embeddings(sentences, doc2vec, start_alpha, infer_epoch)

    ## Calculate cosine similarities between complex and simple sentences
    print("Calculating similarities...")
    similarities = get_sims(complex_embeddings, embeddings, sentences)

    print("Rerank sentences...")
    ## Reranks sentences based on average of fluency, relevancy, and simplicity
    top_sentences = rank_candidates(sentences, dress_sents, perplexities,
                                    comp_preds, similarities, weights, diff)
    save_sentences(top_sentences, output_file)
def train_ngram_lm(kenlm_path, data_path, output_path, N, dedup_data_path=None):
    """
    Train a modified Kneser-Ney n-gram KenLM from a text file and load it.

    Runs ``bin/lmplz -o N --discount_fallback`` from ``<kenlm_path>/build``
    with *data_path* on stdin and writes the resulting ``.arpa`` file to
    *output_path*. Both paths are resolved against the current working
    directory.

    Args:
        kenlm_path: root of the KenLM checkout (must contain ``build/bin``).
        data_path: training text file.
        output_path: destination of the generated ARPA file.
        N: n-gram order.
        dedup_data_path: accepted for backward compatibility; currently
            unused because the deduplication step is disabled.

    Returns:
        The ``kenlm.Model`` loaded from the generated ARPA file.

    Raises:
        OSError: if ``lmplz`` cannot be started, a path cannot be opened, or
            ``lmplz`` exits with a non-zero status.
    """
    import subprocess  # local import: only needed to launch lmplz

    curdir = os.path.abspath(os.path.curdir)
    build_dir = os.path.join(curdir, kenlm_path, 'build')
    corpus_file = os.path.join(curdir, data_path)
    arpa_file = os.path.join(curdir, output_path)
    lmplz = os.path.join(build_dir, 'bin', 'lmplz')

    # Argument list plus explicit stdin/stdout instead of a shell string, so
    # paths with spaces or shell metacharacters are handled correctly.
    with open(corpus_file, 'rb') as corpus, open(arpa_file, 'wb') as arpa:
        completed = subprocess.run(
            [lmplz, '-o', str(N), '--discount_fallback'],
            cwd=build_dir,
            stdin=corpus,
            stdout=arpa,
        )
    if completed.returncode != 0:
        # Fail here rather than while loading a truncated ARPA file.
        raise OSError('lmplz exited with status {} while writing {}'.format(
            completed.returncode, arpa_file))

    load_kenlm()
    # create language model
    return kenlm.Model(arpa_file)
def main():
    """Write the mean sentence perplexity of every training article.

    Each article's sentences are scored with the ``pos4g_fresh.arpa`` model;
    the summed perplexity is divided by ``article.numberOfSentences`` and the
    values are written, one per line, to ``pos_4_train.txt``.
    """
    train_articles = pd.preprocessData("nltk_train.txt", "trainingSetLabels.dat")
    model = kenlm.Model('pos4g_fresh.arpa')

    perplexity = []
    for article in train_articles:
        total = sum(float(model.perplexity(sentence.string))
                    for sentence in article.allSentences)
        perplexity.append(float(total) / article.numberOfSentences)

    # Context manager so the file is closed even if a write fails.
    with open('pos_4_train.txt', 'w') as fp:
        fp.writelines(str(item) + '\n' for item in perplexity)
def langModelFeat(self, argString, preprocessReq=0):
    '''
    Extract n-gram language-model perplexity features.

    argString is "useGivenLM,ngramOrder,LMFilePath": when the first field is
    non-zero the given LM file and order are used, otherwise a trigram model
    is built by the preprocessor.

    With preprocessReq set, only the preprocessing steps are requested and 1
    is returned. Otherwise the sentences are scored with the SRILM ``ngram``
    binary (when available and KenLM is not configured), with the ``kenlm``
    package, or with ``pynlpl`` as a fallback when ``kenlm`` cannot be
    imported. Returns a ``sparse.lil_matrix`` of the scores.
    '''
    ngramOrder = 3
    langModel = 0
    # Binary1/0,ngramOrder,LMFilePath(ifBinary1)
    arguments = argString.split(',')
    if int(arguments[0]):
        # Use given langModel; quoted because it is spliced into a shell command.
        langModel = "\"{0}\"".format(arguments[-1])
        ngramOrder = int(arguments[1])

    if preprocessReq:
        # Request all preprocessing functions to be prepared
        if not langModel:
            langModel = self.preprocessor.buildLanguageModel(ngramOrder)
        self.preprocessor.getInputFileName()
        self.preprocessor.getBinariesPath()
        return 1

    sentsFile = self.preprocessor.getInputFileName()
    # Renamed from `kenlm` so the flag no longer shadows the module.
    srilmBinary, kenlmBinary = self.preprocessor.getBinariesPath()

    if not langModel:
        langModel = self.preprocessor.buildLanguageModel(ngramOrder)

    if srilmBinary and not kenlmBinary:
        pplFile = "tempLang{0}{1}.ppl".format(os.path.basename(sentsFile), ngramOrder)
        command = "\"{0}ngram\" -order {1} -lm {2} -ppl {3} -debug 1 -unk> {4}".format(
            srilmBinary, ngramOrder, langModel, sentsFile, pplFile)
        subprocess.call(command, shell=True)
        probab = self.extractValues(pplFile, self.preprocessor.getSentCount())
        os.remove(pplFile)
        return sparse.lil_matrix(probab)

    # The Python loaders open the file directly, so the shell quotes added
    # above must be removed first.
    lmPath = str(langModel).strip('"')

    try:
        # Plain import: the old `imp.find_module` probe raises ImportError on
        # Python 3.12+ (imp was removed), which forced the fallback even with
        # kenlm installed.
        import kenlm
    except ImportError:
        import pynlpl.lm.lm as pineApple
        arpaLM = pineApple.ARPALanguageModel(lmPath)
        probab = [[arpaLM.score(sent)]
                  for sent in self.preprocessor.gettokenizeSents()]
        return sparse.lil_matrix(probab)

    model = kenlm.Model(lmPath)
    probab = [[model.score(sent, bos=True, eos=True), model.perplexity(sent)]
              for sent in self.preprocessor.getPlainSentences()]
    return sparse.lil_matrix(probab)
def main():
    """Report perplexity statistics of the 'bigram' model on held-out text.

    Scores up to 1000 lines of the held-out file and either prints the
    mean/max/min perplexity or, when ``args.csv`` is given, appends them as
    one row to that CSV file.
    """
    args = parse()
    model = kenlm.Model('bigram')
    sample_size = 1000

    perplexities = []
    with open(
            '/u/demorali/corpora/1g-word-lm-benchmark-r13output/heldout-monolingual.tokenized.shuffled/news.en-00000-of-00100',
            'r') as f:
        for line in f:
            perplexities.append(model.perplexity(line))
            if len(perplexities) == sample_size:
                break

    # Computed once and reused by both output paths.
    mean_ppl = mean(perplexities)
    max_ppl = max(perplexities)
    min_ppl = min(perplexities)

    if args.csv is None:
        # Report the number of sentences actually scored; the file may hold
        # fewer than sample_size lines.
        print("""Test sentence count: {size} mean is {mean} max is {max} min is {min} """.format(
            mean=mean_ppl, max=max_ppl, min=min_ppl, size=len(perplexities)))
    else:
        # newline='' is required by the csv module for correct line endings.
        with open(args.csv, 'a', newline='') as f:
            csv.writer(f).writerow([mean_ppl, max_ppl, min_ppl])
def initialize_detector(self):
    """Load the language model, dictionaries and tokenizer for the detector.

    Loads the KenLM model, the word-frequency dictionary, the custom
    confusion set and the custom word dictionary (which is merged into the
    word frequencies), builds the tokenizer and marks the detector as
    initialized. Each step logs how long it took.
    """
    t1 = time.time()
    self.lm = kenlm.Model(self.language_model_path)
    t2 = time.time()
    default_logger.debug('Loaded language model: %s, spend: %s s' %
                         (self.language_model_path, str(t2 - t1)))

    # Word -> frequency dict.
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    t3 = time.time()
    default_logger.debug(
        'Loaded word freq file: %s, size: %d, spend: %s s' %
        (self.word_freq_path, len(self.word_freq), str(t3 - t2)))

    # Custom confusion set.
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    t4 = time.time()
    default_logger.debug(
        'Loaded confusion file: %s, size: %d, spend: %s s' %
        (self.custom_confusion_path, len(self.custom_confusion), str(t4 - t3)))

    # Custom segmentation dictionary, merged into the word frequencies.
    self.custom_word_dict = self.load_word_freq_dict(self.custom_word_path)
    self.word_freq.update(self.custom_word_dict)
    t5 = time.time()
    # Log the file that was actually loaded (previously the confusion path).
    default_logger.debug(
        'Loaded custom word file: %s, size: %d, spend: %s s' %
        (self.custom_word_path, len(self.custom_word_dict), str(t5 - t4)))

    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_dict,
                               custom_confusion_dict=self.custom_confusion)
    t6 = time.time()
    default_logger.info('Loaded dict ok, spend: %s s' % str(t6 - t1))
    self.initialized_detector = True
def klm_perplexity(string):
    """Return the perplexity of *string* under the ``kenlm/lm/test.arpa`` model.

    The model is loaded on the first call and cached on the function object,
    so later calls do not re-read the ARPA file.
    """
    model = getattr(klm_perplexity, "_model", None)
    if model is None:
        model = kenlm.Model('kenlm/lm/test.arpa')
        klm_perplexity._model = model
    return model.perplexity(string)
def initialize_detector(self):
    """Load the language model, dictionaries, tokenizer and BERT model.

    Loads the KenLM model, the word and character frequency tables, the
    custom confusion set and the custom word dictionaries (custom words,
    person names, place names and stopwords, merged into the word
    frequencies), then builds the tokenizer, the BERT tokenizer and the
    masked-LM model. Marks the detector as initialized when done.

    Raises:
        ImportError: if the ``kenlm`` package cannot be imported.
    """
    t1 = time.time()
    try:
        import kenlm
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            'mypycorrector dependencies are not fully installed, '
            'they are required for statistical language model.'
            'Please use "pip install kenlm" to install it.'
            'if you are Win, Please install kenlm in cgwin.') from err

    self.lm = kenlm.Model(self.language_model_path)
    logger.debug('Loaded language model: %s, spend: %s s' %
                 (self.language_model_path, str(time.time() - t1)))

    # Word -> frequency and char -> frequency dicts.
    t2 = time.time()
    self.word_freq = self.load_word_freq_dict(self.word_freq_path)
    self.char_freq = self.load_char_freq_dict(self.char_freq_path)
    t3 = time.time()
    logger.debug(
        'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
        (self.word_freq_path, len(self.word_freq), str(t3 - t2)))

    # Custom confusion set.
    self.custom_confusion = self._get_custom_confusion_dict(
        self.custom_confusion_path)
    t4 = time.time()
    logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                 (self.custom_confusion_path, len(self.custom_confusion),
                  str(t4 - t3)))

    # Custom segmentation dictionaries.
    self.custom_word_freq = self.load_word_freq_dict(
        self.custom_word_freq_path)
    self.person_names = self.load_word_freq_dict(self.person_name_path)
    self.place_names = self.load_word_freq_dict(self.place_name_path)
    self.stopwords = self.load_word_freq_dict(self.stopwords_path)

    # Merge the custom dictionaries into the word frequencies.
    self.custom_word_freq.update(self.person_names)
    self.custom_word_freq.update(self.place_names)
    self.custom_word_freq.update(self.stopwords)
    self.word_freq.update(self.custom_word_freq)
    t5 = time.time()
    # Log the custom word file path (previously the confusion path).
    logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                 (self.custom_word_freq_path, len(self.custom_word_freq),
                  str(t5 - t4)))

    self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                               custom_word_freq_dict=self.custom_word_freq,
                               custom_confusion_dict=self.custom_confusion)

    # BERT pretrained model.
    t6 = time.time()
    self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
    self.MASK_TOKEN = "[MASK]"
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
        [self.MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - t6))
    self.initialized_detector = True
def decode():
    """Interactively correct sentences typed on standard input.

    Optionally loads the KenLM model named by ``FLAGS.lmfile`` into the
    module-level ``lm``, prepares the NLC data and vocabulary (stored in the
    module-level ``vocab`` / ``reverse_vocab``), builds the model inside a
    TensorFlow session and then loops forever: read a sentence, print the
    candidate produced by ``fix_sent``.
    """
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is None:
        print('No lmfile, better to add kenlm arpa data file')
    else:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.Model(FLAGS.lmfile)

    # Only the vocabulary path of the prepared data is needed here.
    print("Preparing NLC data in %s" % FLAGS.data_dir)
    _x_train, _y_train, _x_dev, _y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, reverse_vocab = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        # Runs until interrupted or input() hits EOF.
        while True:
            typed = input("Enter a sentence: ")
            print("Candidate: ", fix_sent(model, sess, typed))
def __init__(self, lm_path, labels, blank_index=0, k=5, alpha=0.3, beta=5, prune=1e-3):
    """Create a prefix beam-search decoder with an optional KenLM model.

    Args:
        lm_path (str): Path to the kenlm language model. When falsy, no
            model is loaded and every prefix gets a language-model weight
            of 1.
        labels (list(str)): The characters, forwarded to the parent class.
        blank_index (int): Index of the blank character in `labels`.
        k (int): Beam width, stored as ``self.k``.
        alpha (float): Language model weight, stored as ``self.alpha``.
        beta (float): Language model compensation term, stored as
            ``self.beta``.
        prune (float): Emission-probability threshold, stored as
            ``self.prune``.
    """
    super(PrefixBeamSearchLMDecoder, self).__init__(labels, blank_index)

    if not lm_path:
        def constant_weight(s):
            return 1

        self.lm_weigh = constant_weight
    else:
        import kenlm

        self.lm = kenlm.Model(lm_path)

        def lm_weight(f):
            # kenlm scores are log10 probabilities; convert back.
            return 10**(self.lm.score(f))

        self.lm_weigh = lm_weight

    self.k, self.alpha, self.beta, self.prune = k, alpha, beta, prune
def main():
    """Score every text read from FILENAME and write the results to RESULT.

    Each output line is the text, a space and its KenLM score computed
    without begin/end-of-sentence markers.
    """
    lm = kenlm.Model(MODELNAME)
    texts = read(FILENAME)
    with open(RESULT, 'w') as out:
        for entry in texts:
            scored = lm.score(entry, bos=False, eos=False)
            out.write(entry + " " + str(scored) + "\n")
def __init__(self, form, config):
    """Set up rhyme schemes, models and the daily log file for *form*.

    Args:
        form: name of the poem form, stored as ``self.form``.
        config: passed to ``self.initializeConfig``.
    """
    def scheme(pattern):
        # One rhyme letter per verse; a space marks a stanza break and is
        # stored as the empty string.
        return tuple('' if mark == ' ' else mark for mark in pattern)

    self.structureDict = {
        'sonnet': scheme('abba cddc efe fef'),
        'short': scheme('abab cdcd'),
        'shorter': scheme('abab'),
        'pantoum': scheme('abcd bedf egfh gahc'),
        'flat': scheme('aa bb cc dd'),
    }
    self.form = form

    self.initializeConfig(config)
    self.loadRhymeDictionary()
    self.loadNMFData()
    self.generator = VerseGenerator(self.MODEL_FILE, self.entropy_threshold)
    self.loadVocabulary()
    self.ngramModel = kenlm.Model(self.NGRAM_FILE)

    # One append-mode log file per day under ./log, kept open on the instance.
    log_dir_exists = os.path.exists('log')
    if not log_dir_exists:
        os.makedirs('log')
    day_stamp = datetime.now().strftime("%Y%m%d")
    self.log = open('log/poem_' + day_stamp, 'a')
def loadResources(args):
    """Load the language model, NLP pipeline, spellchecker and word lists.

    Args:
        args: object whose ``model`` attribute is the KenLM model path.

    Returns:
        dict with the keys ``lm``, ``nlp``, ``gb``, ``gb_infl``, ``det`` and
        ``prep``.
    """
    # Resource paths are relative to this file's directory.
    here = os.path.dirname(os.path.realpath(__file__))
    return {
        # Language model built by KenLM: https://github.com/kpu/kenlm
        "lm": kenlm.Model(args.model),
        # spaCy pipeline
        "nlp": spacy.load("en"),
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        "gb": Hunspell("en_GB-large",
                       hunspell_data_dir=here + '/resources/spelling/'),
        # Inflection forms: http://wordlist.aspell.net/other/
        "gb_infl": loadWordFormDict(here + "/resources/agid-2016.01.19/infl.txt"),
        # Common determiners (the empty string stands for "none").
        "det": {"", "the", "a", "an"},
        # Common prepositions (the empty string stands for "none").
        "prep": {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
            "with",
        },
    }
def __init__(self, uid, name, order, path, bos, eos):
    """
    A language model scorer (KenLM only).

    :param uid: unique id (int)
    :param name: prefix for features; the two feature names are ``name`` and
        ``name_OOV``
    :param order: n-gram order
    :param path: path to a kenlm model (ARPA or binary).
    :param bos: a Terminal symbol representing the left boundary of the sentence.
    :param eos: a Terminal symbol representing the right boundary of the sentence.
    """
    super(StatelessLM, self).__init__(uid, name)
    self._order, self._bos, self._eos, self._path = order, bos, eos, path

    model = klm.Model(path)
    self._model = model
    self._features = (name, '{0}_OOV'.format(name))

    # Initial state, filled in by the model.
    start_state = klm.State()
    self._initial = start_state
    model.BeginSentenceWrite(start_state)
def train_ngram_lm(self, kenlm_path, data_path, output_path, n_gram):
    """
    Train a modified Kneser-Ney n-gram KenLM from a text file and load it.

    Runs ``bin/lmplz`` and ``bin/build_binary`` from ``<kenlm_path>/build``
    to create ``<output_path>.arpa`` and ``<output_path>.bin``. *data_path*
    and *output_path* are resolved against the current working directory,
    so the binary is loaded from the same place it was written to.

    Args:
        kenlm_path: root of the KenLM checkout (must contain ``build/bin``).
        data_path: training text file.
        output_path: prefix of the generated ``.arpa`` / ``.bin`` files.
        n_gram: n-gram order.

    Returns:
        The ``kenlm.Model`` loaded from ``<output_path>.bin``.

    Raises:
        OSError: if a tool cannot be started, a path cannot be opened, or the
            binary model was not produced (``FileNotFoundError``).
    """
    import kenlm
    import subprocess

    curdir = os.path.abspath(os.path.curdir)
    build_dir = os.path.join(curdir, kenlm_path, 'build')
    corpus_file = os.path.join(curdir, data_path)
    arpa_file = os.path.join(curdir, output_path + ".arpa")
    bin_file = os.path.join(curdir, output_path + ".bin")

    # Argument lists instead of shell strings; tool output is discarded as
    # before ("call without logging output").
    with open(corpus_file, 'rb') as corpus, open(arpa_file, 'wb') as arpa:
        subprocess.run(
            [os.path.join(build_dir, 'bin', 'lmplz'), '-o', str(n_gram)],
            cwd=build_dir,
            stdin=corpus,
            stdout=arpa,
            stderr=subprocess.DEVNULL,
        )
    subprocess.run(
        [os.path.join(build_dir, 'bin', 'build_binary'), '-s', arpa_file, bin_file],
        cwd=build_dir,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # The old `assert output_path + ".bin"` was always true; check the file.
    if not os.path.exists(bin_file):
        raise FileNotFoundError(
            'KenLM binary model was not created: {}'.format(bin_file))

    # create language model
    return kenlm.Model(bin_file)
def train_ngram_lm(self, kenlm_path, data_path, output_path, n_gram):
    """
    Train a modified Kneser-Ney n-gram KenLM from a text file and load it.

    Runs ``bin/lmplz --discount_fallback`` and ``bin/build_binary`` from
    ``<kenlm_path>/build`` to create the ARPA file *output_path* and the
    binary ``<output_path>.bin``. Training is retried a bounded number of
    times until the binary exists. *data_path* and *output_path* are
    resolved against the current working directory.

    Args:
        kenlm_path: root of the KenLM checkout (must contain ``build/bin``).
        data_path: training text file.
        output_path: destination of the ARPA file; the binary gets ``.bin``
            appended.
        n_gram: n-gram order.

    Returns:
        The ``kenlm.Model`` loaded from ``<output_path>.bin``.

    Raises:
        OSError: if a tool cannot be started, a path cannot be opened, or the
            binary model is still missing after all attempts
            (``FileNotFoundError``).
    """
    import kenlm
    import subprocess

    # Bounded: the original loop retried forever when training kept failing.
    max_attempts = 3

    curdir = os.path.abspath(os.path.curdir)
    build_dir = os.path.join(curdir, kenlm_path, 'build')
    corpus_file = os.path.join(curdir, data_path)
    arpa_file = os.path.join(curdir, output_path)
    bin_file = arpa_file + ".bin"
    lmplz = os.path.join(build_dir, 'bin', 'lmplz')
    build_binary = os.path.join(build_dir, 'bin', 'build_binary')

    for _attempt in range(max_attempts):
        # Argument lists instead of shell strings; tool output is discarded
        # as before ("call without logging output").
        with open(corpus_file, 'rb') as corpus, open(arpa_file, 'wb') as arpa:
            subprocess.run(
                [lmplz, '-o', str(n_gram), '--discount_fallback'],
                cwd=build_dir,
                stdin=corpus,
                stdout=arpa,
                stderr=subprocess.DEVNULL,
            )
        subprocess.run(
            [build_binary, '-s', arpa_file, bin_file],
            cwd=build_dir,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if os.path.exists(bin_file):
            break
    else:
        raise FileNotFoundError(
            'KenLM binary model was not created after {} attempts: {}'.format(
                max_attempts, bin_file))

    # create language model
    return kenlm.Model(bin_file)
def __init__(self, acc_path='pushkin_cvetaeva_new.bin', ppl_path='poets_5_kenlm.binary', pos_label='__label__positive'):
    """Load the packaged fastText classifier and KenLM perplexity model.

    Args:
        acc_path: package-relative resource name of the fastText classifier.
        ppl_path: package-relative resource name of the KenLM model.
        pos_label: name of the positive label in the fastText classifier.
    """
    resource_package = __name__

    # resource_filename returns a filesystem path directly. The previous
    # code opened two resource streams only to read `.name` and never
    # closed them.
    classifier_file = pkg_resources.resource_filename(resource_package, acc_path)
    ppl_model_file = pkg_resources.resource_filename(resource_package, ppl_path)

    self.classifier_yelp = fasttext.load_model(classifier_file)
    self.yelp_ppl_model = kenlm.Model(ppl_model_file)
    # name of positive label in fasttext classifier
    self.pos_label = pos_label
def generate_fir_sentence(topn=5, expend=3):
    """Generate candidate first lines and keep the *topn* best by LM score.

    Starts from the user-supplied beginnings returned by ``user_input()``,
    extends each candidate one character at a time using the module-level
    neural ``model``, filters candidates with ``judge_fir_tonal_pattern`` and
    finally ranks them with the ``first.poem.lm`` KenLM model.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm the indentation of the tonal filter
    and of the scoring section against the original file.

    Args:
        topn: maximum number of candidates returned.
        expend: number of distinct next characters tried per candidate.

    Returns:
        The best candidates, highest language-model score first.
    """
    fir, chars, head = user_input()
    candidate = []
    tmp = [[head[i]] for i in range(0, 4)]  # initialize
    # One candidate per user-provided beginning; slot 0 holds the line being built.
    for string in fir:
        tmp[0] = [ch for ch in string]
        candidate.append(copy.deepcopy(tmp))
    language_model = kenlm.Model("first.poem.lm")
    model.eval()
    # Grow every candidate's first line up to `chars` characters.
    for i in range(2, chars + 1):
        tmp = candidate[:]
        candidate = []
        for sen in tmp:
            if len(sen[0]) >= i:  # no need to produce
                candidate.append(sen)
            else:
                state = torch.zeros((1, feature_size), requires_grad=True)
                input_var = sentence_to_onehot(sen, chars)  # 20 * 1
                for k in range(1, i + 1):
                    out, state = model(input_var, state, 1, k)  # predict
                poss = out.data.reshape(-1).numpy().tolist()  # according to dl model
                get_top = []
                for _id, p in enumerate(poss):
                    get_top.append((_id, p))  # (id, possibility)
                get_top = sorted(get_top, key=lambda x: x[1], reverse=True)
                # `time` counts accepted characters (it is a local, not the
                # time module); `pt` walks down the ranked predictions.
                time = 0
                pt = 0
                while time < expend:
                    ch = id2char(get_top[pt][0])  # id to char
                    tmpflag = True
                    for each in sen:  # avoid duplicate
                        if ch in each:
                            tmpflag = False
                            break
                    if not tmpflag:
                        pt += 1
                        continue
                    # Append, snapshot the extended candidate, then undo so
                    # the next character starts from the same prefix.
                    sen[0].append(ch)
                    time += 1
                    pt += 1
                    candidate.append(copy.deepcopy(sen))
                    sen[0].pop()
        # Keep only candidates for which the tonal check returns >= 0.
        tmp = candidate[:]
        candidate = []
        for lines in tmp:
            if judge_fir_tonal_pattern(''.join(lines[0]), 5) >= 0:
                candidate.append(lines)
    score = []  # score after whole sentence
    for lines in candidate:
        score.append((lines, language_model.score(" ".join(lines[0]))))  # score the last sentence
    score = sorted(score, key=lambda x: x[1], reverse=True)
    score = score[0: min(topn, len(score))]
    candidate = [lines[0] for lines in score]
    return candidate
def recognize(args):
    """Decode every utterance in the recognition JSON and save the results.

    Loads the Transformer model (moved to the GPU), the character dictionary
    and the KenLM model, decodes each utterance listed under ``utts`` in
    ``args.recog_json`` and writes the n-best results as JSON to
    ``args.result_label``.
    """
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()

    char_list, sos_id, eos_id = process_dict(args.dict)
    # The dictionary must agree with the ids the decoder was trained with.
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        utterances = json.load(f)['utts']

    # import Language Model
    lm_model = kenlm.Model(args.lm_path)

    # decode each utterance
    decoded = {}
    total = len(utterances.keys())
    with torch.no_grad():
        for idx, (name, utt) in enumerate(utterances.items(), 1):
            print('(%d/%d) decoding %s' % (idx, total, name), flush=True)
            feats = kaldi_io.read_mat(utt['input'][0]['feat'])  # TxD
            feats = build_LFR_features(feats, LFR_m, LFR_n)
            feats = torch.from_numpy(feats).float()
            feats_length = torch.tensor([feats.size(0)], dtype=torch.int)
            feats = feats.cuda()
            feats_length = feats_length.cuda()
            nbest_hyps = model.recognize(feats, feats_length, char_list,
                                         lm_model, args)
            decoded[name] = add_results_to_json(utt, nbest_hyps, char_list)

    payload = json.dumps({'utts': decoded}, indent=4, sort_keys=True)
    with open(args.result_label, 'wb') as f:
        f.write(payload.encode('utf_8'))
def load_discourse_planning(db_names, dp_name, n=None):
    """Return a scorer for discourse-planning candidates.

    Args:
        db_names: dataset names; sorted and joined to build the LM filename.
        dp_name: ``'random'`` or ``'markov'``.
        n: value interpolated into the LM filename for ``'markov'``.

    Returns:
        ``random_dp_scorer`` for ``'random'``; for ``'markov'`` a callable
        that takes a list of triple lists and returns one KenLM score per
        entry; ``None`` for any other *dp_name*.
    """
    if dp_name == 'random':
        return random_dp_scorer

    if dp_name != 'markov':
        return None

    lm_filename = 'dp_lm_model_{}_{}.arpa'.format(n, '_'.join(sorted(db_names)))
    model = kenlm.Model(os.path.join(PRETRAINED_DIR, lm_filename))

    def scorer(triples_list):
        return [model.score(preprocess_to_dp_model(triples))
                for triples in triples_list]

    return scorer