def main(argv):
    del argv
    model1 = kenlm.LanguageModel(FLAGS.model1)
    model2 = kenlm.LanguageModel(FLAGS.model2)
    N = 0
    res = 0.
    transform_method = get_method(FLAGS.identifier)
    with open(FLAGS.test_file, "r") as ifile:
        for line in ifile:
            words = line[:-1].split(" ")
            transformed_words = [apply_methods(w, [transform_method]) for w in words]
            transformed_line = " ".join(transformed_words)
            transformed_words = ["<s>"] + transformed_words + ["</s>"]
            full_words = [apply_methods(w, [transform_method, get_word]) for w in words]
            for i, (prob, length, oov) in enumerate(model1.full_scores(transformed_line)):
                bigram = transformed_words[i + 2 - length:i + 2]
                # if len(bigram) < 2:
                #     # if bigram[0] == "<s>":
                #     #     res += prob
                #     #     N += 1
                #     continue
                # print(f"Bigram {i}: {bigram}")
                N += 1
                res += prob
                if bigram[-1] == "</s>" or oov or bigram[-1] in string.punctuation:
                    continue
                full_word = full_words[i]
                # print(f"Fullword {i}: {full_word}")
                full_scores = model2.full_scores(full_word)
                res += list(full_scores)[1][0]
    print(f"Perplexity {FLAGS.model1} + {FLAGS.model2}: {10 ** (-res / N)}")
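# The snippet above accumulates per-token log10 scores from full_scores() and turns
# them into a perplexity at the end. A minimal sketch of that identity with a single
# model; the path 'lm.bin' and the example sentence are hypothetical.
import kenlm

model = kenlm.LanguageModel('lm.bin')  # hypothetical path
sentence = 'language models assign probabilities'

# full_scores() yields one (log10 prob, matched n-gram length, is_oov) tuple per
# token, including the implicit </s>, so the sum reproduces model.score().
log10_total = sum(prob for prob, length, oov in model.full_scores(sentence))
assert abs(log10_total - model.score(sentence)) < 1e-4

# Perplexity is 10 ** (-average log10 probability per token), matching 10**(-res/N).
n_tokens = len(sentence.split()) + 1  # +1 for </s>
print(10 ** (-log10_total / n_tokens), model.perplexity(sentence))
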
def __init__(self, labels=['hin', 'eng'], transliteration=False):
    self.flag = transliteration
    self.labels = labels
    self.wxp = wxilp(order="wx2utf")
    path = os.path.abspath(__file__).rpartition('/')[0]
    self.tag_dct = {tag: i for i, tag in enumerate(labels)}
    self.tree, self.queue, self.blm_wp, self.blm_sp = list(), list(), list(), list()
    # load decision trees
    for tag in self.labels:
        if tag == "eng":
            self.tree.append("_")
            continue
        if tag == "kan":
            tag = "mal"
        with open('%s/decision_trees/eng-%s.json' % (path, tag)) as fp:
            self.tree.append(json.load(fp))
    # load language models
    for tag in self.labels:
        self.blm_wp.append(
            kenlm.LanguageModel('{}/blm_models/{}.tk.blm'.format(path, tag)))
        self.blm_sp.append(
            kenlm.LanguageModel('{}/blm_models/{}.ts.blm'.format(path, tag)))
    # load emoticon set
    with open('%s/extras/emoticons.txt' % path) as fp:
        self.emoticons = set(fp.read().split('\t'))
    self.reg = re.compile(r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
def main():
    if sys.argv[2] == "w":
        read_file = 'postExtract/wUmm/sample_' + sys.argv[1] + '.csv'
    elif sys.argv[2] == "wo":
        read_file = 'postExtract/woUmm/sample_' + sys.argv[1] + '.csv'
    with open(read_file, 'r') as csv_file_r:
        csv_file_w = open('umm_kenlm_output_' + sys.argv[1] + '.csv', 'w')
        reader = csv.DictReader(csv_file_r)
        fieldnames = ['filename', 'author', 'subreddit', 'title', 'lexicalType',
                      'lexicalItem', 'lexicalLength', 'lexicalIndex', 'originalText',
                      'text', 'sentLength', 'timestamp', 'sentScore', 'fullScores']
        writer = csv.DictWriter(csv_file_w, fieldnames=fieldnames)
        writer.writeheader()
        if sys.argv[3] == "crawl":
            model = kenlm.LanguageModel('../en.trie')
        elif sys.argv[3] == "reddit":
            model = kenlm.LanguageModel('../kenlm/build/reddit.binary')
        # sentence = 'this is a sentence .'
        # print(model.score(sentence))
        for r in reader:
            if r['filename'] not in filenames:
                filenames[r['filename']] = [r['filename']]
                print(r['filename'])
            score = model.score(r['text'])
            scoreArr = []
            for j, (prob, length, oov) in enumerate(model.full_scores(r['text'])):
                scoreArr.append(prob)
            writer.writerow({'filename': r['filename'], 'author': r['author'],
                             'subreddit': r['subreddit'], 'title': r['title'],
                             'lexicalType': r['lexicalType'], 'lexicalItem': r['lexicalItem'],
                             'lexicalLength': r['lexicalLength'], 'lexicalIndex': r['lexicalIndex'],
                             'originalText': r['originalText'], 'text': r['text'],
                             'sentLength': r['sentLength'], 'timestamp': r['timestamp'],
                             'sentScore': score, 'fullScores': scoreArr})
        csv_file_w.close()
        csv_file_r.close()
    print("Model Run Time: " + str(model_train_time) + " seconds")
    print("test_kenlm.py Run Time: " + str(time.time() - model_train_end_time) + " seconds")
def __init__(self, model_flag=0):
    self.model_flag = model_flag
    if model_flag == 0:
        self._broad_model = kenlm.LanguageModel("./model/gigaword_broad.bin")
        self._deep_model = kenlm.LanguageModel("./model/gigaword_deep.bin")
    else:
        self._deep_model = kenlm.LanguageModel("./model/ret_tuple_deep.bin")
        self._broad_model = kenlm.LanguageModel("./model/ret_tuple_broad.bin")
def __init__(self):
    self.count_1gram = {}
    self.count_2gram = {}
    self.count_3gram = {}
    self.count_4gram = {}
    self.count_5gram = {}
    self.KenLM_1gram = ''  # kenlm.LanguageModel(KenLM_path + 'CLOTH_NUM_N1.arpa')
    self.KenLM_2gram = kenlm.LanguageModel(KenLM_path + 'CLOTH_NUM_N2.arpa')
    self.KenLM_3gram = kenlm.LanguageModel(KenLM_path + 'CLOTH_NUM_N3.arpa')
    self.KenLM_4gram = kenlm.LanguageModel(KenLM_path + 'CLOTH_NUM_N4.arpa')
    self.KenLM_5gram = kenlm.LanguageModel(KenLM_path + 'CLOTH_NUM_N5.arpa')
def __init__(self, lid, htrans=None, etrans=None, wx=False):
    self.ed = enchant.Dict('en')
    self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
    self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
    self.so_dec_eng = so_viterbi(self.eblm)
    self.so_dec_hin = so_viterbi(self.hblm)
    self.e2h = {kv.split()[0]: kv.split()[1].split('|')
                for kv in io.open('dicts/ENG2HIN12M.dict')}
    self.h2e = {kv.split()[0]: kv.split()[1].split('|')
                for kv in io.open('dicts/HIN2ENG12M.dict')}
    self.meta = Meta()
    self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
    self.wx = wx
    if not self.wx:
        self.wxc = WXC(order='wx2utf', lang='hin')
def retrieve_models(self):
    for gl in self.level_sentences['train'].keys():
        try:
            self.models[gl] = kenlm.LanguageModel(
                self.path_to_arpa + '/gl_{}_n{}.arpa'.format(gl, self.n))
        except OSError:
            print('Grade Level {} failed'.format(gl))
def __init__(self, alpha, beta, model_path, oov_weight=1):
    self.alpha = alpha
    self.beta = beta
    self.oov_weight = oov_weight
    if not os.path.isfile(model_path):
        raise IOError("Invalid language model path: %s" % model_path)
    self.lm = kenlm.LanguageModel(model_path)
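# The constructor above only stores alpha and beta alongside the KenLM model. A
# hedged sketch of how such weights are commonly applied during beam-search decoding
# (shallow fusion); the helper name, its arguments, and the exact weighting follow
# the usual convention rather than this project's code, so treat them as assumptions.
import math
import kenlm

def shallow_fusion_score(lm, prefix_words, log_p_acoustic, alpha=0.5, beta=1.5):
    # KenLM returns log10 probabilities; convert to natural log before mixing.
    log_p_lm = lm.score(' '.join(prefix_words), bos=True, eos=False) * math.log(10)
    # alpha scales the LM contribution, beta rewards longer hypotheses.
    return log_p_acoustic + alpha * log_p_lm + beta * len(prefix_words)
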
def load_language_model(subreddit, start_year, start_month, end_month, ngrams,
                        text_min, text_max, base_path):
    """
    Loads and returns the language model for the given subreddit and time span.

    :param subreddit: subreddit name
    :param start_year: year
    :param start_month: month
    :param end_month: month
    :param ngrams: the n of ngrams
    :param text_min: minimum text length to warrant inclusion
    :param text_max: maximum text length
    :param base_path: the project directory. always the same.
    :return: the loaded kenlm.LanguageModel
    """
    language_model_base_path = base_path + "language_models/"  # TODO: make this global
    file_name = "{}_{}_{}_{}_{}_{}_{}.klm".format(subreddit, start_year, start_month,
                                                  end_month, ngrams, text_min, text_max)
    file_path = language_model_base_path + file_name
    print(file_path)
    if not os.path.isfile(file_path):
        raise ValueError("the language model has not been created")
    model = kenlm.LanguageModel(file_path)
    return model
def __init__(self): """load数据""" # self.config = config self.sim_dict = pickle.load( open('/home/peng.qiu/nlc-master/dataset/simp_simplified.pickle', 'rb')) _, self.vocab_to_int = get_config_from_json( 'configs/vocab_to_int.json') self.w2v_model = Word2Vec.load( '/data2/pengqiu/LM_data/w2v_news_size150.bin') # self.w2v_vocab = self.w2v_model.wv.vocab self.w2v_vocab = {} #出现次数少于50的都忽略 for item, value in self.w2v_model.wv.vocab.items(): if value.count > 40: self.w2v_vocab[item] = value self.thu = thulac.thulac( seg_only=False, user_dict='/data2/pengqiu/LM_data/cut_dict_2.txt') self.first_name = [ i.strip() for i in open('/home/peng.qiu/RNNLM_Project/configs/firstname.txt', 'r').readlines() ] # self.user_dict = [i.strip() for i in open('/data2/pengqiu/LM_data/cut_dict_2.txt').readlines()] self.hot_list = [ i.strip() for i in open( '/data2/pengqiu/LM_data/singer_dict.txt').readlines() ] self.casual_word = [ i.strip() for i in open( '/home/peng.qiu/RNNLM_Project/casual_word.txt').readlines() ] # self.lm = load_n_gram_model() self.lm = kenlm.LanguageModel('/data2/pengqiu/mix6_10.lm')
def _load(self):
    print("Loading model", self.name, '...', file=sys.stderr, end='')
    self.model = kenlm.LanguageModel(self.model_file)
    print(" reading raw ARPA data ... ", file=sys.stderr, end='')
    self.id2str, self.unigram_probs, bigrams = get_arpa_data(self.arpa_file)
    self.is_special = np.zeros(len(self.id2str), dtype=bool)
    for i, word in enumerate(self.id2str):
        assert self.model.vocab_index(word) == i, i
        if word[0] not in string.ascii_lowercase:
            self.is_special[i] = True
    # Since we give rare-word bonuses, count special words as super-common.
    self.unigram_probs_wordsonly = self.unigram_probs.copy()
    self.unigram_probs_wordsonly[self.is_special] = 0
    # ... but for finding the most common fallback words, count special words as impossible.
    unigram_probs_wordsonly_2 = self.unigram_probs.copy()
    unigram_probs_wordsonly_2[self.is_special] = -np.inf
    self.most_common_words_by_idx = np.argsort(unigram_probs_wordsonly_2)[-500:]
    print(" Encoding bigrams to indices... ", file=sys.stderr, end='')
    self.unfiltered_bigrams, self.filtered_bigrams = encode_bigrams(bigrams, self.model)
    # Vocab trie
    self.vocab_trie = datrie.BaseTrie(set(itertools.chain.from_iterable(self.id2str)))
    for i, s in enumerate(self.id2str):
        self.vocab_trie[s] = i
    self.eos_idx = self.model.vocab_index('</S>')
    self.eop_idx = self.model.vocab_index('</s>')
    print("Loaded.", file=sys.stderr)
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    config = tf.ConfigProto(device_count={'GPU': 0})
    with tf.Session(config=config) as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")
            output_sent = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine',
             sampling_parameter=0.5, min_substitutions=2, num_candidates=5,
             concatenate_corpora=True):
    self.sick_path = sick_path
    self.target_directory = target_directory
    self.lm_path = lm_path
    self.wsd_algorithm = wsd_algorithm
    self.sampling_parameter = sampling_parameter
    self.min_substitutions = min_substitutions
    self.num_candidates = num_candidates
    self.concatenate_corpora = concatenate_corpora
    self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
    self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')
    # Filter the original SICK corpus to match the expected format, and create a file for LM training
    if not os.path.exists(self.filtered_path) or not os.path.exists(self.noscore_path):
        self.filter_sick()
    if self.lm_path is None:
        raise ValueError('No language model provided! '
                         'Use the noscore_sick corpus to train an .klm LM, first.')
    else:
        self.language_model = kenlm.LanguageModel(self.lm_path)
def main():
    if len(sys.argv) != 4:
        print("usage: python2 get_score.py klm file option")
        return
    klm = sys.argv[1]
    path = sys.argv[2]
    option = sys.argv[3]  # 1 = score | 2 = #OOV
    with open(path) as file:
        lines = file.readlines()
    # lines = [line.strip() for line in lines]
    lines = [line.strip('.').strip().lower() for line in lines]
    model = kenlm.LanguageModel(klm)
    # model.score computes log10 p(sentence </s> | <s>)
    ln10 = math.log(10)  # math.log is base e
    if option == '1':
        for line in lines:
            n = len(line.split()) + 1  # +1 for </s>
            score = model.score(line) * ln10 / n
            print(score)
    elif option == '2':
        for line in lines:
            oov_count = 0
            words = line.split()
            for word in words:
                if word not in model:
                    oov_count += 1
            print(oov_count)
def __init__(self, lm, dictfile, ctxmodel=None, dictinit=''):
    self.lm = kenlm.LanguageModel(lm)
    self.voc = []
    self._vocid = LRUCache(64)
    self.ctx = pickle.load(open(ctxmodel, 'rb')) if ctxmodel else {}
    self.stopfn = lambda s: len(s) > 40 or len(s) > 3 and all(
        i == s[-1] for i in s[-3:])
    self.loaddict(dictfile, dictinit, True)
def __init__(self, lang, segmenter):
    self.lang = lang
    self.tm = pickle.load(open('assets/wikitweetweb.' + lang + '.tm'))
    cnf = kenlm.Config()
    cnf.load_method = 0  # 0 = lazy mmap loading
    self.lm = kenlm.LanguageModel('assets/wikitweetweb.' + lang + '.bin', cnf)
    self.segmenter = segmenter
def configure(self, config):
    if 'model' not in config:
        raise Exception(
            'Perhaps you forgot to configure {0}.model in your chisel.ini file?'.format(self.sid))
    self._model = kenlm.LanguageModel(config['model'])
    if 'name' in config:
        self._name = config['name']
def __init__(self, path, vocabulary):
    self.lm = kenlm.LanguageModel(path)
    self.vocabulary = vocabulary
    self.K = len(vocabulary)
    self.count = numpy.zeros(self.K)
    self.probs = numpy.zeros(self.K)
    for k in range(self.K):
        self.probs[k] = self.get_prob(k)
def main():
    model = kenlm.LanguageModel("../data/language_model.klm")
    paraphrase_dict = get_paraphrase_dict()
    print("loaded:", len(paraphrase_dict), "dictionary entries")
    sentence = ("in november he spent parts of 14 days in florida, "
                "including a break for thanksgiving")
    [score_sent, length_sent, inter_sent] = compress_sentence(model, sentence,
                                                              paraphrase_dict, True)
def get_compressed_sentence(sentence, paraphrase_dict, model=None):
    if model is None:
        model = kenlm.LanguageModel("../data/language_model.klm")
    [score_sent, length_sent, inter_sent] = compress_sentence(model, sentence,
                                                              paraphrase_dict)
    return [score_sent, length_sent, inter_sent]
def generateSummaries(sentences, length=100, mode="Extractive", ranker=rankingModes['TR']):
    '''
    This is where the ILP works to select the best sentences and form the summary
    '''
    if mode == "Abstractive":
        import kenlm
        lm = kenlm.LanguageModel(RESOURCES_DIR + '/lm-3g.klm')
        # Here sentences should be in POS-tagged format
        taggedsentences = []
        for sent in sentences:
            sent = sent.decode('utf-8', 'ignore')
            tagged_sent = ''
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
            for token in tagged_tokens:
                word, pos = token
                tagged_sent = tagged_sent + ' ' + word + "/" + pos
            taggedsentences.append(tagged_sent.strip())
        sentences = bigramTweetGenerator(taggedsentences)
        genSentences, svolist = wg.retrieveNewSentences(sentences, stopwords)
        if len(genSentences) <= 1:
            return [k for k, v in genSentences]
        finalSentencesRetained = wg.solveILPFactBased(genSentences, lm, stopwords, ranker,
                                                      intraGenSimThreshold=0.5,
                                                      l_max=length, mode="Abstractive")
        summary = txtFromSents(finalSentencesRetained)
        print("=======Summary:===== \n", summary)

    if mode == "Extractive":
        lm = []  # no language model needed in extractive mode
        # if len(sentences) <= 2:
        #     summary = txtFromSents(sentences)
        #     print("Summary: ", summary)
        #     return
        print(sentences)
        finalSentencesRetained = wg.solveILPFactBased(sentences, lm, stopwords, ranker,
                                                      intraGenSimThreshold=0.7,
                                                      l_max=length, mode="Extractive")
        print('Final sentences,', finalSentencesRetained)
        summary = txtFromSents(finalSentencesRetained)
        print("=======Summary:===== \n", summary)
def main():
    parser = argparse.ArgumentParser('LM main')
    parser.add_argument('--corpus', type=str, default='tiny_corpus.txt', help='corpus to train')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--drop', type=float, default=0.1)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--momentum', type=float, default=.99)
    parser.add_argument('--clip_norm', type=float, default=5)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--save', type=str, default='model.pt')
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--arpa', type=str, default='tiny_corpus.arpa')
    args = parser.parse_args()

    corpus = Corpus(args.corpus)
    loader = CorpusLoader(corpus, args.batch_size, True, args.num_workers)

    if args.load is None:
        extractor = EmbeddingExtractor(corpus.vocab, args.emb_dim, args.device)
        network = Network(extractor, args.num_layers, drop=args.drop).to(args.device)
    else:
        network = torch.load(args.load, map_location=args.device)
        network.extractor.device = args.device
        network.rnn.flatten_parameters()

    ken_lm = kenlm.LanguageModel(args.arpa)
    optimizer = torch.optim.SGD(network.parameters(), args.lr, args.momentum)
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10,
                                                           verbose=True, factor=.5)

    min_loss = float('inf')
    for epoch in range(args.epochs):
        pred = generate(network, device=args.device)
        gen_sentence = ' '.join(pred)
        ppl = ken_lm.perplexity(gen_sentence)
        print('%s\nPPL:\t%f' % (gen_sentence, ppl))

        loss = single_epoch(network, loader, optimizer, loss_fn, args.clip_norm)
        print('epochs %d \t loss %.3f' % (epoch, loss))
        scheduler.step(loss)

        if min_loss > loss:
            min_loss = loss
            print('saving to %s' % args.save)
            torch.save(network, args.save)
        print()
def __init__(self, psycho_path=None, lm_books_path=None, lm_news_path=None,
             embedding_model_path=None):
    """Load psycholinguistic norms, KenLM language models, and word embeddings."""
    if psycho_path:
        self.df = pd.read_csv(psycho_path, sep='\t')
        self.df_mean = self.df.mean(axis=0)
    if lm_books_path:
        self.lm_books = kenlm.LanguageModel(lm_books_path)
    if lm_news_path:
        self.lm_news = kenlm.LanguageModel(lm_news_path)
    if embedding_model_path:
        self.embeddings = KeyedVectors.load_word2vec_format(embedding_model_path,
                                                            binary=True)
    self.syllables = pyphen.Pyphen(lang='en')
def __init__(self, kenlm_dir, char_list, oov_penalty=1.0, open_vocab=True):
    super(NgramCharacterKenLM, self).__init__()
    self.model = kenlm.LanguageModel(kenlm_dir)
    self.order = self.model.order
    self.eos = len(char_list) - 1
    self.word_unk = 1
    self.log_oov_penalty = math.log(oov_penalty)
    self.open_vocab = open_vocab
    self.char_dict_size = len(char_list)
    self.normalized = True
def __init__(self, lm_file: str, use_log=True):
    self._pickle = lm_file.endswith('.p')
    self.use_log = use_log
    if lm_file.endswith('.p'):
        lm = pickle.load(open(lm_file, 'rb'))
        self.model = defaultdict(lambda: 1e-11, lm)
    elif lm_file.endswith('.arpa'):
        self.model = kenlm.LanguageModel(lm_file)
    else:
        raise Exception('Invalid Language Model File')
def index(text):
    valid = train.valid
    corp = train.corp
    models = [kenlm.LanguageModel('lm/' + code + '.binary') for code in corp]
    langs = dict(valid)
    text = text.lower()
    results = train.language(models, text)
    persistent = results
    return "(" + langs[results[0]] + "," + str(results[1]) + ")"
def __init__(self, lm_paths):
    """Extracts language model features.

    Parameters
    ----------
    lm_paths : list
        List of language model paths.
    """
    self.models = [kenlm.LanguageModel(lm) for lm in lm_paths]
def score_generated_sentences(generated_text_file_path, language_model_path):
    log_probs = list()
    import kenlm
    model = kenlm.LanguageModel(language_model_path)
    with open(generated_text_file_path) as generated_text_file:
        for sentence in generated_text_file:
            log_probs.append(model.score(sentence))
    return statistics.mean(log_probs)
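# model.score() wraps each line in <s> ... </s> by default, which penalizes
# unfinished generations. A minimal sketch of the bos/eos flags, assuming a
# hypothetical model file 'lm.bin' and a made-up fragment.
import kenlm

model = kenlm.LanguageModel('lm.bin')  # hypothetical path
fragment = 'a generated fragment that may not be a full sentence'

full = model.score(fragment)                            # log10 p(<s> fragment </s>)
partial = model.score(fragment, bos=False, eos=False)   # no <s> context, no </s>
print(full, partial)
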
def _test_log_s(sentences, sos, eos):
    lm_me = arpa.loadf(TEST_ARPA)[0]
    lm_ken = kenlm.LanguageModel(TEST_ARPA)
    results_me = []
    results_ken = []
    for sentence in sentences:
        score_me = lm_me.log_s(sentence, sos=sos, eos=eos)
        score_ken = lm_ken.score(sentence, bool(sos), bool(eos))
        results_me.append(score_me)
        results_ken.append(score_ken)
    assert all(round(m - k, 2) == 0 for m, k in zip(results_me, results_ken))
def __init__(self, ngram_model, token_list):
    """Initialize Ngrambase.

    Args:
        ngram_model: ngram model path
        token_list: token list from dict or model.json
    """
    self.chardict = [x if x != "<eos>" else "</s>" for x in token_list]
    self.charlen = len(self.chardict)
    self.lm = kenlm.LanguageModel(ngram_model)
    self.tmpkenlmstate = kenlm.State()
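# The constructor above only allocates a reusable kenlm.State; a minimal sketch of
# how such states are typically chained for incremental, token-by-token scoring.
# The model path and the token sequence are hypothetical.
import kenlm

model = kenlm.LanguageModel('char.arpa')  # hypothetical path
state_in, state_out = kenlm.State(), kenlm.State()

model.BeginSentenceWrite(state_in)  # start in begin-of-sentence context
total = 0.0
for token in ['h', 'e', 'l', 'l', 'o', '</s>']:
    # BaseScore returns the log10 probability of `token` given the context held in
    # state_in and writes the updated context into state_out.
    total += model.BaseScore(state_in, token, state_out)
    state_in, state_out = state_out, state_in  # ping-pong the two state objects
print(total)  # equals model.score('h e l l o') with default bos/eos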