def langModelFeat(self, argString, preprocessReq=0):
    '''
    Extracts n-gram Language Model perplexity features.
    '''
    ngramOrder = 3
    langModel = 0
    # Arguments: Binary1/0,ngramOrder,LMFilePath(ifBinary1)
    arguments = argString.split(',')
    if int(arguments[0]):
        # Use the given language model
        langModel = "\"{0}\"".format(arguments[-1])
        ngramOrder = int(arguments[1])
    if preprocessReq:
        # Request all preprocessing functions to be prepared
        if not langModel:
            langModel = self.preprocessor.buildLanguageModel(ngramOrder)
        self.preprocessor.getInputFileName()
        self.preprocessor.getBinariesPath()
        return 1
    sentsFile = self.preprocessor.getInputFileName()
    srilmBinary, kenlm = self.preprocessor.getBinariesPath()
    if not langModel:
        langModel = self.preprocessor.buildLanguageModel(ngramOrder)
    if srilmBinary and not kenlm:
        # Score the input file with the SRILM ngram tool and parse its -debug 1 output
        pplFile = "tempLang{0}{1}.ppl".format(os.path.basename(sentsFile), ngramOrder)
        command = "\"{0}ngram\" -order {1} -lm {2} -ppl {3} -debug 1 -unk > {4}".format(
            srilmBinary, ngramOrder, langModel, sentsFile, pplFile)
        subprocess.call(command, shell=True)
        probab = self.extractValues(pplFile, self.preprocessor.getSentCount())
        os.remove(pplFile)
        return sparse.lil_matrix(probab)
    else:
        try:
            # Prefer kenlm when its Python module is available
            __import__('imp').find_module('kenlm')
            import kenlm
            # Strip the shell quoting before handing the path to kenlm
            model = kenlm.Model(langModel.strip('"'))
            probab = []
            for sent in self.preprocessor.getPlainSentences():
                probab.append([model.score(sent, bos=True, eos=True),
                               model.perplexity(sent)])
            return sparse.lil_matrix(probab)
        except ImportError:
            # Fall back to pynlpl's pure-Python ARPA reader
            import pynlpl.lm.lm as pineApple
            arpaLM = pineApple.ARPALanguageModel(langModel.strip('"'))
            probab = []
            for sent in self.preprocessor.gettokenizeSents():
                probab.append([arpaLM.score(sent)])
            return sparse.lil_matrix(probab)
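# The kenlm branch above emits two features per sentence: the total log10
# probability and the sentence perplexity. A minimal standalone sketch of that
# scoring loop, assuming the `kenlm` Python package; the model path and the
# example sentences are made up for illustration.
import kenlm
from scipy import sparse

model = kenlm.Model('lm.arpa')  # hypothetical ARPA/binary LM path

rows = []
for sent in ["a man is speaking", "birds are singing"]:
    # score() returns the total log10 probability with <s>/</s> markers;
    # perplexity() derives the per-word perplexity from the same pass.
    rows.append([model.score(sent, bos=True, eos=True), model.perplexity(sent)])

features = sparse.lil_matrix(rows)  # one row per sentence, two feature columns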
def getPynlplScores(self, sentences, langModel):
    import pynlpl.lm.lm as pineApple
    arpaLM = pineApple.ARPALanguageModel(langModel)
    probab = []
    print("Using Pineapple")
    for sent in sentences:
        probab.append([arpaLM.score(sent)])
    return sparse.lil_matrix(probab)
def backwardLangModelPOSFeat(self, argString, preprocessReq=0):
    '''
    Extracts n-gram POS language model perplexity features.
    '''
    ngramOrder = 3
    langModel = ""
    taggedInput = ""
    taggedCorpus = ""
    # Arguments: TaggedInput1/0,LM0/1,taggedCorpus0/1,ngramOrder
    # (,TaggedPOSfile(ifTaggedInp1),LMFilePath(ifLM1),taggedCorpus(if LM0&TaggedCorpus1))
    arguments = argString.split(',')
    if int(arguments[0]):
        # Use the supplied file of tagged sentences
        taggedInput = "\"{0}\"".format(arguments[4])
    if int(arguments[1]):
        # Next optional argument: an existing language model
        langModel = "\"{0}\"".format(arguments[4 + int(arguments[0])])
    elif int(arguments[2]):
        taggedCorpus = "\"{0}\"".format(arguments[4 + int(arguments[0])])
    ngramOrder = int(arguments[3])
    if preprocessReq:
        # Request all preprocessing functions to be prepared
        if not taggedInput:
            taggedInput = self.prep_servs.dumpTokensTofile(
                dumpFile="{0}_tagged_Input.txt".format(self.preprocessor.getInputFileName()),
                tokenSents=self.preprocessor.getPOStagged())
        if not langModel:
            if not taggedCorpus:
                taggedCorpus = self.prep_servs.dumpTokensTofile(
                    dumpFile="{0}_tagged_Corpus.txt".format(self.preprocessor.getCorpusLMName()),
                    tokenSents=self.prep_servs.tagPOSfromFile(self.preprocessor.getCorpusLMName()))
            # Build a backward language model over the tagged corpus
            langModel = self.preprocessor.buildBackwardLanguageModel(ngramOrder, taggedCorpus, False)
        return 1
    if not taggedInput:
        taggedInput = "{0}_tagged_Input.txt".format(self.preprocessor.getInputFileName())
    if not langModel:
        if not taggedCorpus:
            taggedCorpus = "{0}_tagged_Corpus.txt".format(self.preprocessor.getCorpusLMName())
        langModel = self.preprocessor.buildBackwardLanguageModel(ngramOrder, taggedCorpus, False)
    srilmBinary, kenlm = self.preprocessor.getBinariesPath()
    if srilmBinary and not kenlm:
        # Strip the quotes from the taggedInput filename so that
        # os.path.basename works correctly
        pplFile = "tempLang{0}_reversed_{1}.ppl".format(
            os.path.basename(taggedInput.strip('"')), ngramOrder)
        command = "\"{0}ngram\" -order {1} -lm {2} -ppl {3} -debug 1 -unk -reverse > {4}".format(
            srilmBinary, ngramOrder, langModel, taggedInput, pplFile)
        subprocess.call(command, shell=True)
        probab = self.extractValues(pplFile, self.preprocessor.getSentCount())
        os.remove(pplFile)
        return sparse.lil_matrix(probab)
    else:
        try:
            # Prefer kenlm when its Python module is available
            __import__('imp').find_module('kenlm')
            import kenlm
            model = kenlm.Model(langModel.strip('"'))
            probab = []
            for sent in self.preprocessor.getPOStagged():
                probab.append([model.score(sent, bos=True, eos=True),
                               model.perplexity(sent)])
            return sparse.lil_matrix(probab)
        except ImportError:
            # Fall back to pynlpl's pure-Python ARPA reader
            import pynlpl.lm.lm as pineApple
            arpaLM = pineApple.ARPALanguageModel(langModel.strip('"'))
            probab = []
            for sent in self.preprocessor.getPOStagged():
                probab.append([arpaLM.score(sent)])
            return sparse.lil_matrix(probab)
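# The comma-separated argString packs several flags; a worked example of how
# the parsing above consumes it (the file names are made up for illustration):
#
#   argString = "1,1,0,3,tagged_input.txt,pos.lm"
#
#   arguments[0] == "1"  -> taggedInput = "tagged_input.txt" (arguments[4])
#   arguments[1] == "1"  -> langModel   = arguments[4 + 1] == "pos.lm"
#   arguments[3] == "3"  -> ngramOrder  = 3
#
# With arguments[0] == "0" the optional file list shifts left by one position,
# which is why the LM/corpus lookups index with 4 + int(arguments[0]).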
def __init__(self, bigramARPAFilename: str):
    self.bigrams = lm.ARPALanguageModel(bigramARPAFilename, base_e=False)
# gather_captions_to_text(caption_dir, out_fpath)

# Building a 3-gram LM with SRILM:
# https://cmusphinx.github.io/wiki/tutoriallm/#training-an-arpa-model-with-srilm
#
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram-count -kndiscount -interpolate -text ../clotho-dataset/lm/dev.txt -lm ../clotho-dataset/lm/dev.lm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev.lm -ppl ../clotho-dataset/lm/eva.txt
#     file ../clotho-dataset/lm/eva.txt: 5225 sentences, 64350 words, 0 OOVs
#     0 zeroprobs, logprob= -114470.2 ppl= 44.18532 ppl1= 60.09924
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev.lm -prune 1e-8 -write-lm ../clotho-dataset/lm/dev_pruned.lm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev_pruned.lm -ppl ../clotho-dataset/lm/eva.txt
#     file ../clotho-dataset/lm/eva.txt: 5225 sentences, 64350 words, 0 OOVs
#     0 zeroprobs, logprob= -114658.9 ppl= 44.46208 ppl1= 60.50634

# lm_path = b'../clotho-dataset/lm/dev.lm'
# lm = LM(lm_path, lower=False)
# print(len([b"man", b"a"]))
# print(lm.logprob_strings(lm, b"is", [b"man", b"a"]))

# Bind the model to its own name so it does not shadow the pynlpl `lm` module
arpa_lm = lm.ARPALanguageModel('../clotho-dataset/lm/dev.lm')
print("man", arpa_lm.scoreword("man"))
print("a man", arpa_lm.scoreword("man", history=("a",)))
print("a hyman", arpa_lm.scoreword("hyman", history=("a",)))
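# SRILM's reported perplexities follow directly from the logprob line quoted
# above: ppl = 10 ** (-logprob / (words - OOVs + sentences)) and
# ppl1 = 10 ** (-logprob / (words - OOVs)). A quick sanity check against the
# unpruned dev.lm run (OOVs = 0 there):
logprob, words, sents = -114470.2, 64350, 5225  # numbers from the ngram -ppl output
ppl = 10 ** (-logprob / (words + sents))   # denominator counts </s> tokens -> ~44.185
ppl1 = 10 ** (-logprob / words)            # words only -> ~60.099
print(ppl, ppl1)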
def __init__(self, vocab_size, embedding_dim=128, decoder_hidden_size_1=128,
             decoder_hidden_size_2=128, query_size=128, value_size=128,
             key_size=128, isAttended=False, beam_size=10,
             use_lm_bigram=False, use_lm_trigram=False, lm_weight=0.,
             teacher_forcing_ratio=0.9, word2index=None, index2word=None,
             vocab=None, device='cpu'):
    super(BeamDecoder, self).__init__()
    print("BeamDecoder VOCAB SIZE:", vocab_size)
    self.beam_size = beam_size
    self.use_lm_bigram = use_lm_bigram
    self.use_lm_trigram = use_lm_trigram
    if use_lm_bigram or use_lm_trigram:
        # Load the pruned ARPA LM used to rescore hypotheses (weighted by lm_weight)
        self.my_lm = lm_object.ARPALanguageModel(lm_dir + 'dev_pruned.lm')
        # self.my_lm = lm_object.ARPALanguageModel(lm_dir + 'dev.lm')
        # self.my_lm = lm_object.ARPALanguageModel(lm_dir + 'dev_eva.lm')
        self.lm_w = lm_weight
    self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                  padding_idx=word2index['<eos>'])
    self.lstm1 = nn.LSTMCell(input_size=embedding_dim + value_size,
                             hidden_size=decoder_hidden_size_1)
    self.lstm2 = nn.LSTMCell(input_size=decoder_hidden_size_1,
                             hidden_size=decoder_hidden_size_2)
    self.query_size = query_size
    assert query_size == value_size and query_size == key_size, \
        "ERROR: decoder, query_size != key_size or query_size != value_size"
    self.query_network = nn.Linear(decoder_hidden_size_2, query_size)
    self.teacher_forcing_ratio = teacher_forcing_ratio
    # https://casmls.github.io/general/2017/02/01/GumbelSoftmax.html
    self.gumbel_noise_weight = 1.0
    self.vocab_size = vocab_size
    self.word2index = word2index
    self.index2word = index2word
    self.vocab = vocab
    self.DEVICE = device
    self.isAttended = isAttended
    if isAttended:
        self.attention = Attention()
    self.character_prob = nn.Linear(decoder_hidden_size_2 + query_size, vocab_size)
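# How the LM enters the beam search is not shown in this snippet. Below is a
# minimal sketch, assuming the usual shallow-fusion recipe: decoder log-probs
# plus lm_w times the pynlpl scoreword log scores. The function and its
# per-step usage are hypothetical; only my_lm, lm_w, and index2word correspond
# to attributes set in __init__ above.
import torch

def fuse_lm_step(decoder_logits, history, my_lm, index2word, lm_w):
    """Add weighted n-gram LM scores to one decoding step's distribution.

    decoder_logits: (vocab_size,) raw scores from character_prob
    history: tuple of previously emitted words, e.g. ("a",) for a bigram LM
    """
    log_probs = torch.log_softmax(decoder_logits, dim=-1)
    fused = log_probs.clone()
    # Looping over the whole vocabulary is slow but keeps the sketch simple
    for idx in range(log_probs.size(0)):
        fused[idx] = log_probs[idx] + lm_w * my_lm.scoreword(index2word[idx],
                                                             history=history)
    return fused  # rank beam candidates by this fused score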