Ejemplo n.º 1
0
    def langModelFeat(self, argString, preprocessReq=0):
        '''
        Extracts n-gram Language Model perplexity features.

        argString is a comma-separated specification of the form
        "useGivenLM(1/0),ngramOrder[,LMFilePath]". When the first field is
        non-zero, the last field is taken as the path of the language model
        to use; otherwise the preprocessor is asked to build one.

        When preprocessReq is true, the method only asks the preprocessor to
        prepare what the extraction needs and returns 1.

        Otherwise it returns a scipy.sparse lil_matrix built from:
          * the values parsed by self.extractValues from the output of the
            SRILM "ngram" tool, when an SRILM binary path is available and
            the kenlm flag is not set;
          * [score, perplexity] per plain sentence when the kenlm Python
            module can be imported;
          * [score] per tokenized sentence from pynlpl otherwise.
        '''
        # Binary1/0,ngramOrder,LMFilePath(ifBinary1)
        arguments = argString.split(',')
        langModel = 0
        if int(arguments[0]):
            # Use the given language model; quoted because it is
            # interpolated into a shell command below.
            langModel = "\"{0}\"".format(arguments[-1])

        ngramOrder = int(arguments[1])

        if preprocessReq:
            # Request all preprocessing functions to be prepared. The return
            # values are not needed here, only the calls themselves.
            if not langModel:
                self.preprocessor.buildLanguageModel(ngramOrder)
            self.preprocessor.getInputFileName()
            self.preprocessor.getBinariesPath()
            return 1

        sentsFile = self.preprocessor.getInputFileName()
        # The second element is a flag; it is kept under its own name so that
        # it cannot be confused with the kenlm module imported further down.
        srilmBinary, useKenlm = self.preprocessor.getBinariesPath()

        if not langModel:
            langModel = self.preprocessor.buildLanguageModel(ngramOrder)

        if srilmBinary and not useKenlm:
            pplFile = "tempLang{0}{1}.ppl".format(os.path.basename(sentsFile),
                                                  ngramOrder)
            command = "\"{0}ngram\" -order {1} -lm {2} -ppl {3} -debug 1 -unk> {4}".format(
                srilmBinary, ngramOrder, langModel, sentsFile, pplFile)

            subprocess.call(command, shell=True)
            try:
                probab = self.extractValues(pplFile,
                                            self.preprocessor.getSentCount())
            finally:
                # Do not leave the temporary file behind when parsing fails.
                os.remove(pplFile)
            return sparse.lil_matrix(probab)

        # The Python APIs take a plain file name, so drop the shell quoting
        # that was added for the command line above.
        if isinstance(langModel, str):
            lmPath = langModel.strip('"')
        else:
            lmPath = langModel

        # Import directly instead of probing through the "imp" module, which
        # no longer exists in Python 3.12+; only a failed import triggers the
        # pynlpl fallback, errors raised while scoring are not swallowed.
        try:
            import kenlm
        except ImportError:
            kenlm = None

        if kenlm is not None:
            model = kenlm.Model(lmPath)
            probab = [[model.score(sent, bos=True, eos=True),
                       model.perplexity(sent)]
                      for sent in self.preprocessor.getPlainSentences()]
            return sparse.lil_matrix(probab)

        import pynlpl.lm.lm as pineApple
        arpaLM = pineApple.ARPALanguageModel(lmPath)
        probab = [[arpaLM.score(sent)]
                  for sent in self.preprocessor.gettokenizeSents()]
        return sparse.lil_matrix(probab)
Ejemplo n.º 2
0
    def getPynlplScores(self, sentences, langModel):
        '''
        Scores every sentence with a pynlpl ARPA language model.

        Loads the ARPA model found at langModel, prints a notice, and returns
        a scipy.sparse lil_matrix built from one single-element row (the
        model score) per item of sentences.
        '''
        from pynlpl.lm.lm import ARPALanguageModel

        model = ARPALanguageModel(langModel)
        print("Using Pineapple")
        rows = [[model.score(sentence)] for sentence in sentences]
        return sparse.lil_matrix(rows)
Ejemplo n.º 3
0
    def backwardLangModelPOSFeat(self, argString, preprocessReq=0):
        '''
        Extracts backward n-gram POS language model perplexity features.

        argString is a comma-separated specification:
        "TaggedInput(1/0),LM(1/0),TaggedCorpus(1/0),ngramOrder" followed by
        the optional paths, in this order: the tagged input file (if
        TaggedInput is 1), then either the language model file (if LM is 1)
        or the tagged corpus file (if LM is 0 and TaggedCorpus is 1).

        When preprocessReq is true, the method only prepares the tagged
        input, tagged corpus and backward language model that were not
        supplied, and returns 1.

        Otherwise it returns a scipy.sparse lil_matrix built from:
          * the values parsed by self.extractValues from the output of the
            SRILM "ngram -reverse" tool, when an SRILM binary path is
            available and the kenlm flag is not set;
          * [score, perplexity] per POS-tagged sentence when the kenlm
            Python module can be imported;
          * [score] per POS-tagged sentence from pynlpl otherwise.
        '''
        langModel = ""
        taggedInput = ""
        taggedCorpus = ""
        # TaggedInput1/0,LM0/1,taggedCorpus0/1,ngramOrder(,TaggedPOSfile(ifTaggedInp1),
        # LMFilePath(ifLM1),taggedCorpus(if LM0&TaggedCorpus1))
        arguments = argString.split(',')
        if int(arguments[0]):
            # Use the given file of tagged sentences; quoted because it is
            # interpolated into a shell command below.
            taggedInput = "\"{0}\"".format(arguments[4])
        if int(arguments[1]):
            # The next path follows the tagged input path when one was given.
            langModel = "\"{0}\"".format(arguments[4 + int(arguments[0])])
        elif int(arguments[2]):
            taggedCorpus = "\"{0}\"".format(arguments[4 + int(arguments[0])])

        ngramOrder = int(arguments[3])

        if preprocessReq:
            # Request all preprocessing functions to be prepared
            if not taggedInput:
                taggedInput = self.prep_servs.dumpTokensTofile(
                    dumpFile="{0}_tagged_Input.txt".format(
                        self.preprocessor.getInputFileName()),
                    tokenSents=self.preprocessor.getPOStagged())
            if not langModel:
                if not taggedCorpus:
                    taggedCorpus = self.prep_servs.dumpTokensTofile(
                        dumpFile="{0}_tagged_Corpus.txt".format(
                            self.preprocessor.getCorpusLMName()),
                        tokenSents=self.prep_servs.tagPOSfromFile(
                            self.preprocessor.getCorpusLMName()))

                # Called for its effect of preparing the model; the returned
                # value is not needed on this path.
                self.preprocessor.buildBackwardLanguageModel(
                    ngramOrder, taggedCorpus, False)

            return 1

        if not taggedInput:
            # Same file name that the preprocessing pass dumps to.
            taggedInput = "{0}_tagged_Input.txt".format(
                self.preprocessor.getInputFileName())
        if not langModel:
            if not taggedCorpus:
                taggedCorpus = "{0}_tagged_Corpus.txt".format(
                    self.preprocessor.getCorpusLMName())
            langModel = self.preprocessor.buildBackwardLanguageModel(
                ngramOrder, taggedCorpus, False)

        # The second element is a flag; it is kept under its own name so that
        # it cannot be confused with the kenlm module imported further down.
        srilmBinary, useKenlm = self.preprocessor.getBinariesPath()

        if srilmBinary and not useKenlm:
            # Strip the surrounding quotes (only when present) so that
            # os.path.basename works on the bare file name; slicing with
            # [1:-1] would chop real characters off the unquoted default.
            pplFile = "tempLang{0}_reversed_{1}.ppl".format(
                os.path.basename(taggedInput.strip('"')), ngramOrder)

            command = "\"{0}ngram\" -order {1} -lm {2} -ppl {3} -debug 1 -unk -reverse > {4}".format(
                srilmBinary, ngramOrder, langModel, taggedInput, pplFile)

            subprocess.call(command, shell=True)
            try:
                probab = self.extractValues(pplFile,
                                            self.preprocessor.getSentCount())
            finally:
                # Do not leave the temporary file behind when parsing fails.
                os.remove(pplFile)
            return sparse.lil_matrix(probab)

        # The Python APIs take a plain file name, so drop the shell quoting
        # that was added for the command line above.
        if isinstance(langModel, str):
            lmPath = langModel.strip('"')
        else:
            lmPath = langModel

        # NOTE(review): the SRILM branch passes -reverse, while the Python
        # fallbacks score the sentences exactly as getPOStagged() returns
        # them - confirm whether those are already reversed.
        sentences = self.preprocessor.getPOStagged()

        # Import directly instead of probing through the "imp" module, which
        # no longer exists in Python 3.12+; only a failed import triggers the
        # pynlpl fallback, errors raised while scoring are not swallowed.
        try:
            import kenlm
        except ImportError:
            kenlm = None

        if kenlm is not None:
            model = kenlm.Model(lmPath)
            probab = [[model.score(sent, bos=True, eos=True),
                       model.perplexity(sent)]
                      for sent in sentences]
            return sparse.lil_matrix(probab)

        import pynlpl.lm.lm as pineApple
        arpaLM = pineApple.ARPALanguageModel(lmPath)
        probab = [[arpaLM.score(sent)] for sent in sentences]
        return sparse.lil_matrix(probab)
Ejemplo n.º 4
0
 def __init__(self, bigramARPAFilname: str):
     """Load the ARPA language model stored at bigramARPAFilname.

     The model is created with base_e=False and kept on self.bigrams.
     """
     arpa_model = lm.ARPALanguageModel(bigramARPAFilname, base_e=False)
     self.bigrams = arpa_model
Ejemplo n.º 5
0
# gather_captions_to_text(caption_dir, out_fpath)

# building  3-g LM
# https://cmusphinx.github.io/wiki/tutoriallm/#training-an-arpa-model-with-srilm

# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram-count -kndiscount -interpolate -text ../clotho-dataset/lm/dev.txt -lm ../clotho-dataset/lm/dev.lm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev.lm -ppl ../clotho-dataset/lm/eva.txt
# file ../clotho-dataset/lm/eva.txt: 5225 sentences, 64350 words, 0 OOVs
# 0 zeroprobs, logprob= -114470.2 ppl= 44.18532 ppl1= 60.09924

# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev.lm -prune 1e-8 -write-lm ../clotho-dataset/lm/dev_pruned.lm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev_pruned.lm -ppl ../clotho-dataset/lm/eva.txt
# file ../clotho-dataset/lm/eva.txt: 5225 sentences, 64350 words, 0 OOVs
# 0 zeroprobs, logprob= -114658.9 ppl= 44.46208 ppl1= 60.50634

# lm_path=b'../clotho-dataset/lm/dev.lm'
#
# lm = LM(lm_path, lower=False)
#
#
# print(len([b"man", b"a"]))
# print(lm.logprob_strings(lm, b"is", [b"man", b"a"]))

# Load the ARPA model. Note that this rebinds the name `lm`: from here on it
# refers to the model instance, not to whatever `lm` named before
# (presumably the imported language-model module).
lm = lm.ARPALanguageModel('../clotho-dataset/lm/dev.lm')

# Each probe is (printed label, word to score, extra keyword arguments).
probes = (
    ("man", "man", {}),
    ("a man", "man", {"history": ("a",)}),
    ("a hyman", "hyman", {"history": ("a",)}),
)
for label, word, extra in probes:
    print(label, lm.scoreword(word, **extra))
Ejemplo n.º 6
0
    def __init__(self,
                 vocab_size,
                 embedding_dim=128,
                 decoder_hidden_size_1=128,
                 decoder_hidden_size_2=128,
                 query_size=128,
                 value_size=128,
                 key_size=128,
                 isAttended=False,
                 beam_size=10,
                 use_lm_bigram=False,
                 use_lm_trigram=False,
                 lm_weight=0.,
                 teacher_forcing_ratio=0.9,
                 word2index=None,
                 index2word=None,
                 vocab=None,
                 device='cpu'):
        """Build the decoder layers and store the decoding configuration.

        Args:
            vocab_size: Number of embedding rows and width of the output
                layer ``character_prob``.
            embedding_dim: Width of the token embedding.
            decoder_hidden_size_1: Hidden size of the first LSTM cell.
            decoder_hidden_size_2: Hidden size of the second LSTM cell.
            query_size: Output width of ``query_network``; must equal
                ``value_size`` and ``key_size``.
            value_size: Added to ``embedding_dim`` to form the input width
                of the first LSTM cell.
            key_size: Only checked for equality with ``query_size``.
            isAttended: When true, an ``Attention()`` module is created.
            beam_size: Stored on ``self.beam_size``.
            use_lm_bigram: Stored; when this or ``use_lm_trigram`` is true,
                an ARPA language model is loaded from
                ``lm_dir + 'dev_pruned.lm'``.
            use_lm_trigram: See ``use_lm_bigram``.
            lm_weight: Stored on ``self.lm_w``.
            teacher_forcing_ratio: Stored on ``self.teacher_forcing_ratio``.
            word2index: Mapping that must contain ``'<eos>'``; that index is
                used as the embedding's ``padding_idx``. Required despite
                the ``None`` default.
            index2word: Stored on ``self.index2word``.
            vocab: Stored on ``self.vocab``.
            device: Stored on ``self.DEVICE``.

        Raises:
            TypeError: If ``word2index`` is None.
            AssertionError: If query, key and value sizes are not all equal.
        """
        # Fail fast with a clear message: the embedding below needs
        # word2index['<eos>'], which would otherwise fail late with an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        if word2index is None:
            raise TypeError(
                "BeamDecoder requires a word2index mapping containing "
                "'<eos>'; got None")

        assert query_size == value_size and query_size == key_size, "ERROR: decoder, query_size!=key_size or query_size!=value_size"

        super(BeamDecoder, self).__init__()

        print("BeamDecoder VOCAB SIZE:", vocab_size)

        self.beam_size = beam_size
        self.use_lm_bigram = use_lm_bigram
        self.use_lm_trigram = use_lm_trigram

        # Always define the LM attributes so that reading them never raises
        # AttributeError when no language model is in use.
        self.my_lm = None
        self.lm_w = lm_weight
        if use_lm_bigram or use_lm_trigram:
            # Alternative ARPA files next to this one: 'dev.lm', 'dev_eva.lm'.
            self.my_lm = lm_object.ARPALanguageModel(lm_dir + 'dev_pruned.lm')

        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=word2index['<eos>'])

        self.lstm1 = nn.LSTMCell(input_size=embedding_dim + value_size,
                                 hidden_size=decoder_hidden_size_1)
        self.lstm2 = nn.LSTMCell(input_size=decoder_hidden_size_1,
                                 hidden_size=decoder_hidden_size_2)
        self.query_size = query_size

        self.query_network = nn.Linear(decoder_hidden_size_2, query_size)
        self.teacher_forcing_ratio = teacher_forcing_ratio
        # https://casmls.github.io/general/2017/02/01/GumbelSoftmax.html
        self.gumbel_noise_weight = 1.0

        self.vocab_size = vocab_size
        self.word2index = word2index
        self.index2word = index2word
        self.vocab = vocab

        self.DEVICE = device

        self.isAttended = isAttended

        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(decoder_hidden_size_2 + query_size,
                                        vocab_size)