Example #1
0
    def __init__(self, wordlist='Brown', NLength=2, prec=50):
        '''
        Initialises the class, populating the frequency data structures.
        @param wordlist: which master corpus to use; the default is Brown
        @note: Possible inputs -
                Brown
                AmE06
                BE06
        @param NLength: length of the n-grams to calculate the log for; default 2
        @param prec: the precision of the log calculation
        '''
        ng = NGram()
        getcontext().prec = prec

        if wordlist == 'Brown':
            self.fdist2 = nltk.FreqDist(nltk.corpus.brown.words())
            self.fdist2Gram = nltk.FreqDist(
                ng.NGramUn(nltk.corpus.brown.raw(), n=NLength)[0])
        elif wordlist == 'AmE06':
            self.fdist2 = nltk.FreqDist(AmE06().getCorpa().words())
            self.fdist2Gram = nltk.FreqDist(
                ng.NGramUn(AmE06().getCorpa().raw(), n=NLength)[0])
        elif wordlist == 'BE06':
            self.fdist2 = nltk.FreqDist(BE06().getCorpa().words())
            self.fdist2Gram = nltk.FreqDist(
                ng.NGramUn(BE06().getCorpa().raw(), n=NLength)[0])
        return
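
For reference, a minimal sketch of the NLTK frequency-distribution calls this constructor relies on (assumes nltk is installed and the Brown corpus has been downloaded; the NGram helper used above is project-specific and not shown here):

import nltk
# nltk.download('brown')  # required once, if the Brown corpus is not already present

# unigram frequency distribution over the Brown corpus, as used for self.fdist2
fdist = nltk.FreqDist(nltk.corpus.brown.words())
print(fdist.most_common(5))   # the five most frequent tokens
print(fdist['the'])           # raw count of a single token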
Example #2
0
def retrieve_NGram():
    configs = get_configs()
    if configs['corpus'] != 'rock':
        fname = 'bigram-bach.pkl'
        symbol_fname = 'rn2letter-bach.pkl'
    else:
        fname = 'bigram-rock.pkl'
        symbol_fname = 'rn2letter-rock.pkl'
    print '...retrieve_NGram fname', fname
    ngram = NGram(fname=fname)

    ngram.rn2letter, ngram.letter2rn, ngram.syms = \
        update_syms(configs, symbol_fname, ngram.syms)

    print 'retrieve_NGram, # of syms', len(ngram.syms)

    return ngram
Example #3
0
    def gerarNGrams(self, n):
        # doubly linked list that stores every n-gram of this document
        self.__lista = l.DListaEncadeada()
        entrada = self.__texto.split()
        # slide a window of size n over the token list
        lista_termos = np.empty(len(entrada) - n + 1, dtype=object)
        for x in range(len(entrada) - n + 1):
            lista_termos[x] = entrada[x:x + n]
        for x in range(len(lista_termos)):
            self.__lista.anexar(NGram(lista_termos[x], n))
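
The core of the routine above is an ordinary sliding window over the token list; a minimal, dependency-free sketch of the same idea (DListaEncadeada and NGram are project classes and are left out):

def ngrams(tokens, n):
    # one window of n consecutive tokens starting at each position
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

print(ngrams("the quick brown fox".split(), 2))
# [['the', 'quick'], ['quick', 'brown'], ['brown', 'fox']]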
Example #4
0
def do_ngram():
    print("~~~running ngram extraction~~~")
    print("making samples", end=" ... ")
    sample_start = process_time()
    samples = []
    length = 0
    lmrnn = LanguageModel(rnn)
    while length < args.ngram_total_sample_length:
        s = lmrnn.sample(cutoff=args.ngram_max_sample_length)
        samples.append(s)
        length += (len(s) + 1)  # ending the sequence is also a sample
    ngrams = {}
    ngrams_folder = rnn_folder + "/ngram"
    prepare_directory(ngrams_folder)
    sample_time = process_time() - sample_start
    print("done, that took:", clock_str(sample_start))
    print("making the actual ngrams", end=" ... ")
    with open(ngrams_folder + "/samples.txt", "w") as f:
        print(len(samples), len(rnn.internal_alphabet), file=f)
        for s in samples:
            print(len(s), *s, file=f)
    for n in args.ngram_ns:
        ngram_start = process_time()
        ngram = NGram(n, rnn.input_alphabet, samples)
        ngram.creation_info = {
            "extraction time": sample_time + process_time() - ngram_start,
            "size": len(ngram._state_probs_dist),
            "n": n,
            "total samples len (including EOS)": length,
            "num samples": len(samples),
            "samples cutoff len": args.ngram_max_sample_length
        }
        overwrite_file(ngram, ngrams_folder + "/" + str(n))
        ngrams[n] = ngram
    with open(ngrams_folder + "/creation_infos.txt", "w") as f:
        print("ngrams made from",
              len(samples),
              "samples, of total length",
              length,
              "(including EOSs)",
              file=f)
        for n in ngrams:
            print("===", n, "===\n", ngrams[n].creation_info, "\n\n", file=f)
    print("done, that took overall", clock_str(sample_start))
    return ngrams
Example #5
0
    def __init__(self,
                 llwl='Brown',
                 llNL=2,
                 percen=80,
                 NE=True,
                 Col=True,
                 Gram=True,
                 Chu=True):
        '''
        @param llwl: log-likelihood corpus name ('Brown', 'AmE06', 'BE06')
        @param llNL: log-likelihood n-gram length; default 2
        @param percen: percentile cut-off controlling how much of the output is returned; default 80
        @param NE: use named-entity extraction; default True
        @param Col: use collocations; default True
        @param Gram: use n-grams; default True
        @param Chu: use chunking; default True
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print 'Starting to build ', llwl
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print 'LL Loaded'
        self.POS = POS()
        print 'POS Loaded'
        self.GD = GetData()
        print 'GD Loaded'
        self.Cu = Chunker(self.POS)
        print 'Cu Loaded'
        self.FL = Filter()
        print 'FL Loaded'
        self.CC = Collocation(self.POS)
        print 'CC Loaded'
        self.Ng = NGram()
        print 'Ng Loaded'
        self.S = Select(percentil=self.p)
        print 'S Loaded'
        self.To = Tokenize(self.FL)
        print 'To Loaded'
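
A hedged sketch of how this constructor might be called (the enclosing class name is not visible in this snippet; Example #10 below shows it as runable):

# illustrative only: build the pipeline on the Brown corpus with bigram log-likelihood,
# switching the collocation and chunking stages off
extractor = runable(llwl='Brown', llNL=2, percen=80, Col=False, Chu=False)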
Example #6
0
    def gerarNGrams(self, n):
        '''Generates the document's n-grams of size n, using the sliding-window technique.'''
        self.__lista = l.DListaEncadeada()
        entrada = self.__texto
        # iterate only up to len(entrada) - n + 1 so the final window is not malformed
        for x in range(len(entrada) - n + 1):
            # store the index range that identifies the n-gram, rather than the words themselves
            indice = (x, x + n)
            # reference to the document that generated this n-gram
            doc = self
            # doubly linked list that stores every n-gram of this document
            self.__lista.anexar(NGram(indice, doc, n))
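
Unlike Example #3, this variant stores only the (start, end) index pair of each window rather than the words themselves; a minimal sketch of that indexing scheme:

def ngram_spans(tokens, n):
    # (start, end) slice indices for each window of n consecutive tokens
    return [(i, i + n) for i in range(len(tokens) - n + 1)]

tokens = "the quick brown fox".split()
print(ngram_spans(tokens, 2))   # [(0, 2), (1, 3), (2, 4)]
print(tokens[0:2])              # ['the', 'quick'] -- recovered from the first span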
Example #7
0
def main():
    training_set = TrainingSetHandler()
    training_set.load_training_set()
    gram_list = []

    for size in Constants.SIZE_OF_GRAMS:
        for language in training_set.language_list:
            gram = NGram(size, string.ascii_lowercase, 0.5)
            gram.train(training_set.training_set[language], language)
            gram_list.append(gram)

    #dump copies of grams to file
    dump_grams(gram_list)

    predic = Predictor(gram_list)
    test_set_handler = TestSetHandler()
    test_set_handler.load_test_sentence()
    for idx, sentence in enumerate(test_set_handler.test_set):
        clean_sentence = "".join([c for c in sentence[1] if c.isalpha()]).lower()
        prediction = predic.predict_this_sentence(clean_sentence)
        with open(os.path.join(Constants.OUTPUT_PATH, "out{}.txt".format(idx)), 'w') as f:
            output = OutputHelper(prediction, sentence, f)
            output.print_and_save_output()
Example #8
0
def run_model(trainingdata, N, strings, smoothing=False):
    '''
    Simple helper function to run each model configuration
    '''
    trainingdata.seek(0)
    model = NGram(N, trainingdata, smoothing)
    print(model.get_model_formatted())
    sys.stdout.write(os.linesep)
    print("Sentence Probability Evaluation")
    for s in strings:
        print("Log(p) = %08.4f".ljust(20) % model.sentence_probability(s) +
              "Sentence: <s> %s </s>" % s)
    sys.stdout.write(os.linesep)
    if smoothing:
        print("Generated Sentences")
        for s in model.generate_sentence(10):
            print("Log(p) = %08.4f".ljust(20) % model.sentence_probability(s) +
                  "Sentence: %s" % s)
        sys.stdout.write(os.linesep)
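
A hedged usage sketch for the helper above (the file name and test sentence are illustrative, not from the original project; NGram here is the project's own model class):

# illustrative only: train a bigram model with smoothing and score one sentence
with open('training.txt') as trainingdata:
    run_model(trainingdata, 2, ['the cat sat on the mat'], smoothing=True)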
Example #9
0
def get_models(data=None, configs=None, save=False):
    if configs is None:
        configs = get_configs()

    if data is None:
        data = get_data(configs)

    # TODO: remove hack
    if configs["bigram"]:
        reduced_keys = [configs["reduced_key"]]
        data.keys = reduced_keys
        test_data = data.get_test_data()
        test_data.keys = reduced_keys

    retrieve_model = configs["retrieve_model"]
    model = SkipGramNN(data, configs)
    print "SkipGramNN, # of syms", len(model.syms)

    if not retrieve_model:
        model_loss = model.train()
        if save:
            model.save("skipgram-%s.pkl" % (configs["corpus"]))
            plt.clf()
            plt.plot(model.loss_curve)
            plt.savefig("losses-%s.png" % configs.name)
        print "=== train loss ==="
        print "loss: %.2f" % model_loss
        loss = model.check_loss()
        if not configs["regularize"]:
            assert np.allclose(loss, model_loss)

        if save:
            model_weights = model.weights.value
            fname = "w1-%s.pkl" % configs.name
            print fname
            with open(fname, "wb") as p:
                pickle.dump(model.W1.value, p)
                pickle.dump(data.syms, p)
                pickle.dump(model_loss, p)
                pickle.dump(model_weights, p)
                pickle.dump(configs, p)

            fname = "skipgram-bach.pkl"
            model.save(fname)
    else:
        fname = os.path.join("data", "test_skipgram_model.pkl")
        print fname
        assert fname is not None, "Error: no model to retrieve in the time being"
        with open(fname, "rb") as p:
            w1 = pickle.load(p)
            syms = pickle.load(p)
            model_loss = pickle.load(p)
            model_weights = pickle.load(p)
            configs_reloaded = pickle.load(p)
        for key in configs.keys():
            if key not in configs_reloaded.keys():
                print "no key", key
        for key in configs.keys():
            if key in configs_reloaded.keys():
                if configs[key] != configs_reloaded[key]:
                    print configs[key], configs_reloaded[key]

        # assert configs == configs_reloaded
        model.init_weights(model_weights, model_loss)

    train_seq_data = data.get_train_seqs_data()
    train_seqs = [seq for seq in train_seq_data.seqs]
    syms = data.syms

    # ngram_model = NGram(train_seqs, syms, 2, configs)
    ngram_model = NGram(data.seqs, syms, 2, configs)
    print "\n\nNgram, # of syms", len(ngram_model.syms)
    if save:
        ngram_model.save("bigram-%s.pkl" % (configs["corpus"]))
    print len(ngram_model.syms), len(model.data.syms)
    assert ngram_model.syms == model.data.syms

    return model, ngram_model
Example #10
0
class runable(object):
    '''
    Class for selecting and extracting keywords from online content.
    '''
    def __init__(self,
                 llwl='Brown',
                 llNL=2,
                 percen=80,
                 NE=True,
                 Col=True,
                 Gram=True,
                 Chu=True):
        '''
        @param llwl: log-likelihood corpus name ('Brown', 'AmE06', 'BE06')
        @param llNL: log-likelihood n-gram length; default 2
        @param percen: percentile cut-off controlling how much of the output is returned; default 80
        @param NE: use named-entity extraction; default True
        @param Col: use collocations; default True
        @param Gram: use n-grams; default True
        @param Chu: use chunking; default True
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print 'Starting to build ', llwl
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print 'LL Loaded'
        self.POS = POS()
        print 'POS Loaded'
        self.GD = GetData()
        print 'GD Loaded'
        self.Cu = Chunker(self.POS)
        print 'Cu Loaded'
        self.FL = Filter()
        print 'FL Loaded'
        self.CC = Collocation(self.POS)
        print 'CC Loaded'
        self.Ng = NGram()
        print 'Ng Loaded'
        self.S = Select(percentil=self.p)
        print 'S Loaded'
        self.To = Tokenize(self.FL)
        print 'To Loaded'

    def Select(self, url, depth):
        '''
        Determine the best keywords for a webpage.

        @param url: the base URL to start sampling from
        @param depth: the depth of the website to be sampled

        @return: the list of selected keywords, ordered with the highest-rated words at the lower bound of the array.
        '''
        #Get data from web page
        text = self.GD.getWebPage(url, depth)

        #Tokenize sentences and words
        tok = self.To.Tok(text)

        #POS tag the text
        pos = self.POS.POSTag(tok, 'tok')

        #Log likelihood
        log = self.LL.calcualte(tok)

        #Collocations
        if self.Col == True:
            col = self.CC.col(pos, tok)
        else:
            col = []

        #NE Extraction
        if self.NEs == True:
            ne = self.Cu.Chunks(pos,
                                nodes=['PERSON', 'ORGANIZATION', 'LOCATION'])
        else:
            ne = []

        #Extract NP
        if self.Chu == True:
            chu = [self.Cu.parse(p) for p in pos]
        else:
            chu = []

        #Create N-grams
        if self.Gram == True:
            ga = self.Ng.Grams(pos, n=6)
        else:
            ga = []

        return self.S.keywords(ne, ga, col, chu, log)
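
A hedged end-to-end sketch of using the class above (the URL and depth are illustrative):

# illustrative only: crawl one level of a site and print the highest-rated keywords first
extractor = runable(llwl='Brown', percen=80)
keywords = extractor.Select('http://example.com', depth=1)
print(keywords)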
Example #11
0
def verifyArgs():
    # (only the final argument check from this example is shown here)
    if sys.argv[7] not in ['0', '1']:
        print(
            'Invalid BYOM value. Enter 1 for True or 0 for False, terminating program...'
        )
        sys.exit()


# Main Method
if __name__ == "__main__":
    verifyArgs()
    # Build and test with BYOM code,
    if sys.argv[7] == '1':
        byom = BYOM(sys.argv[1], float(sys.argv[3]), sys.argv[4], sys.argv[5],
                    bool(int(sys.argv[6])))

        byom.initialize()

        evalBYOM.evalBYOM(byom)

    # Otherwise build and test Ngram model.
    else:

        ngram = NGram(sys.argv[1], int(sys.argv[2]), float(sys.argv[3]),
                      sys.argv[4], sys.argv[5], bool(int(sys.argv[6])))

        ngram.initialize()

        evaluate.evaluate(ngram)