def __init__(self, wordlist='Brown', NLength=2, prec=50):
    '''
    Initialise the class by populating the frequency data structures.

    @param wordlist: Name of the master corpus to use; default is 'Brown'.
    @note: Possible input - 'Brown', 'AmE06', 'BE06'
    @param NLength: Length of the n-grams counted in fdist2Gram; default 2.
    @param prec: Precision assigned to getcontext().prec (presumably the
        decimal module's context - confirm against the file's imports).
    @raise ValueError: If wordlist is not one of the supported names.
    '''
    # Resolve the corpus reader once; previously the AmE06/BE06 reader was
    # constructed twice (once for words(), once for raw()).
    if wordlist == 'Brown':
        corpus = nltk.corpus.brown
    elif wordlist == 'AmE06':
        corpus = AmE06().getCorpa()
    elif wordlist == 'BE06':
        corpus = BE06().getCorpa()
    else:
        # Fail here rather than leave the instance without fdist2 and
        # fdist2Gram, which would only surface later as an AttributeError.
        raise ValueError(
            "Unknown wordlist %r; expected 'Brown', 'AmE06' or 'BE06'"
            % (wordlist,))

    ng = NGram()
    getcontext().prec = prec

    # Word-level frequencies of the reference corpus.
    self.fdist2 = nltk.FreqDist(corpus.words())
    # N-gram frequencies: NGramUn is given the raw text and only the first
    # element of its result is counted.
    self.fdist2Gram = nltk.FreqDist(
        ng.NGramUn(corpus.raw(), n=NLength)[0])
def retrieve_NGram():
    """Load the stored bigram model that matches the configured corpus.

    When ``configs['corpus']`` is ``'rock'`` the ``*-rock.pkl`` files are
    used; any other value selects the ``*-bach.pkl`` files. The model's
    ``rn2letter``, ``letter2rn`` and ``syms`` attributes are then replaced
    with the three values returned by ``update_syms``.

    Returns:
        The ``NGram`` instance built from the selected file.
    """
    configs = get_configs()

    if configs['corpus'] == 'rock':
        fname, symbol_fname = 'bigram-rock.pkl', 'rn2letter-rock.pkl'
    else:
        fname, symbol_fname = 'bigram-bach.pkl', 'rn2letter-bach.pkl'

    # Single-argument print: same output as the two-item print statement,
    # and valid under both Python 2 and Python 3.
    print('%s %s' % ('...retrieve_NGram fname', fname))

    ngram = NGram(fname=fname)
    refreshed = update_syms(configs, symbol_fname, ngram.syms)
    ngram.rn2letter, ngram.letter2rn, ngram.syms = refreshed

    print('%s %s' % ('retrieve_NGram, # of syms', len(ngram.syms)))
    return ngram
def gerarNGrams(self, n):
    """Build the document's n-gram list with a sliding window of ``n`` words.

    The stored text is split on whitespace and every run of ``n``
    consecutive words is wrapped in an ``NGram`` and appended, in order, to
    a fresh ``DListaEncadeada`` kept on the instance. Any list built by an
    earlier call is discarded.

    When the text has fewer than ``n`` words the list is simply left empty
    (the previous numpy staging array raised ``ValueError`` on the negative
    size in that case).

    Args:
        n: Number of words per n-gram.
    """
    self.__lista = l.DListaEncadeada()
    words = self.__texto.split()
    # range() is empty when len(words) - n + 1 <= 0, so short texts produce
    # no n-grams instead of an error.
    for start in range(len(words) - n + 1):
        self.__lista.anexar(NGram(words[start:start + n], n))
def do_ngram():
    """Sample sequences from the RNN and build one n-gram model per order.

    Samples are drawn from ``LanguageModel(rnn)`` until their combined
    length, counting one extra step per sample for the sequence end,
    reaches ``args.ngram_total_sample_length``. The samples, each model and
    a summary of the creation details are written under
    ``rnn_folder + "/ngram"``.

    Returns:
        A dict mapping every ``n`` in ``args.ngram_ns`` to its ``NGram``.
    """
    print("~~~running ngram extraction~~~")
    print("making samples", end=" ... ")
    sample_start = process_time()

    sampler = LanguageModel(rnn)
    samples = []
    total_len = 0
    while total_len < args.ngram_total_sample_length:
        drawn = sampler.sample(cutoff=args.ngram_max_sample_length)
        samples.append(drawn)
        # Ending the sequence is also a sample, hence the extra 1.
        total_len += len(drawn) + 1

    ngrams_folder = rnn_folder + "/ngram"
    prepare_directory(ngrams_folder)
    sample_time = process_time() - sample_start
    print("done, that took:", clock_str(sample_start))

    print("making the actual ngrams", end=" ... ")
    with open(ngrams_folder + "/samples.txt", "w") as out:
        # Header line: sample count and alphabet size; then one line per
        # sample holding its length followed by its items.
        print(len(samples), len(rnn.internal_alphabet), file=out)
        for drawn in samples:
            print(len(drawn), *drawn, file=out)

    ngrams = {}
    for n in args.ngram_ns:
        ngram_start = process_time()
        model = NGram(n, rnn.input_alphabet, samples)
        model.creation_info = {
            # Shared sampling time plus the time spent building this model.
            "extraction time": sample_time + process_time() - ngram_start,
            "size": len(model._state_probs_dist),
            "n": n,
            "total samples len (including EOS)": total_len,
            "num samples": len(samples),
            "samples cutoff len": args.ngram_max_sample_length
        }
        overwrite_file(model, ngrams_folder + "/" + str(n))
        ngrams[n] = model

    with open(ngrams_folder + "/creation_infos.txt", "w") as out:
        print("ngrams made from", len(samples),
              "samples, of total length", total_len,
              "(including EOSs)", file=out)
        for n, model in ngrams.items():
            print("===", n, "===\n", model.creation_info, "\n\n", file=out)

    print("done, that took overall", clock_str(sample_start))
    return ngrams
def __init__(self, llwl='Brown', llNL=2, percen=80, NE=True, Col=True,
             Gram=True, Chu=True):
    '''
    Store the feature switches and build every pipeline component,
    printing a progress line after each one is created.

    @param llwl: Log-likelihood corpus name ('Brown', 'AmE06', 'BE06'),
        passed to LogLikelihood as wordlist.
    @param llNL: Passed to LogLikelihood as NLength.
    @param percen: Passed to Select as percentil; default 80.
    @param NE: Switch for named entities, stored as self.NEs; default True.
    @param Col: Switch for collocations, stored as self.Col; default True.
    @param Gram: Switch for n-grams, stored as self.Gram; default True.
    @param Chu: Switch for chunking, stored as self.Chu; default True.
    '''
    self.NEs, self.Col, self.Gram, self.Chu = NE, Col, Gram, Chu
    self.p = percen

    # Single-argument print keeps the original output (including the
    # double space) and is valid under both Python 2 and Python 3.
    print('%s %s' % ('Starting to build ', llwl))

    self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
    print('LL Loaded')

    self.POS = POS()
    print('POS Loaded')

    self.GD = GetData()
    print('GD Loaded')

    # The chunker and the collocation finder both share the POS tagger.
    self.Cu = Chunker(self.POS)
    print('Cu Loaded')

    self.FL = Filter()
    print('FL Loaded')

    self.CC = Collocation(self.POS)
    print('CC Loaded')

    self.Ng = NGram()
    print('Ng Loaded')

    self.S = Select(percentil=self.p)
    print('S Loaded')

    # The tokenizer is built around the filter created above.
    self.To = Tokenize(self.FL)
    print('To Loaded')
def gerarNGrams(self, n):
    '''
    Generate the document's n-grams of size n using the sliding-window
    technique.

    Each n-gram is created from an index pair (start, start + n) and a
    reference to this document, rather than from the words themselves.
    The n-grams are appended to a fresh doubly linked list stored on the
    instance.
    '''
    self.__lista = l.DListaEncadeada()
    # Stop at len - n + 1 so that the last window is still a full n-gram.
    for start in range(len(self.__texto) - n + 1):
        # (start, start + n) is the index pair identifying the n-gram;
        # self is the document that generated it.
        self.__lista.anexar(NGram((start, start + n), self, n))
def main():
    """Train the n-gram models, dump them, and label every test sentence.

    One ``NGram`` is trained for each (size, language) pair, iterating
    sizes from ``Constants.SIZE_OF_GRAMS`` in the outer loop and languages
    in the inner loop. Each test sentence is reduced to its lower-cased
    alphabetic characters before prediction, and the result is written to
    ``out<idx>.txt`` under ``Constants.OUTPUT_PATH``.
    """
    training_set = TrainingSetHandler()
    training_set.load_training_set()

    gram_list = []
    for size in Constants.SIZE_OF_GRAMS:
        for language in training_set.language_list:
            model = NGram(size, string.ascii_lowercase, 0.5)
            model.train(training_set.training_set[language], language)
            gram_list.append(model)

    # Dump copies of the grams to file.
    dump_grams(gram_list)

    predictor = Predictor(gram_list)

    test_set_handler = TestSetHandler()
    test_set_handler.load_test_sentence()

    for idx, sentence in enumerate(test_set_handler.test_set):
        # Keep letters only (this also drops spaces), then lower-case.
        letters_only = "".join(c for c in sentence[1] if c.isalpha()).lower()
        prediction = predictor.predict_this_sentence(letters_only)

        out_path = os.path.join(Constants.OUTPUT_PATH,
                                "out{}.txt".format(idx))
        with open(out_path, 'w') as f:
            OutputHelper(prediction, sentence, f).print_and_save_output()
def run_model(trainingdata, N, strings, smoothing=False):
    '''
    Simple helper that runs one model configuration and prints its report.

    Builds NGram(N, trainingdata, smoothing), prints the formatted model,
    then the log-probability of every sentence in strings. When smoothing
    is truthy it also prints the sentences from model.generate_sentence(10)
    together with their log-probabilities.
    '''
    model = NGram(N, trainingdata, smoothing)

    # The padding is applied to the format string itself, before the number
    # is substituted; hoisted here because both sections share it.
    score_format = "Log(p) = %08.4f".ljust(20)

    print(model.get_model_formatted())
    sys.stdout.write(os.linesep)

    print("Sentence Probability Evaluation")
    for sentence in strings:
        score = score_format % model.sentence_probability(sentence)
        print(score + "Sentence: <s> %s </s>" % sentence)
    sys.stdout.write(os.linesep)

    if not smoothing:
        return

    print("Generated Sentences")
    for sentence in model.generate_sentence(10):
        score = score_format % model.sentence_probability(sentence)
        print(score + "Sentence: %s" % sentence)
    sys.stdout.write(os.linesep)
def get_models(data=None, configs=None, save=False): if configs is None: configs = get_configs() if data is None: data = get_data(configs) # TODO: remove hack if configs["bigram"]: reduced_keys = [configs["reduced_key"]] data.keys = reduced_keys test_data = data.get_test_data() test_data.keys = reduced_keys retrieve_model = configs["retrieve_model"] model = SkipGramNN(data, configs) print "SkipGramNN, # of syms", len(model.syms) if not retrieve_model: model_loss = model.train() if save:"skipgram-%s.pkl" % (configs["corpus"])) plt.clf() plt.plot(model.loss_curve) plt.savefig("losses-%s.png" % print "=== train loss ===" print "loss: %.2f" % model_loss loss = model.check_loss() if not configs["regularize"]: assert np.allclose(loss, model_loss) if save: model_weights = model.weights.value fname = "w1-%s.pkl" % print fname with open(fname, "wb") as p: pickle.dump(model.W1.value, p) pickle.dump(data.syms, p) pickle.dump(model_loss, p) pickle.dump(model_weights, p) pickle.dump(configs, p) fname = "skipgram-bach.pkl" else: fname = os.path.join("data", "test_skipgram_model.pkl") print fname assert fname is not None, "Error: no model to retrieve in the time being" with open(fname, "rb") as p: w1 = pickle.load(p) syms = pickle.load(p) model_loss = pickle.load(p) model_weights = pickle.load(p) configs_reloaded = pickle.load(p) for key in configs.keys(): if key not in configs_reloaded.keys(): print "no key", key for key in configs.keys(): if key in configs_reloaded.keys(): if configs[key] != configs_reloaded[key]: print configs[key], configs_reloaded[key] # assert configs == configs_reloaded model.init_weights(model_weights, model_loss) train_seq_data = data.get_train_seqs_data() train_seqs = [seq for seq in train_seq_data.seqs] syms = data.syms # ngram_model = NGram(train_seqs, syms, 2, configs) ngram_model = NGram(data.seqs, syms, 2, configs) print "\n\nNgram, # of syms", len(ngram_model.syms) if save:"bigram-%s.pkl" % (configs["corpus"])) print len(ngram_model.syms), len( 
assert ngram_model.syms == return model, ngram_model
class runable(object):
    '''
    Selects and extracts keywords from online content.
    '''

    def __init__(self, llwl='Brown', llNL=2, percen=80, NE=True, Col=True,
                 Gram=True, Chu=True):
        '''
        Store the feature switches and build every pipeline component,
        printing a progress line after each one is created.

        @param llwl: Log-likelihood corpus name ('Brown', 'AmE06', 'BE06'),
            passed to LogLikelihood as wordlist.
        @param llNL: Passed to LogLikelihood as NLength.
        @param percen: Passed to Select as percentil; default 80.
        @param NE: Enables named-entity extraction; default True.
        @param Col: Enables collocations; default True.
        @param Gram: Enables n-grams; default True.
        @param Chu: Enables chunking; default True.
        '''
        self.NEs, self.Col, self.Gram, self.Chu = NE, Col, Gram, Chu
        self.p = percen

        # Single-argument print keeps the original output (including the
        # double space) and is valid under both Python 2 and Python 3.
        print('%s %s' % ('Starting to build ', llwl))

        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print('LL Loaded')

        self.POS = POS()
        print('POS Loaded')

        self.GD = GetData()
        print('GD Loaded')

        # The chunker and the collocation finder both share the POS tagger.
        self.Cu = Chunker(self.POS)
        print('Cu Loaded')

        self.FL = Filter()
        print('FL Loaded')

        self.CC = Collocation(self.POS)
        print('CC Loaded')

        self.Ng = NGram()
        print('Ng Loaded')

        # Inside a method, Select resolves to the module-level name, not to
        # the Select method defined below.
        self.S = Select(percentil=self.p)
        print('S Loaded')

        # The tokenizer is built around the filter created above.
        self.To = Tokenize(self.FL)
        print('To Loaded')

    def Select(self, url, depth):
        '''
        Determine the best keywords for a web page.

        @param url: The base URL to start sampling from.
        @param depth: The depth of the website to be sampled.
        @return: The result of self.S.keywords for the extracted features;
            described by the original author as the selected keywords
            ordered from the highest rated downwards.
        '''
        # Fetch the page text, then tokenize it into sentences and words.
        text = self.GD.getWebPage(url, depth)
        tok = self.To.Tok(text)

        # POS-tag the tokens and compute the log-likelihood scores.
        pos = self.POS.POSTag(tok, 'tok')
        log = self.LL.calcualte(tok)

        # Each optional stage runs only when its switch compares equal to
        # True; a disabled stage contributes an empty list.
        col = self.CC.col(pos, tok) if self.Col == True else []

        ne = (self.Cu.Chunks(pos,
                             nodes=['PERSON', 'ORGANIZATION', 'LOCATION'])
              if self.NEs == True else [])

        # Noun-phrase extraction: one parse per tagged item.
        chu = [self.Cu.parse(p) for p in pos] if self.Chu == True else []

        ga = self.Ng.Grams(pos, n=6) if self.Gram == True else []

        return self.S.keywords(ne, ga, col, chu, log)
sys.exit() if sys.argv[7] not in ['0', '1']: print( 'Invalid BYOM value. Enter 1 for True or 0 for False, terminating program...' ) sys.exit() # Main Method if __name__ == "__main__": verifyArgs() # Build and test with BYOM code, if sys.argv[7] == '1': byom = BYOM(sys.argv[1], float(sys.argv[3]), sys.argv[4], sys.argv[5], bool(int(sys.argv[6]))) byom.initialize() evalBYOM.evalBYOM(byom) # Otherwise build and test Ngram model. else: ngram = NGram(sys.argv[1], int(sys.argv[2]), float(sys.argv[3]), sys.argv[4], sys.argv[5], bool(int(sys.argv[6]))) ngram.initialize() evaluate.evaluate(ngram)