def gen_disassociated_press(file=KJBIBLE, order=3, length=100):
    """Generate some autocorrelated text."""
    tokens = [k for k in tokenize(file) if k.isalpha()]
    model = NgramModel(order, tokens, MLEProbDist)
    ret = [''] * (order - 1)
    for i in range(length):
        tail = ret[-(order - 1):]
        ret.append(model.generate_one(tail))
    return ' '.join(ret[(order - 1):])
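A minimal invocation sketch for the generator above; it assumes KJBIBLE, tokenize, and MLEProbDist are available in the surrounding module as used there, and the output length of 50 tokens is just an illustrative value.

# Hedged usage sketch: relies on KJBIBLE, tokenize and MLEProbDist from the module above.
if __name__ == "__main__":
    # Emit 50 tokens of trigram-generated text from the default corpus.
    print(gen_disassociated_press(order=3, length=50))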
def train_ngram(self, target_data_path, days, kakao_data_path):
    results = []
    ngramModel = NgramModel(self.dataLoader)
    self.dataLoader.ngram_data_loader(days)
    self.dataLoader.kakao_data_loader(kakao_data_path)
    target_datas = self.dataLoader.target_data_loader(target_data_path)
    target_data_len = len(target_datas)
    for i, target_data in enumerate(target_datas):
        # Report progress roughly every 1% of the targets.
        if i % max(1, target_data_len // 100) == 0:
            print("{}% complete".format((i / target_data_len) * 100))
        re = ngramModel.detect_rule_recommend(target_data)
        results.append(target_data + " " + " ".join(re))
    self.dataLoader.write_result(self.write_file_path, results)
def train(train_file):
    """Return the required language models trained from a file."""
    unigram = NgramModel(1)
    bigram_left = NgramModel(2)
    bigram_right = NgramModel(2)
    for line in train_file:
        tokens = line.rstrip().split()
        unigram.update(tokens)
        bigram_left.update(tokens)
        bigram_right.update(reversed(tokens))
    return (unigram, bigram_left, bigram_right)
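For context, a minimal call sketch for train(); the corpus path is a hypothetical placeholder and the NgramModel interface is assumed to be the update-based one used above.

# Hedged usage sketch; "corpus.txt" is a hypothetical path.
with open("corpus.txt") as train_file:
    unigram, bigram_left, bigram_right = train(train_file)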
def main():
    start = time.clock()
    # sys.argv[1] is path to training data
    # sys.argv[2] is length of n-grams
    ngram_model = NgramModel(int(sys.argv[2]), sys.argv[1], pad_right=True)
    end = time.clock()
    print 'Done computing ngram model, ' + str(end - start) + ' seconds running time'
    fileName = sys.argv[2] + 'gramModel' + '_' + sys.argv[1][3:].split('.')[0] + '.p'
    pickle.dump(ngram_model, open(fileName, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    end = time.clock()
    print 'Done pickling, ' + str(end - start) + ' seconds running time'
    '''
    # for unpickling
    fileName = sys.argv[2] + 'gramModel' + '_' + sys.argv[1][3:].split('.')[0] + '.p'
    restored_model = pickle.load(open(fileName, 'rb'))
    #end = time.clock()
    #print 'Done unpickling, ' + str(end - start) + ' seconds running time'
    '''
    print "Generated examples: "
    for i in range(200):
        # print ' '.join(ngram_model.generate(20, ('', '')))
        review = []
        context = ['', '']
        nextToken = ngram_model._generate_one(context)
        while nextToken != '.' and nextToken != '...EOR...' and len(review) < 500:
            review.append(nextToken)
            context[0] = context[1]
            context[1] = nextToken
            nextToken = ngram_model._generate_one(context)
        print ' '.join(review) + ' len: ' + str(len(review))
def initialize_bot(chars, nicks):
    n = 3
    # intros = ["So", "Hi", "In fact", "For what it's worth", "Think about it",
    #           "Conversely", "On the other hand", "Debatably", "Especially", "Not to mention",
    #           "Although", "Moreover", "Equally", "But", "Yes",
    #           "See here", "Ultimately", "Rather", "Nevertheless", "As you said", "Mind you", "Even so"]
    char_corps = load_corpora(chars)
    est = lambda fdist, bins: MLEProbDist(fdist)
    # est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # est = lambda fdist, bins: WittenBellProbDist(fdist)
    # est = lambda fdist, bins: KneserNeyProbDist(fdist)
    models = {character: NgramModel(n, corp, estimator=est)
              for character, corp in char_corps.iteritems()}
    # return ChatBot(chars, nicks, intros, models, ngram=n, debug=False)
    return ChatBot(chars, nicks, models, ngram=n, debug=False)
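A hedged invocation sketch for initialize_bot(); the character names and nicknames are placeholders, and the exact shapes expected by load_corpora and ChatBot (not shown here) may differ.

# Hedged usage sketch; argument values are hypothetical placeholders.
characters = ["alice", "bob"]
nicknames = ["alice_bot", "bob_bot"]
bot = initialize_bot(characters, nicknames)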
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "Usage: %s <corpus-root> <tweets-file>" % (sys.argv[0])
        sys.exit(1)
    corpus_root = sys.argv[1]
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ignored_words = nltk.corpus.stopwords.words('english')
    pos_movie_reviews = PlaintextCorpusReader(corpus_root + "/pos", ".*\.txt")
    neg_movie_reviews = PlaintextCorpusReader(corpus_root + "/neg", ".*\.txt")
    print "Corpora built."
    pos_unigram_lm = NgramModel(1, pos_movie_reviews.words(), estimator)
    print "Positive unigram model complete."
    pos_bigram_lm = NgramModel(2, pos_movie_reviews.words(), estimator)
    print "Positive bigram model complete."
    # pos_trigram_lm = NgramModel(3, pos_movie_reviews.words(), estimator)
    neg_unigram_lm = NgramModel(1, neg_movie_reviews.words(), estimator)
    print "Negative unigram model complete."
    neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator)
    print "Negative bigram model complete."
    # neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator)

    # read in the tweets
    tweets = []
    tokenizer = utils.Tokenizer()
class HMM(object):

    def __init__(self, matrixE):
        self.matrixE = matrixE
        self.M = 20
        self.nngram = None
        self.word_dict_path = '../data/word_by_len'
        self.word_dict = {}
        self.load_word_dict()
        self.ppservers = ("localhost", )
        self.threshold = 0.00000001
        self.job_server = pp.Server(ppservers=self.ppservers)
        print "active nodes: ", self.job_server.get_active_nodes()
        # self.trigram = NgramModel(3, brown.words(), estimator)

    def load_word_dict(self):
        for i in xrange(1, 24):
            with open("%s/%d.txt" % (self.word_dict_path, i), 'r') as fd:
                self.word_dict[i] = [line.strip().lower() for line in fd.readlines()]

    def segment(self, word):
        prob_max = 0.0
        first, second = [], []
        special_char = [',', '.', '\'', ' ', '\n']
        for c in xrange(1, len(word) - 1):
            first_cur = self.most_similar_words(word[:c])
            second_cur = self.most_similar_words(word[(c + 1):])
            for (key1, val1) in first_cur:
                for (key2, val2) in second_cur:
                    prob = val1 * val2 * max(
                        [self.matrixE[charToNum(sc)][charToNum(word[c])] for sc in special_char])
                    if prob > prob_max:
                        prob_max = prob
                        LOG(log_file, '(%s, %d, %f, %s, %s)\n' % (word, c, prob_max, key1, key2))
                        first = first_cur
                        second = second_cur
        return first, second, prob_max

    def viterbi(self, sentence, order):
        tokenizer = RegexpTokenizer(r'[\w\']+')
        self.token_words = tokenizer.tokenize(sentence)
        # self.token_words = word_tokenize(sentence)
        self.nngram = NgramModel(order, [word.lower() for word in brown.words()], estimator)
        self.N = len(self.token_words)
        MAX_OFFSET = 10
        if order == 2:
            viterbiM = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype='double')
            words = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype=object)
            backpointer = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype='int32')
            offset = np.zeros(self.N, dtype='int32')
            return self.viterbi_first_order(viterbiM, words, backpointer, offset)
        if order == 3:
            viterbiM = np.zeros((self.N + MAX_OFFSET, self.M + 2, self.M + 2), dtype='double')
            words = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype=object)
            backpointer = np.zeros((self.N + MAX_OFFSET, self.M + 2, self.M + 2), dtype='int32')
            offset = np.zeros(self.N, dtype='int32')
            return self.viterbi_second_order(viterbiM, words, backpointer, offset)

    def most_similar_words_with_split(self, word):
        states = self.most_similar_words(word)
        # If the probability is too small, it's possible that a space was recognized as a character
        states_tmp = []
        if states[0][1] < self.threshold:
            LOG(log_file, "Beyond bottom threshold, try to split %s\n" % word)
            states1, states2, prob = self.segment(word)
            if prob > states[0][1]:
                states_tmp.append(states1)
                states_tmp.append(states2)
                splited = True
            else:
                states_tmp.append(states)
        else:
            states_tmp.append(states)
        return states_tmp

    def viterbi_first_order(self, viterbiM, words, backpointer, offset):
        print 'call_bigram'
        for i, word in enumerate(self.token_words):
            # Find matched word by probability
            splited = False
            starttime = int(time())
            print 'finding similar words(%s)' % word
            states_tmp = self.most_similar_words_with_split(word)
            print states_tmp
            find_time = int(time()) - starttime
            # initial current offset
            cur_offset = 0
            if i != 0:
                cur_offset = offset[i - 1]
            # recursion step
            print 'iterate states'
            for index, st in enumerate(states_tmp):
                id_with_offset = cur_offset + index + i
                for j, (state, prob) in enumerate(st):
                    # print something out for debugging
                    if j < 10:
                        LOG(log_file, '%s, %f\n' % (state, prob))
                        print state, prob
                    words[i][j + 1] = state
                    if i == 0:
                        pref = [u' ']
                        viterbiM[id_with_offset][j + 1] = self.nngram.prob(state, pref) * prob
                        backpointer[id_with_offset][j + 1] = 0
                    else:
                        l_tmp = max(
                            enumerate([viterbiM[id_with_offset - 1][k + 1]
                                       * self.nngram.prob(state, [str(words[id_with_offset - 1][k + 1])])
                                       * prob
                                       for k in xrange(self.M)]),
                            key=operator.itemgetter(1))
                        backpointer[id_with_offset][j + 1], viterbiM[id_with_offset][j + 1] = l_tmp
            if splited:
                offset[i] = cur_offset + 1
            LOG(log_file,
                "Elapsed %d s matching most possible word (%s), "
                "elapsed %d s for viterbi...\n"
                % (find_time, word, int(time()) - starttime - find_time))
        final_offset = offset[-1]
        print 'final offset is %d' % final_offset
        # termination step
        l = [viterbiM[self.N + final_offset - 1][k + 1]
             * self.endOfSentence(self.nngram, [words[self.N + final_offset - 1][k + 1]])
             for k in xrange(self.M)]
        backpointer[self.N + final_offset - 1][self.M + 1], viterbiM[self.N + final_offset - 1][self.M + 1] = \
            max(enumerate(l), key=operator.itemgetter(1))
        # backtrace
        path = []
        end = backpointer[self.N + final_offset - 1][self.M + 1]
        for i in xrange(self.N + final_offset - 1, 0, -1):
            path.append(end)
            end = backpointer[i][end + 1]
        path.append(end)
        word_vector = []
        for i in xrange(self.N + final_offset - 1, -1, -1):
            word_vector.append(words[self.N + final_offset - 1 - i][path[i] + 1])
        return word_vector

    def viterbi_second_order(self, viterbiM, words, backpointer, offset):
        for i, word in enumerate(self.token_words):
            # Find matched word by probability
            starttime = int(time())
            splited = False
            states_tmp = self.most_similar_words_with_split(word)
            find_time = int(time()) - starttime
            # initial current offset
            cur_offset = 0
            if i != 0:
                cur_offset = offset[i - 1]
            # recursion step
            for index, st in enumerate(states_tmp):
                id_with_offset = cur_offset + index + i
                for j, (state, prob) in enumerate(st):
                    # print something out for debugging
                    if j < 20:
                        LOG(log_file, '%s, %f\n' % (state, prob))
                        print state, prob
                    words[i][j + 1] = state
                    for l in xrange(self.M):
                        if i == 0:
                            pref = [u' ']
                            viterbiM[id_with_offset][l + 1][j + 1] = self.nngram.prob(state, pref) * prob
                            backpointer[id_with_offset][l + 1][j + 1] = 0
                        elif i == 1:
                            backpointer[id_with_offset][l + 1][j + 1], viterbiM[id_with_offset][l + 1][j + 1] = max(
                                enumerate([viterbiM[id_with_offset - 1][k + 1][l + 1]
                                           * self.nngram.prob(state, [' ', str(words[id_with_offset - 1][l + 1])])
                                           * prob
                                           for k in xrange(self.M)]),
                                key=operator.itemgetter(1))
                        else:
                            backpointer[id_with_offset][l + 1][j + 1], viterbiM[id_with_offset][l + 1][j + 1] = max(
                                enumerate([viterbiM[id_with_offset - 1][k + 1][l + 1]
                                           * self.nngram.prob(state, [str(words[id_with_offset - 2][k + 1]),
                                                                      str(words[id_with_offset - 1][l + 1])])
                                           * prob
                                           for k in xrange(self.M)]),
                                key=operator.itemgetter(1))
            if splited:
                offset[i] = cur_offset + 1
            LOG(log_file,
                "Elapsed %d s matching most possible word (%s), "
                "elapsed %d s for viterbi...\n"
                % (find_time, word, int(time()) - starttime - find_time))
        final_offset = offset[-1]
        print 'final offset is %d' % final_offset
        # termination step
        for l in xrange(self.M):
            backpointer[self.N + final_offset - 1][l + 1][self.M + 1], viterbiM[self.N + final_offset - 1][l + 1][self.M + 1] = max(
                enumerate([viterbiM[self.N + final_offset - 1][k + 1][l + 1]
                           * self.endOfSentence(self.nngram, [str(words[self.N + final_offset - 2][k + 1]),
                                                              str(words[self.N + final_offset - 1][l + 1])])
                           for k in xrange(self.M)]),
                key=operator.itemgetter(1))
        backpointer[self.N + final_offset - 1][self.M + 1][self.M + 1], viterbiM[self.N + final_offset - 1][self.M + 1][self.M + 1] = max(
            enumerate([viterbiM[self.N + final_offset - 1][k + 1][self.M + 1]
                       * self.endOfSentence(self.nngram, [str(words[self.N + final_offset - 1][k + 1])])
                       for k in xrange(self.M)]),
            key=operator.itemgetter(1))
        # backtrace
        # import pdb; pdb.set_trace()  # debugging breakpoint
        path = []
        end = backpointer[self.N + final_offset - 1][self.M + 1][self.M + 1]
        path.append(end)
        last_end = backpointer[self.N + final_offset - 1][end][self.M + 1]
        path.append(last_end)
        for i in xrange(self.N + final_offset - 1, 1, -2):
            end = backpointer[i][last_end + 1][end + 1]
            last_end = backpointer[i - 1][end + 1][last_end + 1]
            path.append(end)
            path.append(last_end)
        path.append(end)
        print path
        word_vector = []
        for i in xrange(self.N + final_offset - 1, -1, -1):
            word_vector.append(words[self.N + final_offset - 1 - i][path[i] + 1])
        print word_vector
        return word_vector

    def endOfSentence(self, lm, word):
        prob = 0
        for separator in [',', '.']:
            prob += lm.prob(separator, word)
        return prob

    def most_similar_words(self, word):
        prob_list = {}
        jobs = self.paralize(word)
        prob_list.update(jobs)
        sorted_prob = sorted(prob_list.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_prob[:self.M]

    def paralize(self, word):
        parts = 17
        jobs = []
        for index in xrange(parts):
            for i in xrange(3):
                if i == 1:
                    punish = 1
                else:
                    punish = 0.05
                length = len(word) + i - 1
                if length <= 0:
                    _list = []
                else:
                    _list = split_dict(self.word_dict[length], length, parts - 1, index)
                jobs.append(
                    self.job_server.submit(
                        populate_prob,
                        (_list, self.matrixE, unigram, length, word, {}, substitue, punish,),
                        (substitue, insert, delete, probaWord, weightedPopularity, charToNum,),
                        ("nltk",)))
        self.job_server.wait()
        stats = {}
        for job in jobs:
            stats.update(job())
        # self.job_server.print_stats()
        return stats
class LVGNgramGenerator:
    """ Lemmatized vocabulary and grammar ngram-based generator """

    def __init__(self, tuples, n):
        """
        Parameters
        ----------
        tuples : Iterable[(str, str, str)]
            A list of (word, lemma, tag) tuples from which to learn a model
        n : int
            Maximum size of n-grams to use in NgramModel
        """
        self._n = n
        print("Creating models... (this may take some time)")
        self._make_models(tuples)
        print("Done!")

    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()
        # Extract sequence of words, lemmas, and tags
        words, lemmas, tags = tuple(map(
            lambda tokens: list(self._word_ids.add_words_transform(tokens)),
            zip(*tuples)))
        self._tags = tags
        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        # Can afford to use 2 * n-gram size for grammar
        self._tags_ngram = NgramModel(tags, 2 * self._n)
        # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
        # It's faster to use a list than predicate on unigrams during backoff search
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(zip(zip(tags, lemmas), words))

    def generate_without_pos(self, n):
        """ Generate n words without using any special POS information """
        # Just use the word NgramModel's generate function
        generated_words = self._words_ngram.generate(n)
        return list(self._word_ids.transform_ids(generated_words))

    def generate(self, n):
        """ Generate n words using copied grammar, generated lemmas, and words based on lemmas """
        # Copy a random section of POS tags for grammar
        start = random.randint(n, len(self._tags) - n)
        generated_tags = self._tags[start:start + n]
        # Generate sequence of lemmas based off of grammar
        generated_lemmas = []
        for tag in generated_tags:
            # Search for and choose a lemma with correct tag
            choice = self._lemmas_ngram.choose_word(
                generated_lemmas,
                backoff_limit=2,
                predicate=lambda lemma: lemma in self._tag_lemmas[tag])
            if choice is None:
                # Could not find a good lemma for current POS tag, choose from list
                choice = MLEProbDist(self._tag_lemmas[tag]).generate()
            generated_lemmas.append(choice)
        # Generate sequence of words based off of lemmas and grammar
        generated_words = []
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words,
                backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())
        return list(self._word_ids.transform_ids(generated_words))

    def generate_alternative(self, n):
        """ Generate n words using a more complicated algorithm """
        generated_tags = []
        generated_lemmas = []
        generated_words = []
        # Incrementally generate (tag, lemma) pairs
        for i in range(n):
            tag_choice = None  # Start with nothing
            # Loop through n-grams of grammar
            size = 2 * self._n
            while size > 2:
                tag_choices = self._tags_ngram.backoff_search(
                    generated_tags,
                    backoff_limit=2,
                    predicate=lambda tag: True,
                    start_n=size)
                # Determine valid lemmas in context with these tag choices
                tag_to_lemma = {}
                if tag_choices is not None:
                    for tag, _ in tag_choices.items():
                        # For each tag, find valid lemmas in context with that tag
                        lemma = self._lemmas_ngram.choose_word(
                            generated_lemmas,
                            backoff_limit=2,
                            predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                        if lemma is not None:
                            tag_to_lemma[tag] = lemma
                    if len(tag_to_lemma) > 1:
                        # We have found valid (tag, lemma) pairs
                        tag_probdist = MLEProbDist(FreqDist(
                            {tag: freq for tag, freq in tag_choices.items() if tag in tag_to_lemma}))
                        tag_choice = tag_probdist.generate()  # Randomly select the tag
                        lemma_choice = tag_to_lemma[tag_choice]  # Set the lemma
                        break
                size -= 1  # Lower to smaller n-gram for more tag choices
            if tag_choice is None:
                # We still didn't find a valid (tag, lemma) pair, fallback
                tag_choice = MLEProbDist(tag_choices).generate()
                lemma_choice = MLEProbDist(self._tag_lemmas[tag_choice]).generate()
            generated_tags.append(tag_choice)
            generated_lemmas.append(lemma_choice)
        # Generate all words based on (tag, lemma) pairs
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words,
                backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())
        return list(self._word_ids.transform_ids(generated_words))
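A brief usage sketch of the class above; tagged_tuples stands in for a real (word, lemma, tag) corpus, which must be much longer than the requested output for the random slicing in generate() to work.

# Hedged usage sketch; `tagged_tuples` is a placeholder for a real tagged, lemmatized corpus.
generator = LVGNgramGenerator(tagged_tuples, n=3)
print(" ".join(generator.generate(20)))
print(" ".join(generator.generate_without_pos(20)))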
def test_model(n, inits):
    ngram = NgramModel(n, brown.words(), selector)
    with open("out" + str(n), 'w') as outf:
        for i in inits:
            print(' '.join(ngram.generate_sentence(i)), file=outf)
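A possible call to test_model(); it assumes `selector` (the estimator referenced above) is defined in the module, and the seed contexts in `inits` are hypothetical, shaped however generate_sentence() expects.

# Hedged usage sketch; the seed contexts are hypothetical placeholders.
test_model(3, inits=[("the",), ("in", "the")])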
def main():
    """
    Trains and evaluates neural language models on the
    Microsoft Sentence Completion Challenge dataset.

    Allowed cmd-line flags:
        -s TS_FILES : Uses the reduced trainset (TS_FILES trainset files)
        -o MIN_OCCUR : Only uses terms that occur MIN_OCCUR or more times
            in the trainset. Other terms are replaced with a special token.
        -f MIN_FILES : Only uses terms that occur in MIN_FILES or more files
            in the trainset. Other terms are replaced with a special token.
        -n : n-gram length (default 4)
        -t : Use tree-grams (default does not use tree-grams)
        -u FTRS : Features to use. FTRS must be a string composed of zeros
            and ones, of length 5. Ones indicate usage of following features:
            (word, lemma, google_pos, penn_pos, dependency_type), respectively.

    Neural-net specific cmd-line flags:
        -ep EPOCHS : Number of training epochs, defaults to 20.
        -eps EPS : Learning rate, defaults to 0.005.
        -mnb MNB_SIZE : Size of the minibatch, defaults to 2000.
    """
    logging.basicConfig(level=logging.INFO)
    log.info("Evaluating model")

    # get the data handling parameters
    ts_reduction = util.argv('-s', None, int)
    min_occ = util.argv('-o', 5, int)
    min_files = util.argv('-f', 2, int)
    n = util.argv('-n', 4, int)
    use_tree = '-t' in sys.argv
    bool_format = lambda s: s.lower() in ["1", "true", "yes", "t", "y"]
    ft_format = lambda s: map(bool_format, s)
    ftr_use = np.array(util.argv('-u', ft_format("001000"), ft_format))
    val_per_epoch = util.argv('-v', 10, int)

    # nnets only support one-feature ngrams
    assert ftr_use.sum() == 1

    # get nnet training parameters
    use_lbl = '-l' in sys.argv
    epochs = util.argv('-ep', 20, int)
    eps = util.argv('-eps', 0.002, float)
    mnb_size = util.argv('-mnb', 2000, int)
    n_hid = util.argv('-h', 1000, int)
    d = util.argv('-d', 100, int)

    # load data
    ngrams, q_groups, answers, feature_sizes = data.load_ngrams(
        n, ftr_use, use_tree, subset=ts_reduction,
        min_occ=min_occ, min_files=min_files)
    used_ftr_sizes = feature_sizes[ftr_use]
    # remember, we only use one feature
    vocab_size = used_ftr_sizes[0]
    log.info("Data loaded, %d ngrams", ngrams.shape[0])

    # split data into sets
    x_train, x_valid, x_test = util.dataset_split(ngrams, 0.05, 0.05, rng=456)

    # generate a version of the validation set that has
    # the first term (the conditioned one) randomized
    # w.r.t. unigram distribution
    # so first create the unigram distribution, no smoothing
    unigrams_data = data.load_ngrams(
        1, ftr_use, False, subset=ts_reduction,
        min_occ=min_occ, min_files=min_files)[0]
    unigrams_data = NgramModel(1, False, ftr_use, feature_sizes, ts_reduction,
                               min_occ, min_files, 0.0, 0.0, unigrams_data)
    unigrams_dist = unigrams_data.probability_additive(
        np.arange(vocab_size).reshape(vocab_size, 1))
    unigrams_dist /= unigrams_dist.sum()
    # finally, generate validation sets with randomized term
    x_valid_r = random_ngrams(x_valid, vocab_size, False, unigrams_dist)

    # the directory for this model
    dir = "%s_%s_%d-gram_features-%s_data-subset_%r-min_occ_%r-min_files_%r" \
        % ("llbl" if use_lbl else "lmlp",
           "tree" if use_tree else "linear",
           n, "".join([str(int(b)) for b in ftr_use]),
           ts_reduction, min_occ, min_files)
    dir = os.path.join(_DIR, dir)
    if not os.path.exists(dir):
        os.makedirs(dir)

    # filename base for this model
    file = "nhid-%d_d-%d_train_mnb-%d_epochs-%d_eps-%.5f" % (
        n_hid, d, mnb_size, epochs, eps)

    # store the logs
    if False:
        log_file_handler = logging.FileHandler(os.path.join(dir, file + ".log"))
        log_file_handler.setLevel(logging.INFO)
        logging.root.addHandler(log_file_handler)

    # we will plot log-lik ratios for every _VALIDATE_MNB minibatches
    # we will also plot true mean log-lik
    valid_on = {"x_valid": x_valid[:_LL_SIZE],
                "x_valid_r": x_valid_r[:_LL_SIZE],
                "x_train": x_train[:_LL_SIZE]}
    valid_ll = {k: [] for k in valid_on.keys()}
    valid_p_mean = {k: [] for k in valid_on.keys()}

    # how often we validate
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    _VALIDATE_MNB = mnb_count / val_per_epoch

    def mnb_callback(net, epoch, mnb):
        """ Callback function called after every minibatch. """
        if (mnb + 1) % _VALIDATE_MNB:
            return
        # calculate log likelihood using the exact probability
        probability_f = theano.function([net.input], net.probability)
        for name, valid_set in valid_on.iteritems():
            p = probability_f(valid_set)
            valid_ll[name].append(np.log(p).mean())
            valid_p_mean[name].append(p.mean())
        log.info('Epoch %d, mnb: %d, x_valid mean-log-lik: %.5f'
                 ' , x_valid p-mean: %.5f'
                 ' , ln(p(x_valid) / p(x_valid_r).mean(): %.5f',
                 epoch, mnb, valid_ll["x_valid"][-1],
                 valid_p_mean["x_valid"][-1],
                 valid_ll["x_valid"][-1] - valid_ll["x_valid_r"][-1])

    # track if the model progresses on the sentence completion challenge
    # sent_challenge = []

    def epoch_callback(net, epoch):
        # log some info about the parameters, just so we know
        param_mean_std = [(k, v.mean(), v.std())
                          for k, v in net.params().iteritems()]
        log.info("Epoch %d: %s", epoch, "".join(
            ["\n\t%s: %.5f +- %.5f" % pms for pms in param_mean_std]))
        # evaluate model on the sentence completion challenge
        # probability_f = theano.function([net.input], net.probability)
        # qg_log_lik = [[np.log(probability_f(q)).sum() for q in q_g]
        #               for q_g in q_groups]
        # predictions = map(lambda q_g: np.argmax(q_g), qg_log_lik)
        # sent_challenge.append((np.array(predictions) == answers).mean())
        # log.info('Epoch %d sentence completion eval score: %.4f',
        #          epoch, sent_challenge[-1])

    log.info("Creating model")
    if use_lbl:
        net = LLBL(n, vocab_size, d, 12345)
    else:
        net = LMLP(n, vocab_size, d, 12345)
    net.mnb_callback = mnb_callback
    net.epoch_callback = epoch_callback
    train_cost, valid_cost, _ = net.train(
        x_train, x_valid, mnb_size, epochs, eps)

    # plot training progress info
    # first we need values for the x-axis (minibatch count)
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    mnb_valid_ep = mnb_count / _VALIDATE_MNB
    x_axis_mnb = np.tile((np.arange(mnb_valid_ep) + 1) * _VALIDATE_MNB, epochs)
    x_axis_mnb += np.repeat(np.arange(epochs) * mnb_count, mnb_valid_ep)
    x_axis_mnb = np.hstack(([0], x_axis_mnb))

    plt.figure(figsize=(16, 12))
    plt.subplot(221)
    plt.plot(mnb_count * (np.arange(epochs) + 1), train_cost, 'b-', label='train')
    plt.plot(mnb_count * (np.arange(epochs) + 1), valid_cost, 'g-', label='valid')
    plt.axhline(min(valid_cost), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [min(valid_cost)])
    plt.title('cost')
    plt.grid()
    plt.legend(loc=1)

    plt.subplot(222)
    for name, valid_set in valid_ll.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.ylim((np.log(0.5 / vocab_size),
              max([max(v) for v in valid_ll.values()]) + 0.5))
    plt.axhline(max(valid_ll["x_valid"]), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [max(valid_ll["x_valid"])])
    plt.title('log-likelihood(x)')
    plt.grid()
    plt.legend(loc=4)

    plt.subplot(224)
    for name, valid_set in valid_p_mean.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.title('p(x).mean()')
    plt.grid()
    plt.legend(loc=4)

    # plt.subplot(224)
    # plt.plot(mnb_count * np.arange(epochs + 1), sent_challenge, 'g-')
    # plt.title('sent_challenge')
    # plt.grid()

    plt.savefig(os.path.join(dir, file + ".pdf"))
            test_text.extend(sentences)
        else:
            test_text.append(txt)

# print test_files
print len(test_files)

total_train_files = []
TOTAL = INCREMENT
UPPER_LIMIT = 500
while len(total_train_files) < UPPER_LIMIT:
    total_train_files = train_files[:TOTAL]
    data_set_corpus = PlaintextCorpusReader(sys.argv[1], total_train_files)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, data_set_corpus.words(), estimator)
    # lm = NgramModel(2, data_set_corpus.words(), estimator)
    P = []
    for s in test_text:
        s_tokens = nltk.word_tokenize(s)
        if SENTENCE:
            # if len(s_tokens) > 3:
            if len(s_tokens) > 10:
                p = lm.perplexity(s_tokens)
                P.append(p)
        else:
            p = lm.perplexity(s_tokens)
            P.append(p)
    TOTAL += INCREMENT
        with open(file=path, mode="wb") as fp:
            fp.write(pickle.dumps(obj=params_dict))

    @classmethod
    def load(cls, path):
        """
        Load a saved model.
        :param path: path the model was saved to
        :return:
        """
        params_dict = pickle.load(open(file=path, mode="rb"))
        lookup_table = params_dict['_lookup_table']
        ngram_model = pickle.loads(params_dict['_ngram_model_pickle'], fix_imports=True)
        return cls(ngram_model=ngram_model, lookup_table=lookup_table)


if __name__ == '__main__':
    from nltk.text import Text
    from nltk.corpus import gutenberg

    text1 = Text(gutenberg.words('melville-moby_dick.txt'))
    ngramCounter = NgramCounter(order=2, train=text1)
    ngramModel = NgramModel(ngram_counter=ngramCounter)
    corrector = NgramCorrector(ngram_model=ngramModel)
    print(corrector.correct(['I', 'dooo', 'think', 'you', 'rre', 'goooood']))
    corrector2 = NgramCorrector.load("123")
    print(corrector2.correct(['I', 'don', 'think', 'you', 'rre', 'goooood']))
class AGLearner:
    """A simple artificial language grammar learner."""

    def __init__(self):
        # Input counts
        # Number of times each symbol is seen
        self.counts = defaultdict(int)
        # Number of times symbols co-occur
        self.cooccurs = defaultdict(lambda: defaultdict(int))
        # Whether a symbol is observed before or after another symbol
        self.before = {sym: set() for sym in SYMBOLS}
        self.after = {sym: set() for sym in SYMBOLS}
        # Learning structures
        self.requires = defaultdict(set)
        self.excludes = defaultdict(set)
        self.noprecede = defaultdict(set)
        self.nofollow = defaultdict(set)
        self.mustprecede = {sym: set(SYMBOLS) for sym in SYMBOLS}
        self.mustfollow = {sym: set(SYMBOLS) for sym in SYMBOLS}
        self.ngram = None

    def train(self, train_path):
        """Train based on co-occurrences in the provided data."""
        term_symbols = []
        # Process input
        with open(train_path, "Ur") as train_file:
            for line in train_file:
                line_symbols = line.split()
                # Learn co-occurrences and precedes/follows
                for idx, sym1 in enumerate(line_symbols):
                    # Count the symbol
                    self.counts[sym1] += 1
                    # Get the sets of the other items
                    preceding_symbols = set(line_symbols[:idx])
                    following_symbols = set(line_symbols[idx + 1:])
                    other_symbols = preceding_symbols | following_symbols
                    # Count co-occurrences
                    for sym2 in other_symbols:
                        self.cooccurs[sym1][sym2] += 1
                    # Mark before/after
                    for sym2 in preceding_symbols:
                        self.before[sym1].add(sym2)
                    for sym2 in following_symbols:
                        self.after[sym1].add(sym2)
                    # Remove if one of the always relationships does
                    # not hold up
                    for sym2 in SYMBOLS:
                        if sym2 not in preceding_symbols:
                            try:
                                self.mustprecede[sym1].remove(sym2)
                            except KeyError:
                                pass
                        if sym2 not in following_symbols:
                            try:
                                self.mustfollow[sym1].remove(sym2)
                            except KeyError:
                                pass
                # Add beginning and end terminators for n-grams
                term_symbols.extend([START_SYM] + line_symbols + [END_SYM])
        # Learn
        # Requires/excludes and precedes/follows
        for sym1 in SYMBOLS:
            for sym2 in SYMBOLS:
                # Co-occurrence counts imply requires/excludes
                count1 = self.counts[sym1]
                if self.cooccurs[sym1][sym2] == count1:
                    self.requires[sym1].add(sym2)
                elif self.cooccurs[sym1][sym2] == 0:
                    self.excludes[sym1].add(sym2)
                # Figure out what cannot precede/follow
                if sym2 not in self.before[sym1]:
                    self.noprecede[sym1].add(sym2)
                if sym2 not in self.after[sym1]:
                    self.nofollow[sym1].add(sym2)
        # N-gram model
        self.ngram = NgramModel(2, term_symbols)

    def report(self):
        """Report the rules learned."""
        print "Co-occurrence rules:"
        for sym in SYMBOLS:
            print sym, "requires", ', '.join(sorted(self.requires[sym]))
            print sym, "excludes", ', '.join(sorted(self.excludes[sym]))
        print
        print "Linear precedence rules:"
        for sym in SYMBOLS:
            print sym, "cannot be preceded by", ', '.join(sorted(self.noprecede[sym]))
            print sym, "cannot be followed by", ', '.join(sorted(self.nofollow[sym]))
        print
        print "N-grams:"
        for event, context, prob in self.ngram.allngrams():
            print "{0} -> {1}: {2}".format(' '.join(context), event, prob)

    def test(self, test_path, out_path):
        """Test on a file."""
        test_file = open(test_path, "Ur")
        out_file = open(out_path, "w")
        header = ["Sentence", "Gold response", "Co-occur response", "Co-occur reason",
                  "Linear response", "Linear reason", "N-gram prob."]
        print >> out_file, "\t".join(header)
        for line in test_file:
            sent, gold = line.strip().split(',')
            gold = (gold.strip() == "True")
            line_symbols = sent.split()
            line_symbols_term = [START_SYM] + line_symbols + [END_SYM]
            # Decode violations
            cooccur_ok = True
            cooccur_reasons = set()
            linear_ok = True
            linear_reasons = set()
            for idx, sym1 in enumerate(line_symbols):
                # Get the sets of the other items
                preceding_symbols = set(line_symbols[:idx])
                following_symbols = set(line_symbols[idx + 1:])
                other_symbols = preceding_symbols | following_symbols
                # Check requirements and exclusions for each pair
                # Excluded symbols that are present
                for sym2 in self.excludes[sym1] & other_symbols:
                    cooccur_ok = False
                    cooccur_reasons.add("{0} excludes {1}".format(sym1, sym2))
                # Required symbols that are missing
                for sym2 in self.requires[sym1] & (set(SYMBOLS) - other_symbols):
                    cooccur_ok = False
                    cooccur_reasons.add("{0} requires {1}".format(sym1, sym2))
                # Check that preceding/following symbols are okay
                for prec_sym in preceding_symbols:
                    if prec_sym in self.noprecede[sym1]:
                        linear_ok = False
                        linear_reasons.add("{1} cannot precede {0}".format(sym1, prec_sym))
                for fol_sym in following_symbols:
                    if fol_sym in self.nofollow[sym1]:
                        linear_ok = False
                        linear_reasons.add("{1} cannot follow {0}".format(sym1, fol_sym))
                # Check for missing preceding/following symbols
                for prec_sym in self.mustprecede[sym1] - preceding_symbols:
                    linear_ok = False
                    linear_reasons.add("{1} must precede {0}".format(sym1, prec_sym))
                for fol_sym in self.mustfollow[sym1] - following_symbols:
                    linear_ok = False
                    linear_reasons.add("{1} must follow {0}".format(sym1, fol_sym))
            # N-gram statistics
            prob = self.ngram.seqprob(line_symbols_term)
            # Output
            print >> out_file, "\t".join([" ".join(line_symbols), str(gold),
                                          str(cooccur_ok), ", ".join(cooccur_reasons),
                                          str(linear_ok), ", ".join(linear_reasons),
                                          str(prob)])
        # Clean up
        test_file.close()
        out_file.close()
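A short end-to-end sketch for AGLearner above; the file paths are hypothetical placeholders.

# Hedged usage sketch; paths are hypothetical.
learner = AGLearner()
learner.train("ag_train.txt")
learner.report()
learner.test("ag_test.csv", "ag_results.tsv")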