def generate_alternative(self, n):
    """
    Generate n words by choosing (tag, lemma) pairs first, then words.

    Phase one builds ``n`` (tag, lemma) pairs. For each position the tag
    n-gram model is queried with ``start_n`` going from ``2 * self._n``
    down to 3; the first order for which more than one tag has a lemma
    accepted by the lemma model is used, and a tag is drawn from the
    frequencies of those tags. Phase two picks one word per pair from the
    word n-gram model, restricted to ``self._tag_lemma_words[(tag, lemma)]``.

    Returns ``self._word_ids.transform_ids(...)`` of the chosen words,
    materialised as a list.
    """
    tags_so_far = []
    lemmas_so_far = []
    words_so_far = []

    # Phase 1: one (tag, lemma) pair per output position.
    for _ in range(n):
        tag_choice = None  # Nothing selected yet for this position

        # Try progressively smaller n-gram orders until a pair is found.
        for order in range(2 * self._n, 2, -1):
            tag_freqs = self._tags_ngram.backoff_search(
                tags_so_far,
                backoff_limit=2,
                predicate=lambda tag: True,
                start_n=order)

            # Keep only the tags for which the lemma model offers a lemma
            # that is listed under that tag.
            lemma_for_tag = {}
            candidates = tag_freqs.items() if tag_freqs is not None else ()
            for tag, _freq in candidates:
                lemma = self._lemmas_ngram.choose_word(
                    lemmas_so_far,
                    backoff_limit=2,
                    predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                if lemma is not None:
                    lemma_for_tag[tag] = lemma

            if len(lemma_for_tag) <= 1:
                continue  # Not enough pairs; retry with a smaller order

            # Draw the tag from the frequencies of the usable tags only.
            usable_freqs = {
                t: f for t, f in tag_freqs.items() if t in lemma_for_tag
            }
            tag_choice = MLEProbDist(FreqDist(usable_freqs)).generate()
            lemma_choice = lemma_for_tag[tag_choice]
            break

        if tag_choice is None:
            # No order produced a usable pair: draw the tag from the last
            # search result and the lemma from that tag's lemma table.
            # NOTE(review): this relies on the last backoff_search result
            # being set and not None -- confirm against backoff_search.
            tag_choice = MLEProbDist(tag_freqs).generate()
            lemma_choice = MLEProbDist(
                self._tag_lemmas[tag_choice]).generate()

        tags_so_far.append(tag_choice)
        lemmas_so_far.append(lemma_choice)

    # Phase 2: one word per (tag, lemma) pair.
    for tag, lemma in zip(tags_so_far, lemmas_so_far):
        word_freqs = self._words_ngram.backoff_search(
            words_so_far,
            backoff_limit=2,
            predicate=lambda word: word in self._tag_lemma_words[
                (tag, lemma)])
        if word_freqs is None:
            # Nothing suitable in context; draw from the pair's own words.
            word_freqs = self._tag_lemma_words[(tag, lemma)]
        words_so_far.append(MLEProbDist(word_freqs).generate())

    return list(self._word_ids.transform_ids(words_so_far))
def add_individual(number_individuals, res_address, diagnosis):
    """
    Build a DataFrame of newly generated individuals.

    ``number_individuals`` rows are sampled from ``res_address``. Each
    sampled row becomes one individual with a diagnosis drawn from the
    empirical distribution of ``diagnosis`` and a gender/age pair obtained
    from ``get_gender_age``.

    Parameters:
        number_individuals: number of rows to sample from ``res_address``.
        res_address: object providing ``sample(n).to_dict('records')``;
            records must contain the keys ADDR_FULL, CTYNAME, ZIP5, GEOID,
            LON and LAT.
        diagnosis: samples passed to ``FreqDist`` to form the diagnosis
            distribution.

    Returns:
        ``pd.DataFrame.from_records`` over one dict per individual.

    Note: ``Date_Inf`` is filled from the module-level name ``current_date``.
    """
    sampled_records = res_address.sample(number_individuals).to_dict('records')

    # The diagnosis distribution does not depend on the individual, so it is
    # built once instead of once per loop iteration.
    diagnosis_prob_dist = MLEProbDist(FreqDist(diagnosis))

    total_individuals = []
    for record in sampled_records:
        # Draw order (diagnosis first, then gender/age) is kept so that a
        # seeded random generator yields the same sequence as before.
        diagnosis_random = diagnosis_prob_dist.generate()
        full_address = (record['ADDR_FULL'] + '|' + record['CTYNAME']
                        + '|' + record['ZIP5'])
        gender, age = get_gender_age(record)
        total_individuals.append({
            'Date_Inf': current_date,
            'Gender': gender,
            'Age': age,
            'Census_Tract': record['GEOID'],
            'Address': full_address,
            'LON': record['LON'],
            'LAT': record['LAT'],
            'Diagnosis': diagnosis_random,
        })
    return pd.DataFrame.from_records(total_individuals)
def get_gender_age(full_address):
    """
    Draw a (gender, age-range) pair for the census tract of an address.

    The row of the module-level ``KC_age_gender`` table indexed by
    ``full_address['GEOID']`` is restricted to the columns 'M0-4' through
    'F85-120'. One column label is drawn with probability proportional to
    the value stored in that column. The first character of the label is
    returned as the gender and the remainder as the age range.

    Parameters:
        full_address: mapping with a 'GEOID' key.

    Returns:
        ``(gender, age)`` as strings, or ``(np.nan, np.nan)`` when the
        lookup or the draw fails (for example an unknown GEOID).
    """
    GEOID = full_address['GEOID']
    try:
        age_gender_dist = KC_age_gender.loc[[GEOID]].loc[:, 'M0-4':'F85-120']
        # Iterating a DataFrame yields its column labels, so passing the
        # frame itself to FreqDist would count every label once and ignore
        # the stored counts. Hand FreqDist an explicit {label: count} dict.
        # NOTE(review): uses the first row if GEOID matches several rows.
        label_counts = age_gender_dist.iloc[0].to_dict()
        age_gender_random = MLEProbDist(FreqDist(label_counts)).generate()
        gender = age_gender_random[0]
        age = age_gender_random[1:]
        return gender, age
    except Exception:
        # Best-effort lookup: any ordinary failure yields missing values,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        return np.nan, np.nan
def gen_sent(ngram):
    """
    Print randomly generated sentences from an n-gram frequency table.

    The module-level list ``lis`` supplies the n-gram order (``lis[1]``) and
    the number of sentences to print (``lis[2]``).

    Parameters:
        ngram: mapping from a context tuple of ``n - 1`` tokens to the
            samples/frequencies of the tokens that follow it; the value is
            passed to ``FreqDist``.

    Each sentence starts from a context of ``n - 1`` '<start>' tokens and
    grows one token at a time until ".", "?" or "!" is drawn. The process
    exits through ``sys.exit`` when the current context is not in ``ngram``.
    """
    n = lis[1]          # n-gram order
    sent_num = lis[2]   # number of sentences to produce

    sentence_enders = (".", "?", "!")
    # Tokens appended without a leading space: contraction tails, the comma
    # and the apostrophe.
    glued_tokens = ("m", "s", "re", ",", "’", "ve", "t")

    for i in range(sent_num):
        # Sliding window holding the n-1 most recent tokens.
        window = ['<start>'] * (n - 1)
        sent = ""
        while True:
            tup_win = tuple(window)
            if tup_win not in ngram:
                sys.exit("We don't have a start line")
            # FreqDist + MLEProbDist turn the frequencies into probabilities
            # (item freq / sum of frequencies); generate() draws one token.
            next_w = MLEProbDist(FreqDist(ngram[tup_win])).generate()

            if next_w in sentence_enders:
                sent += next_w
                break

            # The window is empty for a unigram model (n == 1); guard the
            # lookup so it cannot raise IndexError.
            at_line_start = bool(window) and window[-1] == '<start>'
            if next_w in glued_tokens or at_line_start:
                sent += next_w
            else:
                sent += " %s" % next_w

            # Slide the window forward; nothing to slide when it is empty.
            if window:
                window.pop(0)
                window.append(next_w)
        print("\nSentence %s:\n%s" % (i + 1, sent))
def sentence_generator(gramFreq, numofsentences):
    """
    Print ``numofsentences`` randomly generated sentences.

    The n-gram table is read from the module-level ``ngrams_frequency``
    (tuple of tokens -> frequency) and the n-gram order from the module-level
    ``ngrams``. Each sentence starts from ``int(ngrams) - 1`` '<start>'
    tokens and grows until ".", "?" or "!" is drawn.

    Parameters:
        gramFreq: not used by this function.
            NOTE(review): the table is taken from the global
            ``ngrams_frequency`` instead -- confirm whether callers expect
            this argument to be honoured.
        numofsentences: number of sentences to print.
    """
    # Index the table once as context -> {next token: frequency}; the
    # previous code rescanned the whole table for every generated token.
    followers = {}
    for gram, freq in ngrams_frequency.items():
        followers.setdefault(gram[:-1], {})[gram[-1]] = freq

    context_size = int(ngrams) - 1

    for i in range(numofsentences):
        context = ('<start>',) * context_size
        generateSentence = ""
        while True:
            token_dict = followers.get(context, {})
            # Frequencies -> probabilities -> one random next word.
            next_word = MLEProbDist(FreqDist(token_dict)).generate()

            # Sentence-ending punctuation closes the sentence.
            if next_word in (".", "?", "!"):
                generateSentence += next_word
                break

            # Comma and apostrophe attach without a leading space.
            if next_word in (",", "’"):
                generateSentence += next_word
            else:
                generateSentence += " %s"%next_word

            # Slide the context forward (empty for a unigram model).
            if context:
                context = context[1:] + (next_word,)
        # Display sentences
        print ("\nSentence %s: %s"%(i+1,generateSentence))
def gen_sentence(ngram):
    """
    Print randomly generated sentences from an n-gram frequency table.

    The module-level list ``arg`` supplies the n-gram order (``arg[1]``) and
    the number of sentences to generate (``arg[2]``).

    Parameters:
        ngram: mapping from a context tuple of ``n - 1`` tokens to the
            samples/frequencies of the tokens that follow it; the value is
            passed to ``FreqDist``.

    Each sentence starts from ``n - 1`` '<START>' tokens and grows until
    ".", "?" or "!" is drawn. The process exits through ``sys.exit`` when
    the current context is not in ``ngram``.
    """
    n = arg[1]  # n in n-grams
    m = arg[2]  # number of sentences to generate

    for i in range(m):
        # The n-1 most recent tokens, used as the lookup context.
        table = ['<START>'] * (n - 1)
        sentence = ""
        while True:
            tuple_table = tuple(table)
            if tuple_table not in ngram:
                # when start is not available
                sys.exit("No start line!")
            # Frequencies -> probabilities -> one random next word.
            pred_word = MLEProbDist(FreqDist(ngram[tuple_table])).generate()

            # Sentence-ending punctuation closes the sentence.
            if pred_word in (".", "?", "!"):
                sentence += pred_word
                break

            # The context is empty for a unigram model (n == 1); guard the
            # lookup so it cannot raise IndexError.
            at_start = bool(table) and table[-1] == '<START>'
            # Comma, apostrophe and the first word get no leading space.
            if pred_word in (",", "’") or at_start:
                sentence += pred_word
            else:
                sentence += " %s" % pred_word

            # Slide the context forward; nothing to slide when it is empty.
            if table:
                table.pop(0)
                table.append(pred_word)
        # Display sentences
        print("\nSentence %s:\n%s" % (i + 1, sentence))
def generate_alternative(self, n):
    """
    Generate n words by choosing (tag, lemma) pairs first, then words.

    Phase one builds ``n`` (tag, lemma) pairs. For each position the tag
    n-gram model is queried with ``start_n`` going from ``2 * self._n``
    down to 3; the first order for which more than one tag has a lemma
    accepted by the lemma model is used, and a tag is drawn from the
    frequencies of those tags. Phase two picks one word per pair from the
    word n-gram model, restricted to ``self._tag_lemma_words[(tag, lemma)]``.

    Returns ``self._word_ids.transform_ids(...)`` of the chosen words,
    materialised as a list.
    """
    tags_so_far = []
    lemmas_so_far = []
    words_so_far = []

    # Phase 1: one (tag, lemma) pair per output position.
    for _ in range(n):
        tag_choice = None  # Nothing selected yet for this position

        # Try progressively smaller n-gram orders until a pair is found.
        for order in range(2 * self._n, 2, -1):
            tag_freqs = self._tags_ngram.backoff_search(
                tags_so_far,
                backoff_limit=2,
                predicate=lambda tag: True,
                start_n=order)

            # Keep only the tags for which the lemma model offers a lemma
            # that is listed under that tag.
            lemma_for_tag = {}
            candidates = tag_freqs.items() if tag_freqs is not None else ()
            for tag, _freq in candidates:
                lemma = self._lemmas_ngram.choose_word(
                    lemmas_so_far,
                    backoff_limit=2,
                    predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                if lemma is not None:
                    lemma_for_tag[tag] = lemma

            if len(lemma_for_tag) <= 1:
                continue  # Not enough pairs; retry with a smaller order

            # Draw the tag from the frequencies of the usable tags only.
            usable_freqs = {
                t: f for t, f in tag_freqs.items() if t in lemma_for_tag
            }
            tag_choice = MLEProbDist(FreqDist(usable_freqs)).generate()
            lemma_choice = lemma_for_tag[tag_choice]
            break

        if tag_choice is None:
            # No order produced a usable pair: draw the tag from the last
            # search result and the lemma from that tag's lemma table.
            # NOTE(review): this relies on the last backoff_search result
            # being set and not None -- confirm against backoff_search.
            tag_choice = MLEProbDist(tag_freqs).generate()
            lemma_choice = MLEProbDist(
                self._tag_lemmas[tag_choice]).generate()

        tags_so_far.append(tag_choice)
        lemmas_so_far.append(lemma_choice)

    # Phase 2: one word per (tag, lemma) pair.
    for tag, lemma in zip(tags_so_far, lemmas_so_far):
        word_freqs = self._words_ngram.backoff_search(
            words_so_far,
            backoff_limit=2,
            predicate=lambda word: word in self._tag_lemma_words[
                (tag, lemma)])
        if word_freqs is None:
            # Nothing suitable in context; draw from the pair's own words.
            word_freqs = self._tag_lemma_words[(tag, lemma)]
        words_so_far.append(MLEProbDist(word_freqs).generate())

    return list(self._word_ids.transform_ids(words_so_far))
import pickle

import pandas as pd
from nltk.probability import FreqDist, MLEProbDist

# Scratch script: load the age/gender table, show the row for one census
# tract and draw one age/gender column label from it.
KC_age_gender = pd.read_pickle('input/KC_CT_age_gender.pickle')
# print(KC_age_gender)

# Columns 'M0-5' through 'F85-120' of the row indexed by this tract id.
age_gender_dist = KC_age_gender.loc[['53033032800']].loc[:, 'M0-5':'F85-120']
# Single-argument print(...) works as a statement on Python 2 and as a
# function call on Python 3.
print(age_gender_dist)

# Iterating a DataFrame yields its column labels, so FreqDist(DataFrame)
# would count each label once and ignore the stored counts. Pass an explicit
# {label: count} dict built from the selected row instead.
freq_dist = FreqDist(age_gender_dist.iloc[0].to_dict())
prob_dist = MLEProbDist(freq_dist)
print(prob_dist.generate())