class MarkovOnTopic(object):
    '''
    Generate text from a Markov chain that is loaded from a database file.

    TODO: ngrams bumped to four or five-grams
    TODO: first ngram is the technical noun topic of the tweet
    TODO: stem the topics to narrow them. or even better find a way to
          standardize synonyms to the same word and then stem
    TODO: db key has no hashmarks but the values do
    TODO: db key in lowercase, but values are original
    '''

    def __init__(self, db_path='markov.db'):
        """Load the chain stored at ``db_path``; seed a new one if that fails.

        Any ordinary exception raised while opening the database triggers the
        fallback to ``seed_db``. ``KeyboardInterrupt`` and ``SystemExit`` are
        deliberately not swallowed.
        """
        try:
            self.mc = MarkovChain(db_path, verbose=False)
        except Exception:
            print('No database found at path. Creating new database.')
            self.mc = seed_db(db_path)

    def generate_db(self, docs, filename=None):
        """Remember ``docs`` on the instance. ``filename`` is currently unused."""
        self.docs = docs

    def generate_topics(self):
        """Placeholder; not implemented yet."""
        pass

    def generate_string(self, seed=None):
        """Print one generated string that is not rejected by ``drop``.

        Strings are generated repeatedly (from ``seed`` when it is truthy)
        until ``drop`` returns a falsy value for one of them.
        """
        while True:
            if seed:
                gen_text = self.mc.generateStringWithSeed(seed)
            else:
                gen_text = self.mc.generateString()
            if not drop(gen_text):
                # Parenthesised single-argument form prints identically on
                # Python 2 and is valid syntax on Python 3.
                print(gen_text)
                return
def calc_markov(self, file):
    """Generate text from the word transition data of ``file``.

    Returns ``BAD_FILE_ERROR`` when the file does not exist or when the
    generated text exceeds ``self.sentence_count * 25`` spaces before the
    requested number of sentences has been reached. Otherwise returns the
    generated text.
    """
    try:
        with open(file, encoding="utf-8") as source:
            transitions = MarkovChain.get_words_propabilities(source.read())
    except FileNotFoundError:
        return BAD_FILE_ERROR

    first_word = random.choice(list(transitions.keys()))
    chain = MarkovChain(MarkovState(first_word))
    for origin, successors in transitions.items():
        origin_state = MarkovState(origin)
        for target, weight in successors.items():
            target_state = MarkovState(target)
            chain.add_probability(origin_state, target_state, weight)

    max_spaces = self.sentence_count * 25
    text = first_word.capitalize()
    for word in chain:
        if self.get_sentence_count(text) >= self.sentence_count:
            break
        if text.count(" ") > max_spaces:
            return BAD_FILE_ERROR
        # A word that follows a full stop starts a new sentence.
        next_word = word.capitalize() if text.endswith(".") else word
        text += f" {next_word}"
    return text
def main():
    """
    Run an interactive text-generation loop on a pre-built Markov chain.

    Each line read from standard input is passed through
    `handle_input_text`; when the result is truthy it is used as the start
    of the generated text, otherwise the text is generated without a start.
    (Per the original notes: inputs shorter than the chain's `window` lead
    to random text, longer inputs contribute their last 3 words.)

    Enter `48598ee283437e810f2f0eb1cf66e217` to stop the loop.
    """
    stop_token = "48598ee283437e810f2f0eb1cf66e217"
    # 3 - how many windows in the chain.
    window = 3

    chain = MarkovChain()
    # The path is relative to the directory the script is launched from.
    chain.chain = extensions.file.json.read(
        "./src/markov-chain/generated-chains/ru/my-favorites-3-window.json")

    # iter(callable, sentinel) keeps calling input() until the stop token.
    for raw_text in iter(input, stop_token):
        start_text = handle_input_text(raw_text, window)
        if start_text:
            generated = chain.generate(start=start_text)
        else:
            generated = chain.generate()
        print(generated)
def test_discrete_outdistr(self):
    """Check forward/backward results for an HMM with discrete outputs.

    Compares the scaled forward variables, the scaled backward variables
    and their product (scaled by ``c``) with reference values to four
    decimals.
    """
    initial_prob = np.array([1, 0])
    transition = np.array([[0.9, 0.1, 0],
                           [0, 0.9, 0.1]])
    mc = MarkovChain(initial_prob, transition)
    out_distrs = [
        DiscreteDistr(np.array([0.6, 0.3, 0.1])),
        DiscreteDistr(np.array([0.1, 0.3, 0.6])),
    ]
    hmm1 = HMM(mc, out_distrs)
    n_states = hmm1.n_states

    observations = np.array([1, 3, 2])
    n_obs = len(observations)
    obs_prob, _ = out_distrs[0].prob(observations, out_distrs)

    # Forward pass.
    alpha_hat, c = mc.forward(obs_prob)
    np.testing.assert_array_almost_equal(
        alpha_hat,
        np.array([[1.0000, 0.6000, 0.5625],
                  [0, 0.4000, 0.4375]]),
        decimal=4)

    # Backward pass.
    beta_hat = mc.backward(obs_prob, c)
    np.testing.assert_array_almost_equal(
        beta_hat,
        np.array([[1.6667, 1.5873, 0],
                  [12.8571, 14.2857, 7.9365]]),
        decimal=4)

    # to check
    scale = np.tile(c[0:n_obs], (n_states, 1))
    gamma = np.multiply(np.multiply(alpha_hat, beta_hat), scale)
    np.testing.assert_array_almost_equal(
        gamma,
        np.array([[1.0000, 0.1429, 0],
                  [0, 0.8571, 1.0000]]),
        decimal=4)
def test_gauss_outdistr(self):
    """Check forward/backward results for an HMM with Gaussian outputs.

    Compares the observation probabilities, forward variables, scale
    factors, backward variables and the value returned by ``logprob`` with
    reference values to four decimals.
    """
    initial_prob = np.array([1, 0])
    transition = np.array([[0.9, 0.1, 0],
                           [0, 0.9, 0.1]])
    mc = MarkovChain(initial_prob, transition)
    out_distrs = [
        GaussDistr(mean=np.array([0]), std=np.array([1])),
        GaussDistr(mean=np.array([3]), std=np.array([2])),
    ]
    h = HMM(mc, out_distrs)
    n_states = h.n_states

    x = np.array([-0.2, 2.6, 1.3])[:, np.newaxis]
    T = x.shape[0]

    # Compute everything first, then compare against the references.
    pX, logS = out_distrs[0].prob(x, out_distrs)
    alpha_hat, c = mc.forward(pX)
    beta_hat = mc.backward(pX, c)
    logP_hmm = logprob(h, x)

    expectations = (
        (pX, np.array([[1.0000, 0.0695, 1.0000],
                       [0.1418, 1.0000, 0.8111]])),
        (alpha_hat, np.array([[1.0000, 0.3847, 0.4189],
                              [0, 0.6153, 0.5811]])),
        (c, np.array([1.0000, 0.1625, 0.8266, 0.0581])),
        (beta_hat, np.array([[1.0000, 1.0389, 0],
                             [8.4154, 9.3504, 2.0818]])),
        (logP_hmm, np.array([-9.1877])),
    )
    for actual, expected in expectations:
        np.testing.assert_array_almost_equal(actual, expected, decimal=4)
def __init__(self, goal):
    """Set up grammar engines, the identity Markov chain and the questions.

    The first 80% of ``dracula.txt`` is written to ``train_dracula.txt``,
    which is then used to build ``self.identity_chain``. The question list
    is read from the "Friend" file when ``goal == "Friend"`` and from the
    "User" file for any other value.

    Every file is opened in a ``with`` block so that it is closed. In
    particular the training file is flushed to disk before ``MarkovChain``
    is given its path.
    """
    self.engine = GrammarEngine('./dialogueSystem/grammar/generator.txt')
    self.p_engine = GrammarEngine(
        './dialogueSystem/grammar/polarity_response.txt')

    with open('./dialogueSystem/dracula.txt', encoding="utf8") as f_dracula:
        whole_dracula = f_dracula.read()
    train_dracula = whole_dracula[:int(len(whole_dracula) * 0.8)]

    with open('./dialogueSystem/train_dracula.txt', "w+",
              encoding="utf8") as train_dracula_file:
        train_dracula_file.write(train_dracula)
    self.identity_chain = MarkovChain(
        './dialogueSystem/train_dracula.txt', "word", 3)

    if goal == "Friend":
        questions_path = './dialogueSystem/questionsFriendGoal.txt'
    else:  # User
        questions_path = './dialogueSystem/questionsUserGoal.txt'
    with open(questions_path, encoding="utf8") as f_questions:
        self.questions = f_questions.read().splitlines()

    self.asked_questions = []
    self.prev_count = -1
    self.model = DialogTag('distilbert-base-uncased')  # dialogue tags
def __init__(self, ml, subreddit, location):
    """Initialize the Markov Chain and writer

    ml: MarkovChain's max_links
    subreddit: The subreddit to use as the text source
    location: "posts"|"comments" - Whether to get the text from the top
        posts in the subreddit (faster), or from the children comments of
        the top posts in the subreddit (can return more text).

    Raises TypeError when `location` is neither "posts" nor "comments".
    """
    reader = RedditReader(subreddit)
    if location not in ("posts", "comments"):
        raise TypeError('`location` must be either "posts" or "comments"')

    if location == "posts":
        texts = reader.get_many_post_bodies()
    else:
        texts = reader.get_many_comment_bodies()

    chain = MarkovChain(ml)
    self.mc = chain
    for text in texts:
        chain.add_text(text)
    self.w = Writer(chain)
def make_leftright_hmm(n_states, pD, obs_data, l_data=None):
    """
    Build a left-right Hidden Markov Model and train it on the given data.

    Input:
    ------
    n_states: Desired number of HMM states.
    pD: a single object of some probability-distribution class
    obs_data: [n_samples, n_features]. The concatenated training sequences.
        One sample of observed data vector is stored row-wise.
    l_data: [n_sequence, ]. l_data[r] is the length of rth training
        sequence. Defaults to one sequence covering all of obs_data.

    Return:
    hmm: the trained left-right hmm object

    Raises ValueError when n_states is not positive.
    """
    if n_states <= 0:
        raise ValueError("Number of states must be >0")

    # Without explicit lengths, treat the data as one single sequence.
    lengths = [obs_data.shape[0]] if l_data is None else l_data

    # Left-right Markov chain with finite duration; the average state
    # duration is the mean sequence length divided by the state count.
    mean_duration = np.mean(lengths) / n_states
    chain = MarkovChain()
    chain.init_left_right(n_states, mean_duration)

    model = HMM(chain, pD)
    # Crude initialisation of the output distributions, then standard training.
    model.init_leftright_outputdistr(obs_data, lengths)
    model.train(obs_data, lengths, 5, np.log(1.01))
    return model
def generate_seedless_markov_sentence():
    """Generate one sentence from a chain built on the text from get_text().

    Returns '' when check_blacklist() flags the sentence, otherwise the
    sentence passed through sentence_case().
    """
    corpus = ' '.join(get_text())
    chain = MarkovChain(verbose=False)
    chain.generateDatabase(corpus)
    candidate = chain.generateString()
    return '' if check_blacklist(candidate) else sentence_case(candidate)
def generate_topic_markov_sentence(texts, index):
    """Generate one sentence steered by the topics of ``texts`` at ``index``.

    The topics come from get_topics(); the chain itself is built on the
    text from get_text(). Returns '' when check_blacklist() flags the
    sentence, otherwise the sentence passed through sentence_case().
    """
    topics = get_topics(texts, index)
    chain = MarkovChain(verbose=False)
    corpus = ' '.join(get_text())
    chain.generateDatabase(corpus)
    candidate = chain.generateStringWithTopics(topics)
    return '' if check_blacklist(candidate) else sentence_case(candidate)
def __init__(self, database, window_size):
    """
    Create the Markov model and attach a tokenizer built from its tokens.

    :param database: passed through to MarkovChain unchanged
    :param window_size: passed through to MarkovChain as its second argument
    :type window_size: int
    """
    model = MarkovChain(database, window_size)
    self.model = model
    tokenizer = Tokenizer(Tokenizer.LoadStrategy(model.tokens))
    self.tokenizer = tokenizer
    model.set_tokenizer(tokenizer)
def main():
    """Generate an image whose tile order is sampled from a Markov chain.

    Command-line options select the input image directory, the output
    directory and file name, and the side length of the square image.
    One State is created per entry of DOG_NAMES, a chain of
    ``img_dims ** 2`` states is sampled, and the result is handed to
    ImgGenerator.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-i', '--input_dir', action='store', type=str, default=INPUT_DIR,
        help=f'Specify name of input image directory --in_path=path. Default: {INPUT_DIR}',
        dest='input_dir')
    arg_parser.add_argument(
        '-od', '--output_dir', action='store', type=str, default=OUTPUT_DIR,
        help=f'Specify name of output directory. Default: {OUTPUT_DIR}',
        dest='output_dir')
    arg_parser.add_argument(
        '-o', '--output_file', action='store', type=str,
        default=OUTPUT_FILENAME,
        help=f'Specify name of output file. Default: {OUTPUT_FILENAME}',
        dest='output_file')
    arg_parser.add_argument(
        '-d', '--img_dim', action='store', type=int, default=IMG_DIMS,
        help=f'Specify dimensions of square image to be created. Default: {IMG_DIMS}',
        dest='img_dims')
    args = arg_parser.parse_args()
    sequence_length = args.img_dims ** 2

    # Work relative to the current directory, with any "scripts" component
    # removed so the script also works when launched from that directory.
    cwd = os.getcwd().replace("scripts", "")

    dog_names = DOG_NAMES
    # Probability that the first pixel (upper left corner) is each dog.
    prior = PRIOR
    # Transition matrix, one row per dog.
    transition = [
        ROSIE_TRANSITION, CALLIE_TRANSITION, VENUS_TRANSITION,
        BEAR_TRANSITION, JAMIE_TRANSITION, COOPER_TRANSITION,
        WINSTON_TRANSITION, BRUNO_TRANSITION, MAISY_TRANSITION,
        SPEEDY_TRANSITION, BELLA_TRANSITION, BOOMER_TRANSITION,
        SASHA_TRANSITION,
    ]

    # One State object per dog name.
    all_states = [State(id=idx, name=dog) for idx, dog in enumerate(dog_names)]

    # Sample the sequence of states.
    m = MarkovChain(states=all_states, prior=prior, transition=transition)
    sequence = m.run(sequence_length=sequence_length)

    output_path = os.path.join(cwd, args.output_dir, args.output_file)

    # Render the image from the sampled sequence and the input images.
    generator = ImgGenerator(
        order=sequence,
        input_dir=os.path.join(cwd, args.input_dir),
        output_path=output_path,
        num_rows=args.img_dims,
        num_cols=args.img_dims,
        all_states=all_states)
    generator.generate_img()
def __init__(self, markov_chain_db: str, head_db: str, pmi_db: str,
             logger=None):
    """Initialise the base Mod, the Markov generator and the head selector.

    ``markov_chain_db`` and ``head_db`` are kept on the instance;
    ``pmi_db`` is only forwarded to HeadSelector.
    """
    Mod.__init__(self, logger)
    self.markov_chain_db, self.head_db = markov_chain_db, head_db
    self.gen = MarkovChain(markov_chain_db)
    self.hs = HeadSelector(head_db, pmi_db)
def get(self):
    """Return a random sentence generated from ``surgery.txt``.

    The sentence length is taken from the ``num`` query argument and
    defaults to 20 when the argument is absent. ``.get`` is used because
    subscripting with a missing key raises before the ``None`` check can
    run, which made the default unreachable.
    NOTE(review): assumes ``request.args`` is dict-like (e.g. Flask's
    request arguments) - confirm.

    Raises ValueError when ``num`` is present but not an integer string.
    """
    raw_num = request.args.get('num')
    num_of_words = int(raw_num) if raw_num is not None else 20
    chain = MarkovChain('surgery.txt')
    sentence = [chain.generate_random_sentence(num_of_words - 1)]
    return ' '.join(sentence)
def main(argv=None):
    """Train a chain on the prepared samples and print generated sequences.

    Prints ``args.count`` sequences of length ``args.length``, each passed
    through the postprocessor first. ``argv`` is accepted but not used.
    """
    args = parse_args()
    chain = MarkovChain([], args.order)
    samples, postprocessor = prepare_samples_and_postprocessor(args)
    chain.add_samples(samples)
    for _ in range(args.count):
        generated = chain.generate(args.length)
        print(postprocessor(generated))
def __init__(self, filename):
    """Read the MIDI file and record its note transitions in a chain.

    Every "note_on" message contributes a transition from the previous
    note to the current one. Notes are reduced modulo 12 and the first
    transition starts from 0. The third argument of ``add`` is always 0.
    """
    self.filename = filename
    self.markov_chain = MarkovChain()
    last_note = 0
    for track in mido.MidiFile(self.filename).tracks:
        for msg in track:
            if msg.type != "note_on":
                continue
            note = msg.note % 12
            self.markov_chain.add(last_note, note, 0)
            last_note = note
def main():
    """Build a chain from a user's tweets and generate sentences from it.

    Asks for the user handle, then keeps asking for the number of
    sentences until an integer is entered.
    """
    handle = input("User handle to analyze?\n")
    words = tweets_to_list(get_all_tweets(handle))
    chain = MarkovChain(words)

    count = None
    while count is None:
        try:
            count = int(input("How many sentences to generate?\n"))
        except ValueError:
            print("Please input a number\n")

    for _ in range(count):
        chain.generate_sentence()
class GenerateStrategy:
    """
    Text generation strategy
    """

    def __init__(self, database, window_size):
        """
        Create the Markov model and attach a tokenizer built from its tokens.

        :param database: passed through to MarkovChain unchanged
        :param window_size: passed through to MarkovChain as its second
            argument
        :type window_size: int
        """
        model = MarkovChain(database, window_size)
        self.model = model
        tokenizer = Tokenizer(Tokenizer.LoadStrategy(model.tokens))
        self.tokenizer = tokenizer
        model.set_tokenizer(tokenizer)
def generate_database(self, captured_text_path='captured_raw_text.txt'):
    """Build the Markov database from the captured raw text file.

    Each line of the file is prepared with PrepareText. Falsy results and
    results rejected by ``self._drop`` are discarded, and duplicates
    collapse because the survivors are kept in a set. The chain is then
    generated with 4-grams in lowercase, dumped, and stored on
    ``self.markov``.
    """
    preparer = PrepareText()
    with open(captured_text_path) as source:
        raw_lines = source.readlines()

    print('Preparing texts')
    progress = ProgressBar()
    prepared = [preparer.prepare(line) for line in progress(raw_lines)]
    kept = {text for text in prepared if text and not self._drop(text)}

    print('Generating database')
    chain = MarkovChain(self.db_path, verbose=False)
    chain.generateDatabase('\n'.join(kept), n=4, make_lowercase=True)
    chain.dumpdb()
    self.markov = chain
def test_random_walk(self):
    '''Every adjacent word pair of a random walk is a transition in the chain.'''
    corpus = "one fish two fish red fish blue fish".split()
    mark = MarkovChain(corpus)
    # The words produced by one random walk.
    walk = mark.random_walk().split()
    for current, following in zip(walk, walk[1:]):
        # `following` must be one of the recorded successors of `current`.
        allowed = list(mark.chain[current].keys())
        assert following in allowed
def test_can_probabilistically_transition_to_a_state(self, mock_random):
    """Tests that stepping the chain moves it through the expected states."""
    matrix = np.array([
        [1 / 2, 1 / 4, 1 / 4],
        [1 / 2, 0, 1 / 2],
        [1 / 4, 1 / 4, 1 / 2],
    ])
    chain = MarkovChain(transition_matrix=matrix,
                        states=['R', 'N', 'S'],
                        initial_state='N')

    visited = []
    for _ in range(2):
        chain.step()
        visited.append(chain.current_state)

    first, second = visited
    assert first == 'S'
    assert second == 'R'
def __init__(self, filename, verbose=False):
    """
    Parse the midi file `filename` and build a markov chain of its notes.

    `verbose` is forwarded to `_parse`.
    """
    self.filename = filename
    # Tempo: number of microseconds per beat.
    # Ticks per beat: the delta time between midi messages is a tick count.
    self.tempo, self.ticks_per_beat = None, None
    self.markov_chain = MarkovChain()
    self._parse(verbose=verbose)
class TestMarkovModel(unittest.TestCase):
    """Tests for a MarkovChain fitted on two short lyric lines."""

    def setUp(self):
        self.lyrics = [["hello world"], ["how are you"]]
        self.model = MarkovChain()
        self.model.fit(self.lyrics)

    def test_first_words(self):
        """The initial words are the first word of each line."""
        self.assertEqual(self.model.initial, ["hello", "how"])

    def test_second_transition(self):
        """Each first word maps to the word that follows it."""
        expected = defaultdict(list)
        for first, second in (("hello", "world"), ("how", "are")):
            expected[first].append(second)
        self.assertEqual(self.model.second, expected)
def test_add_N_1(self):
    """Adding sequences updates the transition and START counts of the model."""
    mc = MarkovChain()
    # Each step: (sequence to add, expected model afterwards).
    steps = [
        (
            ('a', 'b', 'c'),
            {
                ('a',): {'b': 1},
                ('b',): {'c': 1},
                'START': {('a',): 1},
            },
        ),
        (
            ('b', 'a'),
            {
                ('a',): {'b': 1},
                ('b',): {'c': 1, 'a': 1},
                'START': {('a',): 1, ('b',): 1},
            },
        ),
        (
            ('a', 'c'),
            {
                ('a',): {'b': 1, 'c': 1},
                ('b',): {'c': 1, 'a': 1},
                'START': {('a',): 2, ('b',): 1},
            },
        ),
    ]
    for sequence, expected_model in steps:
        mc.add(sequence)
        self.assertEqual(expected_model, mc.model)
def __read_times():
    """Read keystrokes until '\\r' and return a chain of inter-key times.

    Each pair of consecutive characters is added to the chain together
    with the time spent waiting for the second one. Characters are echoed
    to standard output as they are read.
    """
    chain = MarkovChain()
    previous = getch()
    sys.stdout.write(previous)
    while previous != '\r':
        started = time.time()
        current = getch()
        elapsed = time.time() - started
        chain.add_value(previous, current, elapsed)
        previous = current
        sys.stdout.write(previous)
    sys.stdout.write('\n')
    return chain
def compare_users(epsilon, verbose):
    """Compare two users.

    - epsilon: Value assigned to ``MarkovChain.set_epsilon`` before the
      comparison.
    - verbose: Enable or disable verbose printing.

    Each user will in turn have to type his text. Do not hit the ENTER
    (or RETURN) key until you are done, as it is how the input is
    validated.
    """
    USER_1 = "Bro 1"
    USER_2 = "Bro 2"
    COMPARISON = " != "
    # NOTE(review): this rebinds the class attribute `set_epsilon` instead
    # of calling it - confirm against MarkovChain that this is intended.
    MarkovChain.set_epsilon = epsilon

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Please type your texts. Hit the ENTER key once you have finished typing.")
    reader = MarkovTimeReader()
    print(USER_1)
    chain_1 = reader.read()
    print(USER_2)
    chain_2 = reader.read()

    if MarkovChain.are_similiar(chain_1, chain_2):
        COMPARISON = " == "
    print("\n" + USER_1 + COMPARISON + USER_2)

    if verbose:
        print("\nEpsilon used: " + str(epsilon))
        print("Markov chains:")
        print(USER_1 + ":")
        chain_1.display()
        print(USER_2 + ":")
        chain_2.display()
def generate_condition_data(self):
    """
    Fill the 'Conditions' column ('Sunny', 'Rain', 'Snow') of self.output
    for every observation period using a simple Markov Chain model.
    """
    # Start from 'NA' in every row.
    self.output['Conditions'] = 'NA'

    forecaster = MarkovChain()

    def _forecast(row):
        # Row values are ordered as selected below: temperature, humidity.
        return forecaster.forecast_weather(row.values[0], row.values[1])

    readings = self.output[["Temperature", "Humidity"]]
    self.output[['Conditions']] = readings.apply(_forecast, axis=1)
def test_state_name_retrieval_by_index(self):
    """Tests that indexing `states` with a state number returns its label."""
    labels = ['R', 'N', 'S']
    chain = MarkovChain(states=labels)
    assert chain.states[1] == 'N'
class NewsMaker:
    """Generate a news text for every processed trend and store it as JSON."""

    def __init__(self):
        """Load the processed trends and create the Markov chain."""
        trends_processor = TrendsProcessor()
        self.trends = trends_processor.processed_trends
        self.markov_chain = MarkovChain()

    def start(self):
        """Generate and save one text per trend."""
        for t in self.trends:
            text = self.markov_chain.execute(t.texts)
            self.save(t, text)

    def save(self, trend, text):
        """Append a News entry to the JSON file at ``constants.NEWS_JSON``.

        If the existing content cannot be parsed the error is logged and
        the file is rewritten with only the new entry. The file is read
        with the same UTF-8 encoding it is written with, because the dump
        uses ``ensure_ascii=False`` and may therefore contain non-ASCII
        characters.
        """
        news = News(trend, text)
        news_data = None
        with open(constants.NEWS_JSON, encoding='utf8') as json_data:
            try:
                news_data = json.load(json_data)
            except Exception as e:
                Logger.error("Got %s on json.load('news.json')" % e)
        if news_data is None:
            news_data = []
        news_data.append(news.__dict__)
        with open(constants.NEWS_JSON, mode='w', encoding='utf8') as json_file:
            data = json.dumps(news_data, ensure_ascii=False, indent=4)
            json_file.write(data)
def generate_markov_sentence(original_sentence):
    """Generate a sentence seeded with the first words of ``original_sentence``.

    The tag-stripped sentence supplies a three-word seed first and, if
    generation with that fails, a two-word seed. When both fail the
    result of generate_seedless_markov_sentence() is returned as is.

    Returns '' when check_blacklist() flags the seeded sentence, otherwise
    the sentence passed through sentence_case().
    """
    mc = MarkovChain(verbose=False)
    mc.generateDatabase(' '.join(get_text()))
    stripped = strip_tags(original_sentence)

    for seed_length in (3, 2):
        try:
            seed = ' '.join(stripped.split()[0:seed_length])
            sent = mc.generateStringWithSeed(seed)
        except Exception:
            # Best effort: fall through to the next, shorter seed.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
        break
    else:
        return generate_seedless_markov_sentence()

    if check_blacklist(sent):
        return ''
    return sentence_case(sent)
def test_can_create_with_transition_matrix(self):
    """Tests that the matrix passed at creation is exposed unchanged."""
    rows = [
        [1 / 2, 1 / 4, 1 / 4],
        [1 / 2, 0, 1 / 2],
        [1 / 4, 1 / 4, 1 / 2],
    ]
    matrix = np.array(rows)
    chain = MarkovChain(transition_matrix=matrix)
    assert np.array_equal(chain.transition_matrix, matrix)
def chain(self):
    """A three-state Markov chain fixture ('R', 'N', 'S') starting in 'N'."""
    matrix = np.array([
        [1 / 2, 1 / 4, 1 / 4],
        [1 / 2, 0, 1 / 2],
        [1 / 4, 1 / 4, 1 / 2],
    ])
    return MarkovChain(transition_matrix=matrix,
                       states=['R', 'N', 'S'],
                       initial_state='N')
def test_parse_and_add(self):
    """Parsing a two-sentence text records word transitions and both starts."""
    text = (
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '
        'Donec ornare placerat fringilla.'
    )
    expected_model = {
        ('Lorem',): {'ipsum': 1},
        ('ipsum',): {'dolor': 1},
        ('dolor',): {'sit': 1},
        ('sit',): {'amet,': 1},
        ('amet,',): {'consectetur': 1},
        ('consectetur',): {'adipiscing': 1},
        ('adipiscing',): {'elit.': 1},
        ('elit.',): {'Donec': 1},
        ('Donec',): {'ornare': 1},
        ('ornare',): {'placerat': 1},
        ('placerat',): {'fringilla.': 1},
        'START': {('Lorem',): 1, ('Donec',): 1},
    }
    mc = MarkovChain()
    mc.parse_and_add(text)
    self.assertEqual(expected_model, mc.model)
def __init__(self, sentence_tokenizer, word_tokenizer):
    """Store the tokenizers and create empty training state."""
    # Collaborators supplied by the caller.
    self.sentence_tokenizer, self.word_tokenizer = (
        sentence_tokenizer, word_tokenizer)
    # Model and statistics, all empty until training.
    self.markov_chain = MarkovChain()
    self.word_contexts = defaultdict(list)
    self.word_counts, self.word_pair_counts = Counter(), Counter()
def __init__(self, database, text_path, window_size):
    """
    Create the Markov model and remember where the dataset lives.

    :param database: passed through to MarkovChain unchanged
    :param text_path: path to the dataset
    :type text_path: str
    :param window_size: window size
    :type window_size: int
    """
    self.text_path = text_path
    self.model = MarkovChain(database, window_size)
def __init__(self, filename, verbose=False, order=1):
    """
    This is the constructor for a Serializer, which will serialize
    a midi given the filename and generate a markov chain of the
    notes in the midi.

    `order` is stored on the instance and on the markov chain;
    `verbose` is forwarded to `_parse`.
    """
    self.filename = filename
    # The tempo is a number representing the number of microseconds
    # per beat.
    self.tempo = None
    # The delta time between each midi message is a number of ticks,
    # which can be converted to beats using ticks_per_beat. It is
    # initialised here, like tempo, so the attribute always exists
    # even before (or without) the parser reading it from the file.
    self.ticks_per_beat = None
    self.markov_chain = MarkovChain()
    self.order = order
    self.markov_chain.order = order
    self._parse(verbose=verbose)
def __init__(self, markov_chain_db: str, head_db: str, pmi_db: str,
             logger=None):
    """Initialise the base Mod, the Markov generator and the head selector.

    ``markov_chain_db`` and ``head_db`` are kept on the instance;
    ``pmi_db`` is only forwarded to HeadSelector.
    """
    Mod.__init__(self, logger)
    self.markov_chain_db, self.head_db = markov_chain_db, head_db
    self.gen = MarkovChain(markov_chain_db)
    self.hs = HeadSelector(head_db, pmi_db)
def buildMarkovChain(counts):
    """Build a MarkovChain from nested transition counts.

    ``counts`` maps each origin to a mapping of destination -> count.
    For every origin the counts are normalised by the sum of that origin's
    outgoing counts and registered with ``addState``.

    Raises ZeroDivisionError when an origin has destinations whose counts
    sum to zero.
    """
    markovChain = MarkovChain()
    # print(...) with one argument and .items() work on Python 2 and 3.
    print("Counts: " + str(counts))
    for origin, destinationCounts in counts.items():
        # Denominator: total weight of the outgoing edges of this origin.
        totalOutSum = float(sum(destinationCounts.values()))
        transitionProbabilities = {
            destination: count / totalOutSum
            for destination, count in destinationCounts.items()
        }
        markovChain.addState(origin, transitionProbabilities)
    print("Built Markov chain:\n" + str(markovChain))
    return markovChain
class ModMarkovChain(Mod):
    """Mod that replies with Markov-chain text built around selected heads."""

    def __init__(self, markov_chain_db: str, head_db: str, pmi_db: str,
                 logger=None):
        """Initialise the base Mod, the generator and the head selector."""
        Mod.__init__(self, logger)
        self.markov_chain_db, self.head_db = markov_chain_db, head_db
        self.gen = MarkovChain(markov_chain_db)
        self.hs = HeadSelector(head_db, pmi_db)

    def gen_from_sentence(self, sent, num=5):
        """Return one reply string per head selected for ``sent``.

        For every head, ten candidates are generated and logged and the
        shortest one is kept (the earliest on ties). Heads whose shortest
        candidate is empty contribute nothing. The first element of each
        kept candidate is dropped before the rest is joined.
        """
        heads = self.hs.select(sent, num=num)
        print(heads)
        kept = []
        for head, _score in heads:
            query = (params.START_SYMBOL, head)
            # search
            candidates = [self.gen.generate(query) for _ in range(10)]
            shortest = min(candidates, key=len)
            # log
            for candidate in candidates:
                self.logger.info("".join(candidate[1:]))
            if shortest:
                kept.append(shortest)
        return ["".join(reply[1:]) for reply in kept]

    def can_utter(self, message, master):
        """This mod can always reply."""
        return True

    def utter(self, message, master):
        """Return up to three scored replies for ``message["text"]``.

        Each reply is a tuple of a random score in [0.7, 1.0], the text,
        the tag "markov_chain" and an empty dict.
        """
        utterances = []
        for text in self.gen_from_sentence(message["text"], num=3):
            score = random.uniform(0.7, 1.0)
            utterances.append((score, text, "markov_chain", dict()))
        return utterances
def main():
    """Build a three-state demo chain, print samples and pickle the chain.

    The chain is written to ``my_new_pickle.pickle`` in binary mode, as
    pickle requires on Python 3, and the file is closed even if dumping
    fails.
    """
    startStateProbabilities = {"A": 0.25, "B": 0.4, "C": 0.35}
    markovChain = MarkovChain()
    markovChain.addState("A", {"A": 0.1, "B": 0.8, "C": 0.1})
    markovChain.addState("B", {"A": 0.1, "B": 0.1, "C": 0.8})
    markovChain.addState("C", {"A": 0.8, "B": 0.1, "C": 0.1})
    markovChain.setStartStateProbabilities(startStateProbabilities)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(markovChain)
    for _ in range(10):
        print(markovChain.generateData(30))

    with open("my_new_pickle.pickle", 'wb') as f:
        pickle.dump(markovChain, f)
def __init__(self, filename, verbose=False):
    """
    Parse the midi file `filename` and build a markov chain of its notes.

    `verbose` is forwarded to `_parse`.
    """
    self.filename = filename
    # Tempo: the number of microseconds per beat.
    # Ticks per beat: the delta time between midi messages is a number of
    # ticks, which can be converted to beats using this value.
    self.tempo, self.ticks_per_beat = None, None
    self.markov_chain = MarkovChain()
    self._parse(verbose=verbose)
def __init__(self, candidate):
    """Prepare the bot for the input candidate."""
    # Database connection and the tables this bot works with.
    self.DB = ConnectToDB()
    self.corpus_table, self.question_table, self.response_table = (
        'corpus_table', 'question_table', 'response_table')

    # The candidate name is stored in lowercase; the corpus lookup uses it.
    self.candidate = candidate.lower()
    self.corpus = self.get_corpus()

    # Vectorizer, then the markov chain over the candidate corpus.
    self.TV = TokenVectorizer()
    self.sorin = MarkovChain(self.corpus)

    # Running counter used for question and response ids.
    self.idnum = 0
def main(args):
    """Print four random-walk strings from a chain observed on texts/en.txt.

    ``args`` must contain exactly one element, the letters passed to the
    chain. Otherwise a usage line is printed and the process exits with
    status 1. The single element is popped from ``args``.
    """
    if not args or len(args) > 1:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("usage: gensent.py <letters>")
        # Equivalent to exit(1), without relying on the site module.
        raise SystemExit(1)

    letters = args.pop()
    m = MarkovChain(2, letters=letters)
    m.observe_file('texts/en.txt', True)

    for _ in range(4):
        start = m.get_random_prestate()
        print(m.random_walk_string(10, start))
def compare_users():
    """Compare two users.

    Reads one keystroke-timing chain per user, prints whether the chains
    are considered similar, and displays both chains.
    """
    USER_1 = "Bro 1"
    USER_2 = "Bro 2"
    COMPARISON = " != "
    # NOTE(review): this rebinds the class attribute `set_epsilon` instead
    # of calling it - confirm against MarkovChain that this is intended.
    MarkovChain.set_epsilon = 0.1

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(USER_1)
    chain_1 = __read_times()
    print(USER_2)
    chain_2 = __read_times()

    if MarkovChain.are_similiar(chain_1, chain_2):
        COMPARISON = " == "
    print("\n" + USER_1 + COMPARISON + USER_2)

    print(USER_1 + ":")
    chain_1.display()
    print(USER_2 + ":")
    chain_2.display()
class Parser:
    """Read a midi file and record its note transitions in a markov chain."""

    def __init__(self, filename, verbose=False):
        """
        Parse the midi file `filename` and build a markov chain of its
        notes. `verbose` is forwarded to `_parse`.
        """
        self.filename = filename
        # Tempo: the number of microseconds per beat.
        # Ticks per beat: the delta time between midi messages is a number
        # of ticks, which can be converted to beats using this value.
        self.tempo, self.ticks_per_beat = None, None
        self.markov_chain = MarkovChain()
        self._parse(verbose=verbose)

    def _parse(self, verbose=False):
        """
        Read the midi and group its notes into sequenced "chords", which
        are inserted into the markov chain.

        "note_on" messages with a delta time of zero extend the current
        chunk. A "note_on" message with a non-zero delta time sequences
        the previous chunk against the current one and starts a new, empty
        chunk; its own note is not added to any chunk.
        """
        midi = mido.MidiFile(self.filename)
        self.ticks_per_beat = midi.ticks_per_beat
        last_chunk, open_chunk = [], []
        for track in midi.tracks:
            for msg in track:
                if verbose:
                    print(msg)
                if msg.type == "set_tempo":
                    self.tempo = msg.tempo
                    continue
                if msg.type != "note_on":
                    continue
                if msg.time == 0:
                    open_chunk.append(msg.note)
                    continue
                self._sequence(last_chunk, open_chunk, msg.time)
                last_chunk, open_chunk = open_chunk, []

    def _sequence(self, previous_chunk, current_chunk, duration):
        """
        Add every (previous note, current note) combination to the markov
        chain, together with the bucketed duration.
        """
        pairs = [(before, after)
                 for before in previous_chunk
                 for after in current_chunk]
        for before, after in pairs:
            # The duration is bucketed per pair, so nothing is computed
            # (and nothing can raise) when there are no pairs.
            self.markov_chain.add(
                before, after, self._bucket_duration(duration))

    def _bucket_duration(self, ticks):
        """
        Convert a tick count to milliseconds and round it up to the next
        multiple of 250 (an exact multiple moves up a full bucket).

        Raises TypeError when the tempo or ticks_per_beat is unusable.
        """
        try:
            beats = ticks / self.ticks_per_beat
            ms = (beats * self.tempo) / 1000
            return int(ms - (ms % 250) + 250)
        except TypeError:
            raise TypeError(
                "Could not read a tempo and ticks_per_beat from midi")

    def get_chain(self):
        """Return the markov chain built while parsing."""
        return self.markov_chain
class MasterpieceWriter(object):
    """Train a word-pair Markov chain on text files and generate from it."""

    def __init__(self, sentence_tokenizer, word_tokenizer):
        """Store the tokenizers and create empty training state."""
        self.sentence_tokenizer, self.word_tokenizer = (
            sentence_tokenizer, word_tokenizer)
        self.markov_chain = MarkovChain()
        self.word_contexts = defaultdict(list)
        self.word_counts, self.word_pair_counts = Counter(), Counter()

    def _paragraphs_from_file(self, file_name):
        """Yield every non-blank line of the file, stripped of whitespace."""
        with open(file_name) as handle:
            for raw_line in handle:
                paragraph = raw_line.strip()
                if paragraph == "":
                    continue
                yield paragraph

    def _get_words_and_contexts(self, input_files):
        """Yield (word, None) and (None, contexts) pairs for all files.

        Paragraphs and sentences are bracketed by the PARA_* and SENT_*
        markers. Paragraphs without sentences and sentences without words
        are skipped. Contexts, when present, follow the sentence end.
        """
        for file_name in input_files:
            for paragraph in self._paragraphs_from_file(file_name):
                sentences = self.sentence_tokenizer.tokenize(paragraph)
                if len(sentences) > 0:
                    yield PARA_BEGIN, None
                    for sentence in sentences:
                        words, contexts = self.word_tokenizer.tokenize(
                            sentence)
                        if len(words) > 0:
                            yield SENT_BEGIN, None
                            for word in words:
                                yield (word, None)
                            yield SENT_END, None
                            if contexts is not None:
                                yield None, contexts
                    yield PARA_END, None

    def train(self, training_files):
        """Feed the training files into the chain and the statistics."""
        older, previous = None, None
        stream = self._get_words_and_contexts(training_files)
        for word, contexts in stream:
            if contexts is not None:
                for key in contexts:
                    self.word_contexts[key].extend(contexts[key])
            if word is None:
                continue

            # The chain links word pairs, so it needs three tokens.
            if older is not None:
                self.markov_chain.add((older, previous), (previous, word))

            # Statistics only cover ordinary (non-marker) words.
            if word not in ALL_SPECIAL:
                self.word_counts[word] += 1
                if previous not in ALL_SPECIAL:
                    self.word_pair_counts[(previous, word)] += 1

            older, previous = previous, word

    def stats(self, top=10):
        """Return the `top` most common words and word pairs."""
        return {
            "most_common_words": self.word_counts.most_common(top),
            "most_common_word_pairs": self.word_pair_counts.most_common(top),
        }

    def generate_masterpiece(self, prng=None):
        """Yield the begin markers, then the second item of each generated pair."""
        for marker in (PARA_BEGIN, SENT_BEGIN):
            yield marker
        start = (PARA_BEGIN, SENT_BEGIN)
        for _first, second in self.markov_chain.generate(start, prng):
            yield second
class MarkovTimeReader:
    """Class to read times between keystrokes and return a Markov chain.

    - __chain: MarkovChain used for computations.
    - __text: List used to store input characters.
    - __times: List used to store times.
    """

    def __init__(self):
        """Create an empty chain and empty character/time histories."""
        self.__chain = MarkovChain()
        self.__text = []
        self.__times = []

    def __read_character(self):
        """Read one character; return it with the time spent waiting for it."""
        started = time.time()
        character = getch()
        elapsed = time.time() - started
        return character, elapsed

    def __backspace(self):
        """Undo the last typed character when a backspace is received.

        Nothing happens while no text has been typed. With at least two
        characters typed, the last one is removed and its transition time
        is added to the chain again with a negative sign.
        """
        if not self.__text:
            return
        sys.stdout.write('\b \b')
        # NOTE(review): with exactly one character typed, the character is
        # erased on screen but stays in the history - confirm intended.
        if len(self.__text) > 1:
            removed = self.__text.pop()
            before_removed = self.__text[-1]
            removed_time = self.__times.pop()
            self.__chain.add_value(before_removed, removed, - removed_time)

    def __normal_character(self, input_character, interval):
        """Record the transition from the previous character, if any.

        - input_character: Input character.
        - interval: Time interval.
        """
        if not self.__text:
            return
        self.__chain.add_value(self.__text[-1], input_character, interval)
        self.__times.append(interval)

    def read(self):
        """Read characters until '\\r' and return the resulting chain.

        The object's internals are reset before reading, so the same
        MarkovTimeReader can be used to read any number of chains.
        """
        # Reset the object.
        self.__init__()
        while True:
            character, interval = self.__read_character()
            if character == '\r':
                break
            if character == '\x7f':
                self.__backspace()
                continue
            self.__normal_character(character, interval)
            self.__text.append(character)
            sys.stdout.write(character)
        sys.stdout.write('\n')
        return self.__chain
class PoliBot(object):
    """Answer questions with generated sentences for one candidate.

    Sentences come from a Markov chain over the candidate's corpus and are
    ranked against the question with ``cosine``.
    """

    def __init__(self, candidate, ):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'
        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()
        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)
        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        """Register ``question`` and reset the per-question response state.

        Tokenising and vectorising are best effort: an ordinary exception
        at any step falls back to an empty value, so an unusable question
        yields an empty question vector instead of an error.
        """
        ts = time.time()
        asked_at = datetime.datetime.fromtimestamp(ts)
        self.date = int(asked_at.strftime('%Y%m%d'))
        self.time = int(asked_at.strftime('%H%M%S'))
        self.ID = str(self.idnum) + '_' + str(ts)
        self.idnum += 1

        # `except Exception` keeps the fallbacks without swallowing
        # KeyboardInterrupt/SystemExit.
        try:
            tokens = self.TV.tokenize_full(question)
        except Exception:
            tokens = []
        try:
            word_string = [str(t) for t in tokens]
        except Exception:
            word_string = ""
        try:
            tokens_vect = self.TV.make_vector(word_string)
        except Exception:
            tokens_vect = []

        # Several token vectors are averaged; a single one is unwrapped.
        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect) / len(tokens_vect)
        else:
            question_vect = tokens_vect
        if len(question_vect) == 1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect

        self.question_log = {
            'question_id': [self.ID],
            'question_date': [self.date],
            'question_time': [self.time],
            'question_sent': [question],
            'question_tokens': [tokens]}
        self.response_log = {
            'response_id': [self.ID],
            'response_date': [self.date],
            'response_time': [self.time],
            'response_candidate': [self.candidate],
            'response_sent': [],
            'response_tokens': [],
            'cosine_sim': [0],
            'question_id': [self.question_log['question_id'][0]]
        }
        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        """Generate sentences and return the one scoring highest.

        A batch of ``num_sent`` sentences is generated and scored against
        the current question. While the logged score is below 0.70 and
        fewer than ``tries`` retries have been made, another batch is
        generated. ``save_to_db`` is passed on to the retries, so the
        final attempt saves the logs when requested.

        Returns the logged sentence, or ``None`` when no sentence was
        selected.
        """
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)
        cosine_sims = [0]
        for sent in generated_sentences:
            if sent is None:
                continue
            tokens = self.TV.tokenize_full(sent)
            if tokens is None:
                continue
            word_string = [str(t) for t in tokens]
            tokens_vect = self.TV.make_vector(word_string)
            if len(tokens_vect) > 1:
                response_vect = sum(tokens_vect) / len(tokens_vect)
            else:
                response_vect = tokens_vect
            # Cosine similarity; sentences that cannot be scored are skipped.
            try:
                cosine_sim_0 = cosine(response_vect, self.question_vect)
            except Exception:
                continue
            if cosine_sim_0 > np.max(cosine_sims):
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]
            cosine_sims.append(cosine_sim_0)

        if (self.responseLOOPcounter < tries) and \
                (self.response_log['cosine_sim'][0] < 0.70):
            self.responseLOOPcounter += 1
            return self.response(num_sent=num_sent, tries=tries,
                                 save_to_db=save_to_db)

        self.response_log['cosine_sim_dist'] = \
            [(np.mean(cosine_sims), np.std(cosine_sims))]
        if save_to_db:
            self.DB.save_to_db(self.question_table, self.question_log)
            self.DB.save_to_db(self.response_table, self.response_log)
        else:
            print("Not saving to db")

        best = self.response_log['response_sent']
        return best[0] if best else None

    def get_corpus(self):
        """Fetch the candidate's corpus from the corpus table."""
        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
def __init__(self):
    """Create an empty chain and empty character/time histories."""
    self.__chain = MarkovChain()
    self.__text, self.__times = [], []
def __init__(self, db_path='markov.db'):
    """Load the chain stored at ``db_path``; seed a new one if that fails.

    Any ordinary exception raised while opening the database triggers the
    fallback to ``seed_db``. ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately not swallowed.
    """
    try:
        self.mc = MarkovChain(db_path, verbose=False)
    except Exception:
        print('No database found at path. Creating new database.')
        self.mc = seed_db(db_path)