Example #1
class MarkovOnTopic(object):
    '''
    TODO: ngrams bumped to four or five-grams
    TODO: first ngram is the technical noun topic of the tweet
    TODO: stem the topics to narrow them. or even better find a way to standardize synonyms to the same word and then stem
    TODO: db key has no hashmarks but the values do
    TODO: db key in lowercase, but values are original
    '''
    def __init__(self, db_path='markov.db'):
        try:
            self.mc = MarkovChain(db_path, verbose=False)
        except Exception:
            print('No database found at path. Creating new database.')
            self.mc = seed_db(db_path)

    def generate_db(self, docs, filename=None):
        self.docs = docs
        pass

    def generate_topics(self):
        pass

    def generate_string(self, seed=None):
        regen = True
        while regen:
            if seed:
                gen_text = self.mc.generateStringWithSeed(seed)
            else:
                gen_text = self.mc.generateString()
            if not drop(gen_text):
                print(gen_text)
                regen = False
Example #2
    def calc_markov(self, file):
        try:
            with open(file, encoding="utf-8") as f:
                probabilities = MarkovChain.get_words_propabilities(f.read())
        except FileNotFoundError:
            return BAD_FILE_ERROR

        init_word = random.choice(list(probabilities.keys()))
        mc = MarkovChain(MarkovState(init_word))
        for word, probs in probabilities.items():
            state_1 = MarkovState(word)
            for word_2, value in probs.items():
                state_2 = MarkovState(word_2)
                mc.add_probability(state_1, state_2, value)

        words_threshold = self.sentence_count * 25
        result = init_word.capitalize()
        for word in mc:
            if self.get_sentence_count(result) >= self.sentence_count:
                break
            if result.count(" ") > words_threshold:
                return BAD_FILE_ERROR
            if result.endswith("."):
                word = word.capitalize()
            result += f" {word}"

        return result
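
A minimal sketch of the get_sentence_count helper that calc_markov relies on, assuming sentences are delimited by the same terminating periods the loop keys on (the real helper is not shown here):

    def get_sentence_count(self, text):
        # Count completed sentences by their terminating periods.
        return text.count(".")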
Example #3
def main():
    """
    Executes a MarkovChain for text generation.

    Then it will be wait for user input.
    If length of user input less than `window` parameter of the chain,
    then random text will be generated, else last 3 words of the input will be
    taken as a start of generated text.
    If you want break the process, then enter `48598ee283437e810f2f0eb1cf66e217`.
    """
    chain = MarkovChain()
    # The path is relative to the working directory from which this script is run.
    chain.chain = extensions.file.json.read("./src/markov-chain/generated-chains/ru/my-favorites-3-window.json")

    while True:
        start_text = input()

        if start_text == "48598ee283437e810f2f0eb1cf66e217":
            break

        # 3 is the chain's window size (matches the 3-window chain file above).
        start_text = handle_input_text(start_text, 3)

        if start_text:
            print(chain.generate(start=start_text))
        else:
            print(chain.generate())
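
A minimal sketch of handle_input_text consistent with the docstring, assuming it returns a falsy value when the input is shorter than the window (the real helper is not shown here):

def handle_input_text(text, window):
    # Use the last `window` words as the start of generation,
    # or None when the input is too short for the chain's window.
    words = text.split()
    if len(words) < window:
        return None
    return ' '.join(words[-window:])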
Example #4
    def test_discrete_outdistr(self):
        q1 = np.array([1, 0])
        A1 = np.array([[0.9, 0.1, 0], [0, 0.9, 0.1]])
        mc = MarkovChain(q1, A1)
        pD_list = [
            DiscreteDistr(np.array([0.6, 0.3, 0.1])),
            DiscreteDistr(np.array([0.1, 0.3, 0.6]))
        ]
        hmm1 = HMM(mc, pD_list)
        n_states = hmm1.n_states
        Z = np.array([1, 3, 2])
        T = len(Z)
        pZ, _ = pD_list[0].prob(Z, pD_list)

        alpha_hat, c = mc.forward(pZ)
        expected_alpha_hat = np.array([[1.0000, 0.6000, 0.5625],
                                       [0, 0.4000, 0.4375]])
        np.testing.assert_array_almost_equal(alpha_hat,
                                             expected_alpha_hat,
                                             decimal=4)

        beta_hat = mc.backward(pZ, c)
        expected_beta_hat = np.array([[1.6667, 1.5873, 0],
                                      [12.8571, 14.2857, 7.9365]])
        np.testing.assert_array_almost_equal(beta_hat,
                                             expected_beta_hat,
                                             decimal=4)

        gamma = np.multiply(np.multiply(alpha_hat, beta_hat),
                            np.tile(c[0:T], (n_states, 1)))  # to check
        expected_gamma = np.array([[1.0000, 0.1429, 0], [0, 0.8571, 1.0000]])
        np.testing.assert_array_almost_equal(gamma, expected_gamma, decimal=4)
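
The gamma computation above implements the standard scaled forward-backward identity, where the c_t are the scaling factors returned by the forward pass:

    \gamma_t(i) = \hat{\alpha}_t(i)\,\hat{\beta}_t(i)\,c_t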
Example #5
    def test_gauss_outdistr(self):
        p0 = np.array([1, 0])
        A = np.array([[0.9, 0.1, 0], [0, 0.9, 0.1]])
        mc = MarkovChain(p0, A)
        pD_list = []
        pD_list.append(GaussDistr(mean=np.array([0]), std=np.array([1])))
        pD_list.append(GaussDistr(mean=np.array([3]), std=np.array([2])))
        h = HMM(mc, pD_list)
        n_states = h.n_states
        x = np.array([-0.2, 2.6, 1.3])[:, np.newaxis]
        T = x.shape[0]

        pX, logS = pD_list[0].prob(x, pD_list)
        alpha_hat, c = mc.forward(pX)
        beta_hat = mc.backward(pX, c)
        logP_hmm = logprob(h, x)

        pX_exp = np.array([[1.0000, 0.0695, 1.0000], [0.1418, 1.0000, 0.8111]])
        np.testing.assert_array_almost_equal(pX, pX_exp, decimal=4)

        alpha_hat_exp = np.array([[1.0000, 0.3847, 0.4189],
                                  [0, 0.6153, 0.5811]])
        np.testing.assert_array_almost_equal(alpha_hat,
                                             alpha_hat_exp,
                                             decimal=4)

        c_exp = np.array([1.0000, 0.1625, 0.8266, 0.0581])
        np.testing.assert_array_almost_equal(c, c_exp, decimal=4)

        beta_hat_exp = np.array([[1.0000, 1.0389, 0], [8.4154, 9.3504,
                                                       2.0818]])
        np.testing.assert_array_almost_equal(beta_hat, beta_hat_exp, decimal=4)

        logP_hmm_exp = np.array([-9.1877])
        np.testing.assert_array_almost_equal(logP_hmm, logP_hmm_exp, decimal=4)
Example #6
    def __init__(self, goal):
        self.engine = GrammarEngine('./dialogueSystem/grammar/generator.txt')
        self.p_engine = GrammarEngine(
            './dialogueSystem/grammar/polarity_response.txt')

        # Read the corpus and write out the first 80% as training data,
        # closing both files when done.
        with open('./dialogueSystem/dracula.txt', encoding="utf8") as f_dracula:
            whole_dracula = f_dracula.read()
        train_dracula = whole_dracula[:int(len(whole_dracula) * 0.8)]
        with open('./dialogueSystem/train_dracula.txt', "w+",
                  encoding="utf8") as train_dracula_file:
            train_dracula_file.write(train_dracula)

        self.identity_chain = MarkovChain('./dialogueSystem/train_dracula.txt',
                                          "word", 3)

        if goal == "Friend":
            questions_path = './dialogueSystem/questionsFriendGoal.txt'
        else:  # User
            questions_path = './dialogueSystem/questionsUserGoal.txt'
        with open(questions_path, encoding="utf8") as f_questions:
            self.questions = f_questions.read().splitlines()
        self.asked_questions = []
        self.prev_count = -1

        self.model = DialogTag('distilbert-base-uncased')  # dialogue tags
Example #7
    def __init__(self, ml, subreddit, location):
        """Initialize the Markov Chain and writer

        ml:
            MarkovChain's max_links
        subreddit:
            The subreddit to use as the text source
        location:
            "posts"|"comments" - Whether to get the text from the top posts in
            the subreddit (faster), or from the children comments of the top
            posts in the subreddit (can return more text).
        """
        rr = RedditReader(subreddit)
        if location == "posts":
            texts = rr.get_many_post_bodies()
        elif location == "comments":
            texts = rr.get_many_comment_bodies()
        else:
            raise TypeError('`location` must be either "posts" or "comments"')

        self.mc = MarkovChain(ml)
        for text in texts:
            self.mc.add_text(text)

        self.w = Writer(self.mc)
Example #8
def make_leftright_hmm(n_states, pD, obs_data, l_data=None):
    """
    Initialize and train a Hidden Markov Model to conform with a given set of training data sequence.
    Input:
    ------
    n_states: Desired number of HMM states.
    pD: a single object of some probability-distribution class
    obs_data: [n_samples, n_features]. The concatenated training sequences. One sample of observed data vector is stored row-wise.
    l_data: [n_sequence, ]. l_data[r] is the length of rth training sequence.
    Return:
    hmm: the trained left-right hmm object
    """
    if n_states <= 0:
        raise ValueError("Number of states must be >0")
    if l_data is None:
        l_data = [obs_data.shape[0]] # Just one single sequence
    # Make left-right Markov Chain with finite duration
    D = np.mean(l_data) / n_states # average state duration
    mc = MarkovChain()
    mc.init_left_right(n_states, D)
    hmm = HMM(mc, pD)
    hmm.init_leftright_outputdistr(obs_data, l_data) # crude initialize hmm.output_distr
    # standard training
    hmm.train(obs_data, l_data, 5, np.log(1.01))
    return hmm
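
A usage sketch with hypothetical data, assuming the GaussDistr output distribution used elsewhere in this listing:

import numpy as np

# Two concatenated training sequences of 1-D feature vectors (made up here).
obs_data = np.vstack([np.random.randn(100, 1), np.random.randn(150, 1) + 3])
l_data = [100, 150]

pD = GaussDistr(mean=np.array([0]), std=np.array([1]))
hmm = make_leftright_hmm(n_states=3, pD=pD, obs_data=obs_data, l_data=l_data)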
Example #9
def generate_seedless_markov_sentence():
    mc = MarkovChain(verbose=False)
    mc.generateDatabase(' '.join(get_text()))
    sent = mc.generateString()
    if check_blacklist(sent):
        return ''
    else:
        return sentence_case(sent)
Example #10
def generate_topic_markov_sentence(texts, index):
    topics = get_topics(texts, index)
    mc = MarkovChain(verbose=False)
    mc.generateDatabase(' '.join(get_text()))
    sent = mc.generateStringWithTopics(topics)
    if check_blacklist(sent):
        return ''
    else:
        return sentence_case(sent)
Example #11
        def __init__(self, database, window_size):
            """
            :param window_size: size of the context window
            :type window_size: int
            """
            self.model = MarkovChain(database, window_size)
            self.tokenizer = Tokenizer(Tokenizer.LoadStrategy(self.model.tokens))
            self.model.set_tokenizer(self.tokenizer)
Example #12
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-i', '--input_dir', action='store', type=str, default=INPUT_DIR,
                        help=f'Specify name of input image directory. Default: {INPUT_DIR}',
                        dest='input_dir')
    parser.add_argument('-od', '--output_dir', action='store', type=str, default=OUTPUT_DIR,
                        help=f'Specify name of output directory. Default: {OUTPUT_DIR}',
                        dest='output_dir')
    parser.add_argument('-o', '--output_file', action='store', type=str, default=OUTPUT_FILENAME,
                        help=f'Specify name of output file. Default: {OUTPUT_FILENAME}',
                        dest='output_file')
    parser.add_argument('-d', '--img_dim', action='store', type=int, default=IMG_DIMS,
                        help=f'Specify dimensions of square image to be created. Default: {IMG_DIMS}',
                        dest='img_dims')

    args = parser.parse_args()

    sequence_length = args.img_dims ** 2

    # current working directory
    cwd = os.getcwd()

    # fix problem from running from different directory
    if "scripts" in cwd:
        cwd = cwd.replace("scripts", "")

    dog_names = DOG_NAMES

    # probability of the first pixel (upper left corner) is each dog
    prior = PRIOR

    # transition matrix
    transition = [ROSIE_TRANSITION, CALLIE_TRANSITION, VENUS_TRANSITION, BEAR_TRANSITION, JAMIE_TRANSITION,
                  COOPER_TRANSITION, WINSTON_TRANSITION, BRUNO_TRANSITION, MAISY_TRANSITION, SPEEDY_TRANSITION,
                  BELLA_TRANSITION, BOOMER_TRANSITION, SASHA_TRANSITION]

    all_states = []

    # create State object for each dog in dog names
    for i, dog in enumerate(dog_names):
        new_state = State(id=i, name=dog)
        all_states.append(new_state)

    # run markov chain using states, prior probability vector, and transition matrix
    m = MarkovChain(states=all_states, prior=prior, transition=transition)
    sequence = m.run(sequence_length=sequence_length)

    # create output_path variable
    output_path = os.path.join(cwd, args.output_dir)
    output_path = os.path.join(output_path, args.output_file)

    # generate image from markov chain, input images, and output_path
    img_gen = ImgGenerator(order=sequence, input_dir=os.path.join(cwd, args.input_dir), output_path=output_path,
                           num_rows=args.img_dims,
                           num_cols=args.img_dims, all_states=all_states)
    img_gen.generate_img()
Example #13
    def __init__(self,
                 markov_chain_db: str,
                 head_db: str,
                 pmi_db: str,
                 logger=None):
        Mod.__init__(self, logger)
        self.markov_chain_db = markov_chain_db
        self.head_db = head_db
        self.gen = MarkovChain(self.markov_chain_db)
        self.hs = HeadSelector(self.head_db, pmi_db)
Example #14
    def get(self):
        # request.args['num'] raises a KeyError when absent; use .get() instead.
        num = request.args.get('num')
        if num is not None:
            num_of_words = int(num)
        else:
            num_of_words = 20
        chain = MarkovChain('surgery.txt')

        return chain.generate_random_sentence(num_of_words - 1)
Example #15
def main(argv=None):
    args = parse_args()

    markov = MarkovChain([], args.order)

    samples, postprocessor = prepare_samples_and_postprocessor(args)
    markov.add_samples(samples)

    for i in range(args.count):
        sequence = markov.generate(args.length)
        print(postprocessor(sequence))
Example #16
    def __init__(self, filename):
        self.filename = filename
        self.markov_chain = MarkovChain()

        midi = mido.MidiFile(self.filename)
        previous_note = 0
        for track in midi.tracks:
            for message in track:
                if message.type == "note_on":
                    # Reduce notes to pitch classes (0-11) before adding.
                    current_note = message.note % 12
                    self.markov_chain.add(previous_note, current_note, 0)
                    previous_note = current_note
Example #17
def main():
    user = input("User handle to analyze?\n")
    word_list = tweets_to_list(get_all_tweets(user))
    sentences = 0
    chain = MarkovChain(word_list)
    while True:
        try:
            sentences = int(input("How many sentences to generate?\n"))
            break
        except ValueError:
            print("Please input a number\n")
    for _ in range(sentences):
        chain.generate_sentence()
Example #18
    class GenerateStrategy:
        """
        Text generation strategy.
        """
        def __init__(self, database, window_size):
            """
            :param window_size: size of the context window
            :type window_size: int
            """
            self.model = MarkovChain(database, window_size)
            self.tokenizer = Tokenizer(Tokenizer.LoadStrategy(self.model.tokens))
            self.model.set_tokenizer(self.tokenizer)
Example #19
    def generate_database(self, captured_text_path='captured_raw_text.txt'):
        p = PrepareText()
        with open(captured_text_path) as f:
            raw_text = f.readlines()
        print('Preparing texts')
        pbar = ProgressBar()
        prepared_texts = [p.prepare(i) for i in pbar(raw_text)]
        # Keep only non-empty texts that pass the _drop filter.
        clean_texts = set(filter(lambda x: bool(x) and not self._drop(x), prepared_texts))
        print('Generating database')
        mc = MarkovChain(self.db_path, verbose=False)
        mc.generateDatabase('\n'.join(clean_texts), n=4, make_lowercase=True)
        mc.dumpdb()
        self.markov = mc
Example #20
    def test_random_walk(self):
        '''The sentence generated follows the Markov Chain algorithm.'''
        fish_list = [
            "one", "fish", "two", "fish", "red", "fish", "blue", "fish"
        ]
        mark = MarkovChain(fish_list)
        # store a list of words generated by a random walk
        sentence = mark.random_walk().split()
        for i in range(len(sentence) - 1):
            word = sentence[i]
            word_after = sentence[i + 1]
            # make sure that word_after is allowed to come after the word
            states_that_come_next = list(mark.chain[word].keys())
            assert word_after in states_that_come_next
Example #21
    def test_can_probabilistically_transition_to_a_state(self, mock_random):
        """Tests that the chain can transition to another state based on the probability matrix."""
        matrix = np.array([[1 / 2, 1 / 4, 1 / 4],
                           [1 / 2, 0, 1 / 2],
                           [1 / 4, 1 / 4, 1 / 2]])
        chain = MarkovChain(transition_matrix=matrix, states=['R', 'N', 'S'], initial_state='N')

        chain.step()
        state1 = chain.current_state
        chain.step()
        state2 = chain.current_state

        assert state1 == 'S'
        assert state2 == 'R'
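
The test mocks the random source, so the step implementation itself is not shown; a minimal sketch of a probabilistic transition consistent with this API, assuming current_state indexes a row of transition_matrix:

    def step(self):
        # Draw the next state from the probability row of the current state.
        i = self.states.index(self.current_state)
        self.current_state = np.random.choice(self.states,
                                              p=self.transition_matrix[i])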
Example #22
    def __init__(self, filename, verbose=False):
        """
        This is the constructor for a Serializer, which will serialize
        a midi given the filename and generate a markov chain of the
        notes in the midi.
        """
        self.filename = filename
        # The tempo is the number of microseconds per beat.
        self.tempo = None
        # The delta time between each midi message is a number of ticks,
        # which we can convert to beats using ticks_per_beat.
        self.ticks_per_beat = None
        self.markov_chain = MarkovChain()
        self._parse(verbose=verbose)
Example #23
class TestMarkovModel(unittest.TestCase):
    def setUp(self):
        self.model = MarkovChain()
        self.lyrics = [["hello world"], ["how are you"]]
        self.model.fit(self.lyrics)

    def test_first_words(self):
        self.assertEqual(self.model.initial, ["hello", "how"])

    def test_second_transition(self):
        actual_second = defaultdict(list)
        actual_second["hello"].append("world")
        actual_second["how"].append("are")
        self.assertEqual(self.model.second, actual_second)
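
A minimal fit consistent with these assertions, assuming lyrics arrive as lists of line strings (a sketch, not the tested implementation):

from collections import defaultdict

class MarkovChain:
    def __init__(self):
        self.initial = []                # first word of each line
        self.second = defaultdict(list)  # first word -> second word

    def fit(self, lyrics):
        for song in lyrics:
            for line in song:
                words = line.split()
                if len(words) >= 2:
                    self.initial.append(words[0])
                    self.second[words[0]].append(words[1])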
Example #24
    def test_add_N_1(self):
        mc = MarkovChain()
        mc.add((
            'a',
            'b',
            'c',
        ))
        self.assertEqual(
            {
                ('a', ): {
                    'b': 1
                },
                ('b', ): {
                    'c': 1
                },
                'START': {
                    ('a', ): 1
                },
            }, mc.model)

        mc.add((
            'b',
            'a',
        ))
        self.assertEqual(
            {
                ('a', ): {
                    'b': 1
                },
                ('b', ): {
                    'c': 1,
                    'a': 1
                },
                'START': {
                    ('a', ): 1,
                    ('b', ): 1
                },
            }, mc.model)

        mc.add((
            'a',
            'c',
        ))
        self.assertEqual(
            {
                ('a', ): {
                    'b': 1,
                    'c': 1
                },
                ('b', ): {
                    'c': 1,
                    'a': 1
                },
                'START': {
                    ('a', ): 2,
                    ('b', ): 1
                },
            }, mc.model)
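
The expected model implies an order-1 add that counts n-gram-to-next-token transitions plus a START entry for sequence openers; a minimal sketch under that assumption, with the instance holding a plain dict in self.model:

    def add(self, tokens, n=1):
        # Record the opening n-gram under the START key.
        start = tuple(tokens[:n])
        self.model.setdefault('START', {})
        self.model['START'][start] = self.model['START'].get(start, 0) + 1
        # Count each n-gram -> next-token transition.
        for i in range(len(tokens) - n):
            key = tuple(tokens[i:i + n])
            nxt = tokens[i + n]
            self.model.setdefault(key, {})
            self.model[key][nxt] = self.model[key].get(nxt, 0) + 1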
Example #25
def __read_times():
    """Read times between keystrokes and save them in a Markov chain."""
    chain = MarkovChain()
    a = getch()
    sys.stdout.write(a)
    while a != '\r':
        start = time.time()
        b = getch()
        end = time.time()
        t = end - start
        chain.add_value(a, b, t)
        a = b
        sys.stdout.write(a)
    sys.stdout.write('\n')

    return chain
Example #26
def compare_users(epsilon, verbose):
    """Compare two users.
    - epsilon:
        Threshold used when comparing the two chains.
    - verbose:
        Enable or disable verbose printing.
    Each user will in turn have to type his text. Do not hit the ENTER (or
    RETURN) key until you are done, as that is how the input is validated.
    """
    USER_1 = "Bro 1"
    USER_2 = "Bro 2"
    COMPARISON = " != "

    MarkovChain.set_epsilon = epsilon

    print "Please type your texts. Hit the ENTER key once you have finished typing."
    reader = MarkovTimeReader()
    print USER_1
    chain_1 = reader.read()
    print USER_2
    chain_2 = reader.read()
    if MarkovChain.are_similiar(chain_1, chain_2):
        COMPARISON = " == "

    print "\n" + USER_1 + COMPARISON + USER_2

    if verbose:
        print "\nEpsilon used: " + str(epsilon)
        print "Markov chains:"
        print USER_1 + ":"
        chain_1.display()
        print USER_2 + ":"
        chain_2.display()
Example #27
    def generate_condition_data(self):
        """ Predicts condition ('Sunny', 'Rain', 'Snow') for the current
        observation period using a simple Markov Chain model.

        """
        # set 'Conditions' column to NA
        self.output['Conditions'] = 'NA'

        # instantiate new MarkovChain object
        MC = MarkovChain()

        # apply forecast function on 'Conditions' column based on temperature
        # and humidity values for each observation period
        params = self.output[["Temperature", "Humidity"]]
        self.output[['Conditions']] = params.apply(
            lambda x: MC.forecast_weather(x.values[0], x.values[1]), axis=1)
Example #28
    def test_state_name_retrieval_by_index(self):
        """Tests that, given a state number, the corresponding label can be retrieved."""
        chain = MarkovChain(states=['R', 'N', 'S'])

        state = chain.states[1]

        assert state == 'N'
Example #29
class NewsMaker:
    def __init__(self):
        trends_processor = TrendsProcessor()
        self.trends = trends_processor.processed_trends
        self.markov_chain = MarkovChain()

    def start(self):
        for t in self.trends:
            text = self.markov_chain.execute(t.texts)
            self.save(t, text)

    def save(self, trend, text):
        news = News(trend, text)

        news_data = None
        with open(constants.NEWS_JSON) as json_data:
            try:
                news_data = json.load(json_data)
            except Exception as e:
                Logger.error("Got %s on json.load('news.json')" % e)

        if news_data is None:
            news_data = []

        news_data.append(news.__dict__)
        with open(constants.NEWS_JSON, mode='w', encoding='utf8') as json_file:
            data = json.dumps(news_data, ensure_ascii=False, indent=4)
            json_file.write(data)
Example #30
def generate_markov_sentence(original_sentence):
    mc = MarkovChain(verbose=False)
    mc.generateDatabase(' '.join(get_text()))
    stripped = strip_tags(original_sentence)
    try:
        seed = ' '.join(stripped.split()[0:3])
        sent = mc.generateStringWithSeed(seed)
    except Exception:
        try:
            seed = ' '.join(stripped.split()[0:2])
            sent = mc.generateStringWithSeed(seed)
        except Exception:
            return generate_seedless_markov_sentence()
    if check_blacklist(sent):
        return ''
    else:
        return sentence_case(sent)
Example #31
    def test_can_create_with_transition_matrix(self):
        """Tests that a transition matrix can be passed to define the chain on creation."""
        matrix = np.array([[1 / 2, 1 / 4, 1 / 4], [1 / 2, 0, 1 / 2],
                           [1 / 4, 1 / 4, 1 / 2]])

        chain = MarkovChain(transition_matrix=matrix)

        assert np.array_equal(chain.transition_matrix, matrix)
Example #32
    def chain(self):
        """A Markov chain fixture."""
        matrix = np.array([[1 / 2, 1 / 4, 1 / 4], [1 / 2, 0, 1 / 2],
                           [1 / 4, 1 / 4, 1 / 2]])
        chain = MarkovChain(transition_matrix=matrix,
                            states=['R', 'N', 'S'],
                            initial_state='N')
        return chain
Example #33
    def test_parse_and_add(self):
        mc = MarkovChain()
        mc.parse_and_add(
            'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec ornare placerat fringilla.'
        )
        self.assertEqual(
            {
                ('Lorem', ): {
                    'ipsum': 1
                },
                ('ipsum', ): {
                    'dolor': 1
                },
                ('dolor', ): {
                    'sit': 1
                },
                ('sit', ): {
                    'amet,': 1
                },
                ('amet,', ): {
                    'consectetur': 1
                },
                ('consectetur', ): {
                    'adipiscing': 1
                },
                ('adipiscing', ): {
                    'elit.': 1
                },
                ('elit.', ): {
                    'Donec': 1
                },
                ('Donec', ): {
                    'ornare': 1
                },
                ('ornare', ): {
                    'placerat': 1
                },
                ('placerat', ): {
                    'fringilla.': 1
                },
                'START': {
                    ('Lorem', ): 1,
                    ('Donec', ): 1
                },
            }, mc.model)
Example #34
    def __init__(self, sentence_tokenizer, word_tokenizer):
        self.sentence_tokenizer = sentence_tokenizer
        self.word_tokenizer = word_tokenizer

        self.markov_chain = MarkovChain()
        self.word_contexts = defaultdict(list)

        self.word_counts = Counter()
        self.word_pair_counts = Counter()
Example #35
        def __init__(self, database, text_path, window_size):
            """
            :param text_path: path to the dataset
            :type text_path: str
            :param window_size: window size
            :type window_size: int
            """
            self.model = MarkovChain(database, window_size)
            self.text_path = text_path
Example #36
    def __init__(self, filename, verbose=False, order=1):
        """
        This is the constructor for a Serializer, which will serialize
        a midi given the filename and generate a markov chain of the
        notes in the midi.
        """
        self.filename = filename
        # The tempo is a number representing the number of microseconds
        # per beat.
        self.tempo = None
        # The delta time between each midi message is a number of ticks,
        # which we can convert to beats using ticks_per_beat.
        self.markov_chain = MarkovChain()

        self.order = order
        self.markov_chain.order = order

        self._parse(verbose=verbose)
Example #37
    def __init__(
            self,
            markov_chain_db: str,
            head_db: str,
            pmi_db: str,
            logger=None
    ):
        Mod.__init__(self, logger)
        self.markov_chain_db = markov_chain_db
        self.head_db = head_db
        self.gen = MarkovChain(self.markov_chain_db)
        self.hs = HeadSelector(self.head_db, pmi_db)
Example #38
def buildMarkovChain(counts):
    markovChain = MarkovChain()

    print("Counts: " + str(counts))

    for origin in counts.keys():

        # Get sum of outgoing edges from current origin to determine denominator of probability calculation
        totalOutSum = 0
        for count in counts[origin].values():
            totalOutSum += count

        # Calculate transition probabilities from current origin
        transitionProbabilities = {}
        for destination, count in counts[origin].items():
            transitionProbabilities[destination] = count / float(totalOutSum)

        markovChain.addState(origin, transitionProbabilities)

    print("Built Markov chain:\n" + str(markovChain))
    return markovChain
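
A usage sketch with hypothetical counts; the two outgoing edges from "A" normalize to probabilities that sum to 1:

counts = {"A": {"B": 3, "C": 1}}
chain = buildMarkovChain(counts)
# transition probabilities for "A": {"B": 0.75, "C": 0.25}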
Example #39
class ModMarkovChain(Mod):
    def __init__(
            self,
            markov_chain_db: str,
            head_db: str,
            pmi_db: str,
            logger=None
    ):
        Mod.__init__(self, logger)
        self.markov_chain_db = markov_chain_db
        self.head_db = head_db
        self.gen = MarkovChain(self.markov_chain_db)
        self.hs = HeadSelector(self.head_db, pmi_db)

    def gen_from_sentence(self, sent, num=5):
        heads = self.hs.select(sent, num=num)
        print(heads)
        replies = []
        for head, score in heads:
            query = (params.START_SYMBOL, head, )
            query_cands = []

            # search
            min_len = float("inf")
            min_sent = ""
            for i in range(10):
                # Use a separate name so the `sent` argument is not clobbered.
                cand = self.gen.generate(query)
                if len(cand) < min_len:
                    min_sent = cand
                    min_len = len(min_sent)
                query_cands.append(cand)
            # log
            for _cands in query_cands:
                self.logger.info("".join(_cands[1:]))
            if min_sent:
                replies.append(min_sent)

        return ["".join(_[1:]) for _ in replies]

    def can_utter(self, message, master):
        return True

    def utter(self, message, master):
        return [
            (random.uniform(0.7, 1.0),
             text, "markov_chain", dict())
            for text in self.gen_from_sentence(
                    message["text"],
                    num=3
            )
        ]
Example #40
def main():
    startStateProbabilities = {"A": 0.25, "B": 0.4, "C": 0.35}

    markovChain = MarkovChain()

    markovChain.addState("A", {"A": 0.1, "B": 0.8, "C": 0.1})
    markovChain.addState("B", {"A": 0.1, "B": 0.1, "C": 0.8})
    markovChain.addState("C", {"A": 0.8, "B": 0.1, "C": 0.1})

    markovChain.setStartStateProbabilities(startStateProbabilities)

    print(markovChain)

    for i in range(0, 10):
        print(markovChain.generateData(30))

    # pickle needs a binary-mode file handle.
    with open("my_new_pickle.pickle", 'wb') as f:
        pickle.dump(markovChain, f)
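
Restoring the pickled chain later is the mirror image (binary mode again):

with open("my_new_pickle.pickle", 'rb') as f:
    markovChain = pickle.load(f)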
Example #41
    def __init__(self, filename, verbose=False):
        """
        This is the constructor for a Serializer, which will serialize
        a midi given the filename and generate a markov chain of the
        notes in the midi.
        """
        self.filename = filename
        # The tempo is a number representing the number of microseconds
        # per beat.
        self.tempo = None
        # The delta time between each midi message is a number of ticks,
        # which we can convert to beats using ticks_per_beat.
        self.ticks_per_beat = None
        self.markov_chain = MarkovChain()
        self._parse(verbose=verbose)
Example #42
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0
Example #43
def main(args):

    if not args or len(args) > 1:
        print "usage: gensent.py <letters>"
        exit(1)

    letters = args.pop()

    m = MarkovChain(2, letters=letters)
    m.observe_file('texts/en.txt', True)

    for i in range(4):
        start = m.get_random_prestate()
        print(m.random_walk_string(10, start))
Example #44
def compare_users():
    """Compare two users."""
    USER_1 = "Bro 1"
    USER_2 = "Bro 2"
    COMPARISON = " != "

    MarkovChain.set_epsilon = 0.1

    print(USER_1)
    chain_1 = __read_times()
    print(USER_2)
    chain_2 = __read_times()
    if MarkovChain.are_similiar(chain_1, chain_2):
        COMPARISON = " == "

    print("\n" + USER_1 + COMPARISON + USER_2)
    print(USER_1 + ":")
    chain_1.display()
    print(USER_2 + ":")
    chain_2.display()
Example #45
class Parser:

    def __init__(self, filename, verbose=False):
        """
        This is the constructor for a Parser, which will parse
        a midi given the filename and generate a markov chain of the
        notes in the midi.
        """
        self.filename = filename
        # The tempo is a number representing the number of microseconds
        # per beat.
        self.tempo = None
        # The delta time between each midi message is a number of ticks,
        # which we can convert to beats using ticks_per_beat.
        self.ticks_per_beat = None
        self.markov_chain = MarkovChain()
        self._parse(verbose=verbose)

    def _parse(self, verbose=False):
        """
        This function handles the reading of the midi and chunks the
        notes into sequenced "chords", which are inserted into the
        markov chain.
        """
        midi = mido.MidiFile(self.filename)
        self.ticks_per_beat = midi.ticks_per_beat
        previous_chunk = []
        current_chunk = []
        for track in midi.tracks:
            for message in track:
                if verbose:
                    print(message)
                if message.type == "set_tempo":
                    self.tempo = message.tempo
                elif message.type == "note_on":
                    if message.time == 0:
                        current_chunk.append(message.note)
                    else:
                        self._sequence(previous_chunk,
                                       current_chunk,
                                       message.time)
                        previous_chunk = current_chunk
                        current_chunk = []

    def _sequence(self, previous_chunk, current_chunk, duration):
        """
        Given the previous chunk and the current chunk of notes as well
        as an averaged duration of the current notes, this function
        permutes every combination of the previous notes to the current
        notes and sticks them into the markov chain.
        """
        for n1 in previous_chunk:
            for n2 in current_chunk:
                self.markov_chain.add(
                    n1, n2, self._bucket_duration(duration))

    def _bucket_duration(self, ticks):
        """
        This method takes a tick count and converts it to a time in
        milliseconds, bucketing it upward to the next multiple of 250
        milliseconds.
        """
        try:
            # ticks / ticks_per_beat = beats; beats * tempo = microseconds.
            ms = ((ticks / self.ticks_per_beat) * self.tempo) / 1000
            return int(ms - (ms % 250) + 250)
        except TypeError:
            raise TypeError(
                "Could not read a tempo and ticks_per_beat from midi")

    def get_chain(self):
        return self.markov_chain
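
A usage sketch, with a hypothetical MIDI file name:

parser = Parser("melody.mid")
chain = parser.get_chain()  # MarkovChain of chord-to-chord transitions with bucketed durations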
Example #46
class MasterpieceWriter(object):
    def __init__(self, sentence_tokenizer, word_tokenizer):
        self.sentence_tokenizer = sentence_tokenizer
        self.word_tokenizer = word_tokenizer

        self.markov_chain = MarkovChain()
        self.word_contexts = defaultdict(list)

        self.word_counts = Counter()
        self.word_pair_counts = Counter()

    def _paragraphs_from_file(self, file_name):
        with open(file_name) as f:
            for line in f:
                line = line.strip()
                if line != "":
                    yield line

    def _get_words_and_contexts(self, input_files):
        for file_name in input_files:
            for paragr in self._paragraphs_from_file(file_name):
                sentences = self.sentence_tokenizer.tokenize(paragr)
                if len(sentences) == 0:
                    continue

                yield PARA_BEGIN, None
                for sentence in sentences:
                    words, contexts = self.word_tokenizer.tokenize(sentence)
                    if len(words) == 0:
                        continue

                    yield SENT_BEGIN, None
                    for word in words:
                        yield (word, None)
                    yield SENT_END, None

                    if contexts is not None:
                        yield None, contexts

                yield PARA_END, None

    def train(self, training_files):
        prev_prev_word, prev_word = None, None
        for word, contexts in self._get_words_and_contexts(training_files):
            if contexts is not None:
                for ctx_key in contexts:
                    self.word_contexts[ctx_key].extend(contexts[ctx_key])

            if word is not None:
                # Train markov chain (need at least 3 tokens)
                if prev_prev_word is not None:
                    self.markov_chain.add((prev_prev_word, prev_word),
                                          (prev_word, word))
                # Collect stats
                if word not in ALL_SPECIAL:
                    self.word_counts[word] += 1
                    if prev_word not in ALL_SPECIAL:
                        self.word_pair_counts[(prev_word, word)] += 1

                # Update prev_prev_word and prev_word
                prev_prev_word, prev_word = prev_word, word

    def stats(self, top=10):
        return dict(most_common_words=self.word_counts.most_common(top),
                    most_common_word_pairs=self.word_pair_counts.most_common(top))

    def generate_masterpiece(self, prng=None):
        yield PARA_BEGIN
        yield SENT_BEGIN
        # Each chain step yields a (w1, w2) token pair; emit the new token.
        for pair in self.markov_chain.generate((PARA_BEGIN, SENT_BEGIN), prng):
            w1, w2 = pair
            yield w2
Example #47
class MarkovTimeReader:
    """Class to read times between keystrokes and return a Markov chain.

    - __chain:
        MarkovChain used for computations.
    - __text:
        List used to store input characters.
    - __times:
        List used to store times.
    """

    def __init__(self):
        """Init."""
        self.__chain = MarkovChain()
        self.__text = list()
        self.__times = list()

    def __read_character(self):
        """Read a character and return it along with a time."""
        start = time.time()
        input_char = getch()
        end = time.time()
        t = end - start

        return input_char, t

    def __backspace(self):
        """When the input character is a backspace."""
        if self.__text:
            sys.stdout.write('\b \b')
            if len(self.__text) > 1:
                previous = self.__text.pop()
                ante_previous = self.__text[-1]
                old_time = self.__times.pop()
                self.__chain.add_value(ante_previous, previous, - old_time)

    def __normal_character(self, input_character, interval):
        """When the input character is a normal character.
        - input_character:
            Input character.
        - interval:
            Time interval.
        """
        if self.__text:
            previous = self.__text[-1]
            self.__chain.add_value(previous, input_character, interval)
            self.__times.append(interval)

    def read(self):
        """Read characters.

        Note that the object's internals are reset before reading, so the same
        MarkovTimeReader can be reused to read any number of MarkovChains.
        """
        # Reset the object.
        self.__init__()

        go_on = True
        while go_on:
            input_character, interval = self.__read_character()
            go_on = input_character != '\r'
            if go_on:
                if input_character == '\x7f':
                    self.__backspace()
                else:
                    self.__normal_character(input_character, interval)
                    self.__text.append(input_character)
                    sys.stdout.write(input_character)

        sys.stdout.write('\n')

        return self.__chain
Example #48
class PoliBot(object):
    def __init__(self, candidate):
        """ Prepare the bot for the input candidate."""
        # Connect to the SQL database
        self.DB = ConnectToDB()
        self.corpus_table = 'corpus_table'
        self.question_table = 'question_table'
        self.response_table = 'response_table'

        # Save candidate and get candidate corpus
        self.candidate = candidate.lower()
        self.corpus = self.get_corpus()

        # Initialize the vectorizer
        self.TV = TokenVectorizer()
        # Initialize the markov chain
        self.sorin = MarkovChain(self.corpus)

        # Log dictionary for questions and responses
        self.idnum = 0

    def ask_question(self, question=None):
        ts = time.time()
        self.date = int(datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d'))
        self.time = int(datetime.datetime.fromtimestamp(ts).strftime('%H%M%S'))

        self.ID = str(self.idnum) + '_' + str(ts)
        self.idnum += 1

        try:
            tokens = self.TV.tokenize_full(question)
        except Exception:
            tokens = []

        try:
            word_string = [str(t) for t in tokens]
        except Exception:
            word_string = ""

        try:
            tokens_vect = self.TV.make_vector(word_string)
        except Exception:
            tokens_vect = []

        if len(tokens_vect) > 1:
            question_vect = sum(tokens_vect) / len(tokens_vect)
        else:
            question_vect = tokens_vect

        if len(question_vect) == 1:
            self.question_vect = question_vect[0]
        else:
            self.question_vect = question_vect

        self.question_log = {
                    'question_id':[self.ID],
                    'question_date':[self.date],
                    'question_time':[self.time],
                    'question_sent':[question],
                    'question_tokens':[tokens]}

        self.response_log = {
                    'response_id':[self.ID],
                    'response_date':[self.date],
                    'response_time':[self.time],
                    'response_candidate':[self.candidate],
                    'response_sent':[],
                    'response_tokens':[],
                    'cosine_sim':[0],
                    'question_id':[self.question_log['question_id'][0]]
                    }

        # We want a new response dictionary for each question asked.
        self.response_dict = {}
        self.responseIDcounter = 0
        self.responseLOOPcounter = 0

    def response(self, num_sent=100, tries=10, save_to_db=False):
        generated_sentences = self.sorin.generate_sentences(num_sent=num_sent)

        cosine_sims = [0]
        all_tokens = []
        for i, sent in enumerate(generated_sentences):
            if sent is None:
                continue
            else:
                tokens = self.TV.tokenize_full(sent)
                if tokens is None:
                    continue
                else:
                    word_string = [str(t) for t in tokens]
                    tokens_vect = self.TV.make_vector(word_string)

                if len(tokens_vect) > 1:
                    response_vect = sum(tokens_vect)/len(tokens_vect)
                else:
                    response_vect = tokens_vect

            # Cosine similarity
            try:
                cosine_sim_0 = cosine(response_vect, self.question_vect)
            except Exception:
                continue

            if cosine_sim_0 > np.max(cosine_sims):
                self.response_log['response_sent'] = [sent]
                self.response_log['response_tokens'] = [tokens]
                self.response_log['cosine_sim'] = [cosine_sim_0]

                cosine_sims.append(cosine_sim_0)
                all_tokens.append(tokens)
            else:
                cosine_sims.append(cosine_sim_0)
                all_tokens.append(tokens)

        if (self.responseLOOPcounter < tries) and (self.response_log['cosine_sim'][0] < 0.70):
            self.responseLOOPcounter += 1
            self.response(num_sent=num_sent, tries=tries)
        else:
            self.response_log['cosine_sim_dist'] = \
                    [(np.mean(cosine_sims),np.std(cosine_sims))]

            if save_to_db:
                self.DB.save_to_db(self.question_table, self.question_log)
                self.DB.save_to_db(self.response_table, self.response_log)
            else:
                print("Not saving to db")

        return self.response_log['response_sent'][0]

    def get_corpus(self):

        return self.DB.pull_candidate_corpus(self.corpus_table, self.candidate)
Example #49
    def __init__(self):
        """Init."""
        self.__chain = MarkovChain()
        self.__text = list()
        self.__times = list()
Example #50
    def __init__(self, db_path='markov.db'):
        try:
            self.mc = MarkovChain(db_path, verbose=False)
        except Exception:
            print('No database found at path. Creating new database.')
            self.mc = seed_db(db_path)