def markov_url(targets, dependencies):
    person = open("config/person.json", 'r')
    config = json.load(person)
    split = config["split"]
    flag = False
    time = 0
    day = ''
    data = []
    log = csv.reader(open("logs/out/url.csv"))
    for line in log:
        if flag:
            newTime = parseTime(line[1])
            if day != '' and day != line[0]:
                # Day changed: keep extending the current browsing session
                data[-1].append(line[2])
                day = line[0]
            elif newTime > split + time:
                # Gap longer than `split`: start a new session
                data.append([line[2]])
            else:
                data[-1].append(line[2])
            time = newTime
        else:
            # Skip the CSV header row
            flag = True
    model = markovify.Chain(data, 3)
    path = model.walk()
    with open(targets[0], 'w') as output:
        for item in path:
            output.write(item + '\n')
def __init__(self, input_text=None, state_size=constants.DEFAULT_NGRAM_SIZE,
             chain=None, parsed_sentences=None):
    """
    :param input_text: DISABLED, do not pass this; pass parsed_sentences instead.
    :param state_size: the N in N-gram, AKA state size or window size, same as elsewhere.
    :param chain: a trained markovify.Chain instance for this text, if pre-processed.
    :param parsed_sentences: a list of lists, i.e. [[word, word, ...], [word, word, ...], ...].
        Assumption: these should be sentence-tokenized and word-tokenized before
        being passed here; the text_makers module provides a wrapper that does just that.
    """
    # NOTE: not calling super(); the markovify.Text constructor does some things we
    # don't want. We override it, satisfying the same needs but adapted to our purposes.
    if input_text:
        raise Disabled(
            "disabled in this adapter; tokenize beforehand, pass to `parsed_sentences` in constructor"
        )
    self.state_size = state_size
    self.parsed_sentences = parsed_sentences
    self.chain = chain or markovify.Chain(self.parsed_sentences, state_size)
    # The "rejoined_text" attribute is checked in make_sentences -> test_sentence_output,
    # which assesses the novelty of generated sentences. A very cool feature, but so far
    # it depends on the 'eager' stringification we are trying to get away from,
    # so it is disabled for now.
    self.rejoined_text = u'<DISABLED>'
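# Hedged usage sketch for the adapter above (not from the source): `TextAdapter`
# is a placeholder name for whatever class owns this __init__, and the nltk
# tokenization stands in for the text_makers wrapper the docstring mentions.
# Requires nltk's "punkt" tokenizer data.
import nltk

sentences = nltk.sent_tokenize("The cat sat on the mat. The dog ran away.")
parsed = [nltk.word_tokenize(s) for s in sentences]
adapter = TextAdapter(parsed_sentences=parsed, state_size=2)
print(" ".join(adapter.chain.walk()))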
def __init__(self, do_markovify=True):
    print("tagging the datasets and markovifying them ... please wait!")
    # print(list(brown.tagged_sents()))
    # print(list(nps_chat.tagged_words()))
    # with open("reddit_apple_android.txt", "w") as text_file:
    #     self.tagged_sents = list(nltk.pos_tag(sent) for sent in text_file.sents('reddit_apple_android.txt'))
    self.tagged_sents = list(brown.tagged_sents())
    # self.tagged_sents = list(treebank.tagged_sents())
    # self.tagged_sents = list(nltk.pos_tag(sent) for sent in gutenberg.sents('austen-emma.txt'))
    # self.tagged_sents = list(nltk.pos_tag(sent) for sent in gutenberg.sents('quora.txt'))
    # self.tagged_sents = list(nltk.pos_tag(sent) for sent in gutenberg.sents('reddit_apple_android.txt'))
    # self.tagged_sents = list(nltk.pos_tag(sent) for sent in gutenberg.sents('hackernews.txt'))
    # extend, not append: append would insert the whole treebank list as one
    # malformed "sentence" inside the corpus
    self.tagged_sents.extend(treebank.tagged_sents())
    # self.tagged_sents.append(list(nps_chat.tagged_words()))
    # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in gutenberg.sents('austen-emma.txt')))
    # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in gutenberg.sents('chesterton-brown.txt')))
    # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in gutenberg.sents('austen-persuasion.txt')))
    # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in gutenberg.sents('austen-sense.txt')))
    # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in gutenberg.sents('reddit_apple_android.txt')))
    # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in genesis.sents('english-web.txt')))
    # self.tagged_sents.append(list(genesis.tagged_words()))
    # self.tagged_sents.append(list(snowball_data.tagged_words()))
    # print(self.tagged_sents)
    if do_markovify:
        self.model = markovify.Chain(self.tagged_sents, 2)
def test_chain_update(self):
    chain = markovify.Chain([["foo", "bar"]], state_size=1)
    # states: ('___BEGIN__',), ('foo',), ('bar',)
    assert len(chain.model.keys()) == 3
    assert "testing" not in chain.begin_choices
    chain.update([["testing", "testing"]])
    # update adds the ('testing',) state and a new begin choice
    assert len(chain.model.keys()) == 4
    assert "testing" in chain.begin_choices
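# Why the key counts in the test above hold (a sketch using real markovify
# internals): Chain.model maps state tuples to follower-count dicts, and a
# state_size=1 chain over [["foo", "bar"]] has exactly three states.
chain = markovify.Chain([["foo", "bar"]], state_size=1)
print(sorted(chain.model.keys()))
# -> [('___BEGIN__',), ('bar',), ('foo',)]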
def __init__(self, do_markovify=True): """ :param do_markovify: """ self.tagged_sents = list(brown.tagged_sents()) if do_markovify: self.model = markovify.Chain(self.tagged_sents, 2)
def slovodel_config(tmpdir):
    path = tmpdir.mkdir("sub")
    file_noun = path.join("noun.json")
    file_verb = path.join("verb.json")
    file_adjective = path.join("adjective.json")
    file_noun.write(markovify.Chain([["абвг"]], 1).to_json())
    file_verb.write(markovify.Chain([["вгде"]], 1).to_json())
    file_adjective.write(markovify.Chain([["дежз"]], 1).to_json())
    config = word_maker.Configuration(
        {
            word_maker.wordTypes.NOUN: file_noun,
            word_maker.wordTypes.VERB: file_verb,
            word_maker.wordTypes.ADJECTIVE: file_adjective,
        },
        db.Configuration("dummy", "dummy", 0, 0, None),
    )
    return config
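# Hedged sketch of reading one of the fixture files back; Chain.from_json is
# markovify's real counterpart to the to_json() calls above. The path below is
# illustrative, since the fixture writes into a pytest tmpdir.
import markovify

with open("sub/noun.json") as f:
    noun_chain = markovify.Chain.from_json(f.read())
print("".join(noun_chain.walk()))  # here just the single stored "word": абвг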
def __init__(self, input_text, state_size=2, chain=None):
    """
    input_text: A string.
    state_size: An integer, indicating the number of words in the model's state.
    chain: A trained markovify.Chain instance for this text, if pre-processed.
    """
    runs = list(self.generate_corpus(input_text))
    # Rejoined text lets us assess the novelty of generated sentences
    self.rejoined_text = self.sentence_join(map(self.word_join, runs))
    self.state_size = state_size
    self.chain = chain or markovify.Chain(runs, state_size)
def __init__(self, input_text, state_size=2, chain=None):
    """
    input_text: A list of strings representing individual comments.
    state_size: An integer indicating the number of words in the model's state.
    chain: A trained markovify.Chain instance for this text, if pre-processed.
    """
    if chain is None:
        runs = self.generate_corpus(input_text)
    self.input_text = input_text
    self.state_size = state_size
    self.chain = chain or markovify.Chain(runs, state_size)
def get_requests(self, appID, stripeSize, numStripes):
    currRequest = Request("req0", stripeSize, numStripes)
    currSize = stripeSize * numStripes
    reqList = [currRequest]
    if self.numPredictedRequests == 0:
        return reqList
    if appID not in self.appIDtoModelDict:
        self.appIDtoModelDict[appID] = {
            'mean': float(currSize),
            'std': 0.0,
            'numSamples': 1,
            'samples': [currSize]
        }
    else:
        # Incrementally fold the new sample into the running mean and variance
        self.appIDtoModelDict[appID]['numSamples'] += 1
        prevMean = self.appIDtoModelDict[appID]['mean']
        self.appIDtoModelDict[appID]['mean'] = (
            float(self.appIDtoModelDict[appID]['numSamples'] - 1) * prevMean
            + float(currSize)) / self.appIDtoModelDict[appID]['numSamples']
        prevVariance = math.pow(self.appIDtoModelDict[appID]['std'], 2)
        newVariance = (
            float(self.appIDtoModelDict[appID]['numSamples'] - 1) * prevVariance
            + (float(currSize) - prevMean)
            * (float(currSize) - self.appIDtoModelDict[appID]['mean'])
        ) / self.appIDtoModelDict[appID]['numSamples']
        self.appIDtoModelDict[appID]['std'] = math.sqrt(newVariance)
        self.appIDtoModelDict[appID]['samples'].append(currSize)
        # Keep only the most recent maxCorpusLength samples
        if len(self.appIDtoModelDict[appID]['samples']) > self.maxCorpusLength:
            self.appIDtoModelDict[appID]['samples'] = \
                self.appIDtoModelDict[appID]['samples'][1:]
    if self.appIDtoModelDict[appID]['numSamples'] >= self.minCorpusLength:
        # Train a Markov chain on the normalized sample history and predict
        # the next few request sizes
        X = [
            self.generate_training_samples(
                self.appIDtoModelDict[appID]['samples'],
                self.appIDtoModelDict[appID]['mean'],
                self.appIDtoModelDict[appID]['std'])
        ]
        model = markovify.Chain(X, self.markovOrder)
        predictions = self.get_predictions(model, X[0],
                                           self.numPredictedRequests)
        predictedStripeCounts = self.convert_predictions_to_stripe_count(
            predictions,
            self.appIDtoModelDict[appID]['mean'],
            self.appIDtoModelDict[appID]['std'],
            stripeSize)
        for i in range(len(predictedStripeCounts)):
            reqName = "req{0}".format(i + 1)
            req = Request(reqName, stripeSize, predictedStripeCounts[i])
            reqList.append(req)
    return reqList
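# Worked check of the incremental mean/variance update used above, as a
# standalone sketch: folding samples one at a time into running statistics
# reproduces the population mean and standard deviation.
import math

samples = [100, 120, 140]
mean, var, n = float(samples[0]), 0.0, 1
for x in samples[1:]:
    n += 1
    prev_mean = mean
    mean = ((n - 1) * prev_mean + x) / n
    var = ((n - 1) * var + (x - prev_mean) * (x - mean)) / n
print(mean, math.sqrt(var))  # 120.0 and ~16.33 over [100, 120, 140]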
def test_entropy(self):
    model = {
        ('___BEGIN__',): {'0': 1.0, '1': 0.0},
        ('0',): {'0': 0.5, '1': 0.5},
        ('1',): {'0': 1.0, '1': 0.0},
    }
    chain = markovify.Chain(None, state_size=1, model=model, finite=True)
    self.assertAlmostEqual(chain.entropy(('___BEGIN__',)), 0)
    self.assertAlmostEqual(chain.entropy(('0',)), 1.0)
    self.assertAlmostEqual(chain.entropy(('1',)), 0)
    # should have 10 zeros
    pw = chain.gen_entropy(10)
    assert 10 <= len([c for c in pw if c == '0']) <= 11
def test_select_most_frequent_follower_retrieves_unguessed_letter(self):
    chain = markovify.Chain([
        ['a', 'b'],
        ['a', 'b', 'a', 'b'],
        ['a', 'd'],
        ['b', 'a', 'd'],
        ['a', 'c'],
    ], state_size=1)
    guesser = SingleStateMarkovGuesser(word_length=9,
                                       potential_words=['implosion'])
    guesser.incorrect_guesses = {'b'}
    guesser.alphabet = {'c'}
    guess = guesser._select_most_frequent_follower(chain, ('a',))
    self.assertEqual(guess, 'c')
def create_chats_newlinetext(chats: List[Dict[str, Any]],
                             state_size) -> Optional[markovify.NewlineText]:
    # Collect every message as a list of words
    messages: List[List[str]] = []
    for chat in chats:
        # Find the chat name
        name: str = chat.get("name")
        if name is None:
            name = "Unknown"
        # Walk the chat's updates with a progress bar
        with click.progressbar(chat["messages"],
                               label=name,
                               length=len(chat["messages"]),
                               show_percent=True,
                               fill_char="█",
                               empty_char="░") as updates_bar:
            for update in updates_bar:
                # Skip service updates
                if update["type"] != "message":
                    continue
                # Skip senders that are bots (null?)
                if update.get("from") is None:
                    continue
                # Extract the message text from the update
                message: str = merge_message(update["text"])
                # Skip commands
                if message.startswith("/"):
                    continue
                # Split the message into words and collect them
                messages.append(message.split())
    # If the chats had no usable messages, return None
    if len(messages) == 0:
        return None
    # Build the chain from the word lists and wrap it in a NewlineText
    chain = markovify.Chain(messages, state_size=state_size)
    text = markovify.NewlineText(None, state_size=state_size, chain=chain)
    return text
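# Hedged usage sketch for create_chats_newlinetext; the chat dict mirrors a
# Telegram JSON export (assumed from the field names above), and merge_message
# is presumed to pass plain-string texts through unchanged.
chats = [{
    "name": "Example chat",
    "messages": [
        {"type": "message", "from": "alice", "text": "hello there my friend"},
        {"type": "message", "from": "bob", "text": "hello to you as well"},
    ],
}]
text = create_chats_newlinetext(chats, state_size=1)
if text is not None:
    print(text.make_sentence(tries=100))  # may be None on a corpus this tiny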
def __init__(self, input_text, state_size=2, finite=False):
    runs = self.generate_corpus(input_text)
    self.chain = markovify.Chain(list(runs), state_size, finite=finite)
def __init__(self, word_length, potential_words, *args, **kwargs):
    super().__init__(word_length, potential_words, *args, **kwargs)
    self.markov_model_2 = markovify.Chain(self.potential_word_letters,
                                          state_size=2)
def splitText(text):
    split_text = []
    for t in text:
        split_text.append("".join(t).split(" "))
    return split_text


if __name__ == "__main__":
    parser = ArgParser()
    args = parser.parse_args()
    fileNames = getFileNames(args.data_folder)
    text = splitText(loadData(args.data_folder, fileNames))
    # Create one markov chain per text for combination
    chains = [markovify.Chain([t], state_size=args.state_size) for t in text]
    # Combine all chains
    chain = markovify.combine(chains)
    # Generate stories
    stories = []
    init_state = tuple(args.init_state.split(" ")) if args.init_state else ()
    if args.init_state and len(init_state) != args.state_size:
        print("Length of init_state must be equal to state_size. "
              "Received length {} and state size {}".format(
                  len(init_state), args.state_size))
        exit(1)
    for i in range(args.num_stories):
        if init_state:
            try:
                gen = [i for i in chain.gen(init_state=init_state)]
functions_duration[event['function']] = event['duration']

functions_with_rank = [
    '{0}_{1}'.format(event['function'], event['rank']) for event in events
]

output_file = open('data/output.csv', 'w+')
output_file.write('function, rank, start_time, duration\n')
for event in events:
    output_file.write('{0}, {1}, {2}, {3}\n'.format(
        event['function'], event['rank'],
        int(event['start_time'].timestamp()) * 1000000
        + event['start_time'].microsecond,
        int(float(event['duration']) * 1000000)))
output_file.close()

model = markovify.Chain([functions_with_rank[20:-50]], PROCESS - 1)
model.compile()
tries = 0
status = []
state = ('___BEGIN__',) * (PROCESS - 1)
for i in range(LENGTH):
    # Sample the next state, rejecting the end-of-run sentinel
    while True:
        next_state = model.move(state)
        if next_state != '___END__':
            break
    status.append(next_state)
    state = tuple(state[1:]) + (next_state,)

count = 1
start_times = [0] * PROCESS
function_name_index = {}
def test_bad_corpus(self):
    with self.assertRaises(Exception) as context:
        markovify.Chain(corpus="testing, testing", state_size=2)
def analyzeURLS(log):
    flag = False
    time = 0
    split = 10
    data = []
    for line in log:
        if flag:
            newTime = parseTime(line[1])
            if newTime > split + time:
                # Gap longer than `split`: start a new browsing session
                data.append([line[2]])
            else:
                data[-1].append(line[2])
            time = newTime
        else:
            # Skip the header row
            flag = True
    return data


data = (analyzeURLS(csv.reader(open("url.log.txt")))
        + analyzeURLS(csv.reader(open("url1.log.txt"))))
model = markovify.Chain(data, 3)
path = model.walk()
code = open("arduino.txt", 'r').read()
output = open("output.ino", 'w')
urls = []
for item in path:
    urls.append("\"" + item + "\"")
output.write(code.format("{" + ", ".join(urls) + "}", len(path)))
def train_HMM(corpus):
    """Train a Markov chain on a corpus of 'sentences' (word-token lists).

    Despite the name, this builds a plain order-5 Markov chain via markovify,
    not a hidden Markov model.
    """
    MC = markovify.Chain(corpus, 5)
    return MC
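# Hedged usage sketch for train_HMM: markovify.Chain expects a list of runs
# (here, word-token lists). An order-5 chain on runs this short will mostly
# replay its input verbatim; real corpora need far more data.
corpus = [
    "the quick brown fox jumps over the lazy dog".split(),
    "the quick brown fox sleeps under the old tree".split(),
]
MC = train_HMM(corpus)
print(" ".join(MC.walk()))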
import pandas as pd
import markovify

import format_lyrics

song_data = pd.read_csv('rapper/data/songs_and_lyrics.csv',
                        encoding="ISO-8859-1")
song_data['lyrics'] = [
    format_lyrics.format_lyrics(lyric) for lyric in song_data['lyrics']
]
song_model = markovify.Chain(song_data['lyrics'], state_size=2)
with open('rapper/data/billboard_100_bigram_model.json', 'w') as model:
    model.write(song_model.to_json())
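# Hedged sketch of loading the saved model back later; Chain.from_json is the
# inverse of to_json(), and the join below assumes format_lyrics yields
# word-token lists.
with open('rapper/data/billboard_100_bigram_model.json') as f:
    loaded = markovify.Chain.from_json(f.read())
print(" ".join(loaded.walk()))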
    predicted_id = tf.math.argmax(predictions, axis=1)[-1]
    # print(encode(tf.math.argmax(predictions, axis=1), revoc))
    start = tf.expand_dims([predicted_id], 0)
    result.append(revoc[predicted_id])
    # result += encode(predicted_ids, revoc)
    return (start_string + ''.join(result))


model = build_model(len(voc), bsize)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
i, more = 150, 450
model.fit(dataset, initial_epoch=i, epochs=i + more, callbacks=[checkpoint])
sbatch_model = build_model(len(voc), 1)
sbatch_model.load_weights(tf.train.latest_checkpoint('chpts'))
sbatch_model.build(tf.TensorShape([1, None]))
print(generate_text(sbatch_model, 'заратустра сказал', 1000))

# Character-level Markov baseline over sentence-split text
corpus = list(map(lambda x: list(x + '.'), re.split(r'[\?\!\.…]+', zar)))
chain = markovify.Chain(corpus=corpus, state_size=5)
print(' '.join([''.join(chain.walk()) for _ in range(100)]))
def __init__(self, word_length, potential_words, *args, **kwargs):
    super().__init__(word_length, potential_words, *args, **kwargs)
    self.potential_word_letters = [list(word) for word in potential_words]
    self.markov_model_1 = markovify.Chain(self.potential_word_letters,
                                          state_size=1)
    self.alphabet = self._derive_alphabet(self.potential_words)
import time

import markovify
import mido
import pygame.midi

mid = mido.MidiFile('song.mid')
notes = []
# outport = mido.open_output()
for msg in mid:
    print(msg)
    if msg.type == 'note_on':
        # outport.send(msg)
        notes.append((msg.note, round(msg.time, 2)))
    if msg.type == 'note_off':
        # Overwrite the last note's duration with the note_off delta
        t = list(notes[-1])
        t[1] = round(msg.time, 2)
        notes[-1] = tuple(t)
print(notes)
# outport.close()

# (note, duration) tuples are hashable, so they work directly as chain states
text_model = markovify.Chain([notes], state_size=4)
generated = text_model.walk()
print(generated)

pygame.midi.init()
player = pygame.midi.Output(0)
player.set_instrument(0)
for note, length in generated:
    player.note_on(note, 127)
    time.sleep(length * 2)
    player.note_off(note, 127)
del player
pygame.midi.quit()
def make_models(folder):
    def count_tag_words(filename):
        """Build a {tag: {word: count}} table from a '^'-delimited file."""
        with open(join(folder, filename)) as f:
            text = f.read()
        counts = {}
        for line in text.split("\n"):
            if "^" not in line:
                continue
            tag, word = line.split("^", 1)
            if not counts.get(tag):
                counts[tag] = {}
            counts[tag][word] = counts[tag].get(word, 0) + 1
        return counts

    # Order-2 Markov model over tag-children sequences
    with open(join(folder, "tag_children.txt")) as f:
        text = f.read()
    corpus = [line.split("^") for line in text.split("\n")]
    markov_model = markovify.Chain(corpus, 2)
    with open(join(folder, "structure_markov.pickle"), 'wb') as f:
        pickle.dump(markov_model, f)

    # The remaining models are all the same tag -> word frequency table,
    # built from different source files
    for txt_name, pickle_name in [
        ("tag_words.txt", "tag_words.pickle"),
        ("tags_only.txt", "tags_only.pickle"),
        ("tags_parent_words.txt", "tags_parent_words.pickle"),
        ("tags_parent_words_lsiblings.txt", "tags_parent_words_lsiblings.pickle"),
    ]:
        with open(join(folder, pickle_name), 'wb') as f:
            pickle.dump(count_tag_words(txt_name), f)
def build_model(pose_map: Dict[str, data.Pose],
                flows: List[List[str]],
                state_size: int) -> markovify.Chain:
    if not all(yogaflo.validate_flow(pose_map, flow) for flow in flows):
        raise ValueError("Invalid flow as input")
    return markovify.Chain(flows, state_size)
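# Hedged usage sketch for build_model; the pose names are invented and the
# pose_map construction is elided, since data.Pose's shape isn't shown here.
flows = [
    ["mountain", "forward-fold", "plank"],
    ["mountain", "chair", "forward-fold"],
]
model = build_model(pose_map, flows, state_size=1)
print(model.walk())  # e.g. ['mountain', 'chair', 'forward-fold']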
def combine(a, b):
    if not a:
        m = b
    else:
        m = markovify.combine([a, b])
    return m


def fetch_comments(story):
    return [comment.get('text', '') for comment in kids(story)]


def munge_comment(comment):
    comment = re.sub('<[^>]+>', '', comment)
    comment = html.unescape(comment)
    return comment


if __name__ == "__main__":
    stories = fetch_stories('new')
    corpus = []
    for story in with_kids(stories):
        print(f"Story #{story} has comments")
        corpus += [munge_comment(c).split() for c in fetch_comments(story)]
    # corpus = [["A", "list", "of", "sentences"], ...]
    model = markovify.Chain(corpus, state_size=3)
    with open(f'hn_markov_{time.time()}.json', 'w') as f:
        f.write(model.to_json())
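# Hedged sketch of consuming the dumped model later: reload with
# Chain.from_json and sample a comment. The filename is illustrative, since
# the script timestamps its output.
with open('hn_markov_1600000000.0.json') as f:
    hn_chain = markovify.Chain.from_json(f.read())
print(" ".join(hn_chain.walk()))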