def train(input_file: str):
    df = load_data(input_file)
    cur_text = ""
    link_pattern = re.compile(r"http\S+")
    for idx in range(len(df["text"])):
        # Skip retweets entirely.
        if df["isRetweet"][idx] == "t":
            continue
        if 'http' in df["text"][idx]:
            # Strip URLs before adding the tweet to the corpus.
            sample = re.sub(link_pattern, "", df["text"][idx])
        else:
            sample = df["text"][idx].replace("\n", " ")
        cur_text += sample.replace("\n", " ")
    data_model = markovify.Text(cur_text, state_size=3)
    model_json = data_model.to_json()
    with open("model.json", 'w') as f:
        f.write(model_json)
def execute(self, text):
    # Build the model.
    if Configuration.config["activate_nlp"]:
        SpacyText.load_dict()
        text_model = SpacyText(text)
    else:
        text_model = markovify.Text(text)
    body = ""
    sentences = []
    # Create randomly-generated sentences
    for i in range(Configuration.config["sentences_to_generate"]):
        sentence = text_model.make_sentence()
        if sentence is not None and sentence not in sentences:
            body += sentence
            sentences.append(sentence)
    return body
def get_model(corpus):
    file_id = re.findall("/(.+?)_", corpus)[0]
    model_path = "models/" + file_id + "_markov_model.json"
    if not path.exists(model_path):  # if model doesn't exist, make it
        # Get raw text as string
        with open(corpus) as f:
            text = f.read()
        # Build the model
        text_model = markovify.Text(text)
        model_json = text_model.to_json()
        # json.dump/json.load require text-mode files, not binary mode.
        with open(model_path, "w") as o:  # store
            json.dump(model_json, o)
    else:
        with open(model_path, "r") as f:  # retrieve
            text_model = markovify.Text.from_json(json.load(f))
    return text_model
def random_sample_text(num_sentences=3):
    """Attempts to generate a random piece of text content."""
    text = getattr(g, 'sample_text_cached', None)
    is_decoding_needed = False
    if not text:
        url = random.choice(
            app.config['EDITABLE_SAMPLE_TEXT_SCRAPE_URLS'])
        r = requests.get(url)
        # Thanks to:
        # https://github.com/kennethreitz/requests/issues/1604 ...
        # #issuecomment-24476927
        r.encoding = 'utf-8'
        is_decoding_needed = (type(r.text).__name__ == 'unicode')
        text = (
            is_decoding_needed and r.text.encode('utf-8') or r.text)
        g.sample_text_cached = text
    # Use a Markov chain generator for random sentences based on
    # sample input (e.g. text of a book).
    # https://github.com/jsvine/markovify
    # See also:
    # http://agiliq.com/blog/2009/06/ ...
    # generating-pseudo-random-text-with-markov-chains-u/
    text_model = markovify.Text(text)
    sentences = []
    for i in range(num_sentences):
        s = text_model.make_sentence()
        if s:
            if is_decoding_needed:
                s = s.decode('utf-8')
            s = unidecode(s.strip())
            sentences.append(s)
    return '<p>{0}</p>'.format(' '.join(sentences))
def getContent(synopsCountMax):
    # Get raw text as string.
    synops = ""
    short = ""
    with open("OUTPUT/synopsis.txt") as f:
        text = f.read()
    text_model = markovify.Text(text, state_size=3)
    while not synops:
        synops = text_model.make_sentence()
    print("produced synops")
    synopsCount = len(synops)  # character count of the synopsis so far
    print(synops)
    remaining = synopsCountMax - synopsCount
    while synopsCount > synopsCountMax:
        print("too long, shortening")
        synops = text_model.make_sentence()
        if not synops:
            continue  # make_sentence can return None; try again
        synopsCount = len(synops)
        remaining = synopsCountMax - synopsCount
    while synopsCount < synopsCountMax:
        short = ""
        print("making short")
        while not short:
            short = text_model.make_short_sentence(remaining)
            print(short)
        shortCount = len(short)
        if short:
            synops += " " + short
            synopsCount = len(synops)
            remaining = synopsCountMax - synopsCount
            if synopsCount > 180:
                # Hard cap reached; force the loop to exit.
                synopsCount = synopsCountMax + 1
        else:
            synopsCount = synopsCountMax + 1
    print("--------------")
    return synops
def trainModel():
    global model
    print("Loading data ...")
    df = pd.read_csv('messages.csv', engine='python', encoding='utf8')
    df.iloc[0] = ['Temps', 'Expediteur', 'Message']
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    df['Message'] = df['Message'].str.lower()  # converts to lowercase
    dfPerso = df[df.Expediteur == EXPEDITEUR_NAME]
    dfPerso = dfPerso.dropna()  # dropna() returns a new frame; reassign it
    speeches = list(dfPerso['Message'].str.split('\n', expand=True).stack())
    print("Training model ...")
    model = markovify.Text(speeches, state_size=2)
    print("Model created")
def main():
    logging.basicConfig(level=logging.INFO)
    tag = "polityka"
    # download(tag)
    state_size = 3
    model = markovify.Text(load(tag), state_size=state_size)
    # op = model.make_sentence(tries=500, max_overlap_ratio=0.5)
    # start = random.randint(0, len(op.split()) - state_size)
    # start_state = " ".join(op.split()[start:start + state_size])
    # print(start_state)
    # comment = model.make_sentence(init_state=start_state, tries=500, max_overlap_ratio=0.5)
    print(model.make_short_sentence(max_chars=1000, min_chars=100))
    a = generate_message(model)
    print(a)
    print(len(a))
def text_to_model(tup):
    '''given an abstract, train a markov model
    the 1 will be used for weights, later'''
    _, text = tup
    try:
        # retain_original set to False to save lots of RAM
        text_model = markovify.Text(text, state_size=STATE_SIZE,
                                    retain_original=False)
        # class is not serializable, so extract json first
        # this makes a Text type object, so we coerce to str
        model_json = str(text_model.to_json())
        # TODO: change key for category
        return _, model_json
    except Exception:
        # TODO FIXME: many articles being lost due to illegal characters. see issue tracker.
        print("model skipped in text_to_model:", text[:50])
def main(): with open("prideandprejudice.txt") as f: text = f.read() text_model = markovify.Text(text) novel = '' words = 0 while (words < 50000): line = text_model.make_sentence_with_start("Mr. Darcy") line = line.split() words += len(line) for word in line: novel += ''.join(word) + ' ' novel += '\n' with open("mrdarcyandmrdarcy.txt", "w") as f: f.write(novel)
def parse_sentence(self, sentence):
    emotion_analysis = te.get_emotion(str(sentence))
    if emotion_analysis[self.opposite] > emotion_analysis[self.feeling]:
        # Re write it
        sentence = self.rewrite(sentence)
    elif emotion_analysis[self.feeling] > emotion_analysis[self.opposite]:
        # Take its words
        #self.words_markov = markovify.combine([self.words_markov,markovify.Text(str(sentence))],[1,1])
        self.sentences.append(str(sentence))
        self.markov_blob = markovify.Text(self.sentences)
        for word in sentence.words:
            #self.words.append(Word(word,te.get_emotion(word),self.feeling,self.opposite))
            self.words.add_word(word)
    #for word in word_bag:
    #    if word == self.SCREAM:
    #        power surge?
    return sentence
def get_markov_model():
    corpora_files = load_corpora()
    corpora_file_names = list(corpora_files.keys())
    markov_models = {key: markovify.Text(corpora_files[key], well_formed=False)
                     for key in corpora_file_names}
    # Random weights from 0.0 to 1.9 in steps of 0.1
    random_weights = random.sample([x * 0.1 for x in range(0, 20)],
                                   len(corpora_file_names))
    # Print out the weights of each text for the user
    print("RANDOM WEIGHTS:")
    for idx, key in enumerate(corpora_file_names):
        print(key + ": " + str(random_weights[idx]))
    print("")
    # Combine Markov chains, applying the printed weights to each model
    model_combo = markovify.combine(list(markov_models.values()), random_weights)
    model_combo = model_combo.compile()
    print(model_combo.make_sentence())
    return model_combo
def markov(filename):
    corpus = ""
    # Get raw text as string.
    with open(str(filename)) as f:  # the filename contains the normal lyrics
        text = f.read()
    for line in text.split("\n"):
        if line != "":
            if line[-1] not in "!?.;)":
                corpus += line + ". "
    # Build the model.
    text_model = markovify.Text(corpus)
    neural_lyrics = ""
    for i in range(len(text.split("\n"))):
        sentence = text_model.make_sentence()
        if sentence is None:
            # make_sentence can fail on a small corpus; skip this line
            continue
        neural_lyrics += sentence[:-1]  # drop the trailing punctuation
        neural_lyrics += "\n"
    return neural_lyrics
def respond(text):
    text_model = markovify.Text(text.lower())
    print(text.lower())
    msg = None
    try:
        msg = text_model.make_sentence_with_start("i")
    except Exception:
        # make_sentence_with_start raises if "i" never starts a sentence
        pass
    print(msg)
    if msg is None:
        msg = text_model.make_sentence()
        print(msg)
    if msg is not None:
        return "Tbh, " + msg
    else:
        return "Tbh, I have nothing interesting to say"
def generate_markov_model(songs, model_state_size):
    """
    Generates a harmonic markov model based off of all the hooktheory songs given

    Args:
        songs (list of HKTObject): A list of HKTObjects representing the corpus
        model_state_size (int): The state size that should be used for the model

    Returns:
        A markov model that represents the entire harmonic corpus given
    """
    markovs = []
    for song in songs:
        # For each song the harmonic progression is put into a string
        # Each chord is separated by a space
        text = ""
        for segment in song.segments:
            #print(segment)
            for chord in segment.chordsNoRest:
                #print(chord)
                text += chord.roman_basic + " "
                #print(chord.roman_basic)
        # Weird conversion here
        text = str(text)
        # We create a separate model for every song and put them into a list
        if text != '':
            #print("Text: "+text)
            model = markovify.Text(text, state_size=model_state_size)
            markovs.append(model)
    # Then we combine all the models in the list
    #print(markovs)
    combo = markovify.combine(markovs)
    return combo
def respond(interview_question):
    global index
    global transcript
    answer = ''
    # get question words
    wordlist = interview_question.split()
    # check if actually a question
    if not wordlist[-1].endswith('?') and wordlist[-1] != '?':
        return 'Please, let\'s stick to questions only.'
    # check if already asked
    if len(transcript) >= 1:
        for i in transcript:
            if transcript[i]['question'].lower() == interview_question.lower():
                return interviewer + ', please. You already asked me that.'
    # check if common question
    common = is_common_question(interview_question)
    if type(common) is str:
        return common
    # Build the model.
    text_model = markovify.Text(text)
    # generate response; make_sentence can return None, so guard against it
    for i in range(random.randint(1, 5)):
        sentence = text_model.make_sentence()
        if sentence:
            answer += sentence
    # save question + response
    # transcript[str(index)]['answer'] = answer
    transcript.update(
        {index: {
            'question': interview_question,
            'response': answer
        }})
    # increment index + return response
    index += 1
    return answer
async def user_markov_response(message):
    user = message.mentions[0]
    logger.info("found user {} for bottalk command".format(user))
    sentences = u""
    random_dt = random_date(message.channel)
    async for log in message.channel.history(limit=2000, after=random_dt):
        if log.author == user:
            sentences += log.clean_content + "\n"
    if len(sentences) == 0:
        await message.channel.send("I got nothing 🤷")
        return
    try:
        text_model = markovify.Text(sentences, well_formed=False)
        s = text_model.make_short_sentence(300, tries=50)
        if not s or len(s) < 1:
            s = "My apologies, I cannot quite grasp the essence of that user."
        await message.channel.send(s)
    except Exception as e:
        logger.error("Shat self: {}".format(e))
        await message.channel.send("Sorry, I've just gone and shat myself.")
def generate_anek(self):
    corpus = ""
    try:
        aneks = session.query(Anek)
        for anek in aneks:
            corpus += f"{anek.text}\n"
        model = markovify.Text(corpus)
        anek = ""
        for i in range(randint(2, 6)):
            sentence = model.make_short_sentence(500)
            if sentence:  # make_short_sentence can return None
                anek += sentence + " "
        anek = anek.replace("–", "\n–")
        anek = anek.replace("- ", "\n- ")
        anek = anek.replace("—", "\n—")
        return anek
    except Exception as e:
        with open("log", "a") as log:
            log.write(
                f'[{time.ctime()}] [DeepAneks] Generation has failed. Exception {e.__class__} caught: retrying in 30 seconds...\n'
            )
        time.sleep(30)
def gen_markov(f=None, u=None):
    if f:
        with open(f) as f:
            print(Fore.GREEN + '[+] Parsing text file.')
            text = f.read()
    elif u:
        print(Fore.GREEN + '[+] Parsing specified URL.')
        r = requests.get(u)
        text = get_text(r.text)
    try:
        print(Fore.GREEN + '[+] Generating a model.')
        model = markovify.Text(text)
        print(Fore.GREEN + '[+] Attempting to generate a Markov chain.')
        markov_text = model.make_short_sentence(280)
        return markov_text
    except UnboundLocalError:
        print(Fore.RED + '[!] Fatal error. Aborting process.')
def createText(nrSentences=10, nrWords=150):
    text_model2 = None
    for filename in glob.glob(os.path.join('DataSet', '*.txt')):
        with open(filename, encoding='utf8') as f:
            print(filename)
            try:
                text = f.read()
            except UnicodeDecodeError:
                continue
            text_model = markovify.Text(text)
            if text_model2:
                text_model2 = markovify.combine([text_model, text_model2])
            else:
                text_model2 = text_model
    for i in range(nrSentences):
        # make_short_sentence limits length in characters, not words
        result = text_model2.make_short_sentence(nrWords)
        if result:
            print(result)
            outp.insert(INSERT, result)
def main():
    parser = argparse.ArgumentParser(
        description='generate a markov chain model from a corpus of text')
    parser.add_argument('--corpus', '-c', help='corpus file path',
                        required=True)
    parser.add_argument('--output', '-o', help='output model path',
                        required=True)
    args = parser.parse_args()
    with open(args.corpus, 'r') as fp:
        corpus = fp.read()
    model = markovify.Text(corpus)
    with open(args.output, 'w') as fp:
        fp.write(model.to_json())
def build_poems(scope_map: Dict[PoemType, Scope], poem_type: PoemType):
    scope = scope_map[poem_type]
    logger.info(f'Fetching text blob for "{poem_type.value}" scope.')
    response = requests.get(rabbit_text_endpoint,
                            params={
                                'from': format_date(scope.fr),
                                'until': format_date(scope.un)
                            },
                            timeout=15)
    try:
        response.raise_for_status()
    except HTTPError as e:
        logger.warning(e)
        return
    if not response.text:
        logger.warning(f'Text blob empty for "{poem_type.value}", skipping.')
        return
    logger.info(f'Generating markov chain with "{poem_type.value}" scope.')
    text_model = markovify.Text(response.text)
    for i in range(10):
        poem = Poem([text_model.make_sentence() for _ in range(5)],
                    datetime.utcnow().replace(tzinfo=timezone.utc))
        logger.debug(json.dumps(poem.json(), indent=2))
        response = requests.post(rabbit_poem_endpoint,
                                 json=poem.json(),
                                 headers={'X-Api-Key': rabbit_api_key},
                                 params={'scope': poem_type.value},
                                 timeout=15)
        try:
            response.raise_for_status()
        except HTTPError as e:
            logger.warning(e)
def mk_trainer(bot_data="training", bot_model="bot_1"): mk_model_delete(bot_model) # This determines the line count of all the training files in the training folder. file_count_total = 0 training_file_names = [] for filename in os.listdir(f'{dataset_dir}/{bot_data}'): if filename.endswith(".txt"): training_file_names.append(filename) file_count_total += 1 else: continue if not training_file_names: raise Exception( f"No files in the {dataset_dir}/{bot_data} folder, please add .txt files with line-by-line conversations" ) with tqdm(total=file_count_total, postfix=text_color("Training MK Model", BIPur), leave=True, ascii=load_bar_mode, colour=load_bar_colour, dynamic_ncols=True) as progress_bar: combined_model = None for filename in training_file_names: with open(f'{dataset_dir}/{bot_data}/{filename}', encoding="ISO-8859-1") as f: model = markovify.Text(f, retain_original=False) if combined_model: combined_model = markovify.combine( models=[combined_model, model]) else: combined_model = model progress_bar.update(1) model_json = combined_model.to_json() with open(f'{models_dir}/{bot_model}/model.json', 'w') as outfile: json.dump(model_json, outfile)
def load_markov_model(fp, col, kind='list'):
    df = pd.read_excel(fp)
    # filter these
    df = df[df['related'] != 'Error']
    corpus = df[col].tolist()
    if kind == 'list':
        # each cell holds a stringified list of lines; eval() restores it
        line_level = '\n'.join(
            ['\n'.join(eval(article)) for article in corpus])
        article_level = '\n'.join(
            [' '.join(eval(article)) for article in corpus])
        line_level_model = mk.NewlineText(line_level, 4)
        recipe = [
            str(i) + ': ' + line_level_model.make_sentence()
            for i in range(1, 6)
        ]
        print('\n'.join(recipe))
    else:
        corpus = df[col].tolist()
        line_level = '\n'.join(corpus)
        article_level = '\n'.join(corpus)
        line_level_model = mk.Text(line_level, 4)
    with open('corpus\\' + col + ' - line level.txt', 'w') as fo:
        fo.write(line_level)
    with open('corpus\\' + col + ' - article level.txt', 'w') as fo:
        fo.write(article_level)
    article_level_model = mk.NewlineText(article_level, 4)
    print()
    print(line_level_model.make_sentence())
    print(line_level_model.make_sentence())
    print(line_level_model.make_sentence())
    print(line_level_model.make_sentence())
    print('_' * 20)
    print(article_level_model.make_sentence())
    print(article_level_model.make_sentence())
    print(article_level_model.make_sentence())
    print(article_level_model.make_sentence())
    print(article_level_model.make_sentence())
    return line_level_model, article_level_model
async def babelli(ctx, arg, arg2):
    msg = ""
    key = arg
    url = "https://oaflopean.pythonanywhere.com/?key=" + key
    data = requests.post(url, auth=('oaflopean', 'babellibot'))
    text_model = markovify.Text(data.content.decode("utf-8"))
    for i in range(int(arg2)):
        try:
            msg = msg + " " + text_model.make_sentence() + " "
        except TypeError:
            # make_sentence returned None; skip this iteration
            continue
    print(data)
    if len(msg) == 0:
        await ctx.send("Sorry! Try more options.")
    else:
        # Split the message into chunks that fit under Discord's 2000-character limit.
        chunks, chunk_size = len(msg), len(msg) / (len(msg) / 1995)
        chunk_list = [
            msg[i:i + int(chunk_size)]
            for i in range(0, chunks, int(chunk_size))
        ]
        for msg_pt in chunk_list:
            await ctx.send(msg_pt)
        book = Books()
        book.title = arg + " " + str(arg2)
        book.author = ctx.message.author.name
        book.description = msg
        s = "abcdefghijklmnopqrstuvwxyz"
        passlen = 12
        book.uri = "".join(random.sample(s, passlen))
        book.reddit_url = "http://oaflopean.pythonanywhere.com/?key=" + book.uri
        post = RedditPost(uri=book.uri,
                          reddit_url=book.reddit_url,
                          title=book.title,
                          body=book.description,
                          username=book.username)
        db.session.add(post)
        db.session.commit()
        db.session.add(book)
        db.session.commit()
        await ctx.send(book.reddit_url)
def createPoem(file):
    """
    Using the Markovify module to rearrange the poem into a new poem
    :param file: txt file
    :param type: object
    :return: newPoem
    """
    with open(file) as f:
        text = f.read()
    text_model = markovify.Text(text, state_size=1)
    newtext = text_model.make_sentence()
    if newtext is None:
        # make_sentence can fail on a short corpus
        newtext = ""
    with open('/Users/hakeem/Desktop/LoremIpsumGen/app/results/results.txt', 'a') as f:
        newPoem = f.write(newtext + '\n')
    return newPoem
def make_padding(self):
    if self.dynamic:
        f = open(self.corpus, 'r')
        text = markovify.Text(f)
        self.logger.info('generating dynamic padding from corpus')
        pad = '<p style="font-size: 0px">'
        for i in range(1, 50):
            temp = text.make_sentence()
            if temp is not None:
                pad += ' ' + temp
                if i % 5 == 0:
                    pad += ' </br>'
            else:
                pad += ' </br>'
        pad += ' </p>'
        self.logger.info('dynamic padding generated successfully')
        f.close()
    else:
        self.logger.warning('message created using static padding')
        pad = STATIC_PADDING
    return pad
def test_main():
    """
    Basic functional test
    """
    assert markov_novel
    path = 'tmp'
    os.makedirs(path)
    os.chdir(path)
    # Get raw text as string.
    from os.path import dirname, abspath
    filename = os.path.join(
        dirname(dirname(abspath(__file__))), 'tests/futuristmanifest.txt')
    with open(filename) as f:
        text = f.read()
    # Build the model.
    text_model = markovify.Text(text)
    novel = markov_novel.Novel(text_model, chapter_count=1)
    novel.write(novel_title='my-novel', filetype='md')
    assert os.path.exists(os.path.join(os.getcwd(), 'my-novel.md'))
    os.chdir(os.pardir)
    shutil.rmtree('tmp', ignore_errors=True)
def tootmarkov(bot, trigger):
    keys = open(os.getcwd() + "/SECRET_SAUCE/masto.txt", "r")
    client_id = keys.readline().rstrip()
    client_secret = keys.readline().rstrip()
    access_token = keys.readline().rstrip()
    api_base_url = keys.readline().rstrip()
    mastodon = Mastodon(client_id, client_secret, access_token, api_base_url)
    keys.close()
    # Get raw text as string.
    f = open(os.getcwd() + "/all_of_bgtopics.txt", "r")
    text = f.read()
    f.close()
    # Build the model.
    text_model = markovify.Text(text)
    mytoot = text_model.make_short_sentence(140)
    output = mytoot + "\n\n[Generated by TootMarkov]"
    mastodon.toot(output)
    bot.say("I tooted: " + mytoot)
def markov_chain():
    # Get raw text as string.
    with open("./data/corpora/text_1.txt", encoding='utf8') as f:
        text = f.read()
    # Build the model.
    text_model = markovify.Text(text)
    # Print five randomly-generated sentences
    print('\n\n---Print five randomly-generated sentences---')
    for i in range(5):
        print()
        print(text_model.make_sentence())
    # Print three randomly-generated sentences of no more than 280 characters
    print(
        '\n\n---Print three randomly-generated sentences of no more than 280 characters---'
    )
    for i in range(3):
        print()
        print(text_model.make_short_sentence(280))
def combine_all_files(ctx):
    import markovify
    state_size = ctx.config.get('state_size', 2)
    files = os.listdir(path="texts")
    with open("output_corpus.txt", 'w') as f:
        f.write('\n')
    files = files[1:-1]
    for file in files:
        with open(f"texts/{file}") as input_file:
            text = input_file.read()
        try:
            # Validate that markovify can build a model from this text before appending it.
            markovify.Text(text, state_size=state_size)
            with open("output_corpus.txt", "a") as output_corpus:
                output_corpus.write('\n' + text)
            logging.info('Added...')
        except Exception as e:
            logging.error(e)