def test_basic(self):
    """tokenize lowercases input, trims surrounding whitespace, and splits on spaces."""
    cases = [
        ('simple', ['simple']),
        (' simple ', ['simple']),
        ('SiMpLe', ['simple']),
        ('two words', ['two', 'words']),
        ('one more word', ['one', 'more', 'word']),
    ]
    for raw, expected in cases:
        self.assertEqual(markov.tokenize(raw), expected)
def makeGuide():
    """Generate a fake how-to guide from n scraped articles.

    Builds a markov model over the pulled corpus, emits one title line
    (terminated with ":") followed by `steps` numbered step sentences.

    Returns:
        tuple: (guide_text_with_urls_removed, list_of_source_article_titles)
    """
    sourceList = pull_data.get_n_articles(articles)
    titleStarts, stepStarts = pull_data.findStartingWordsAndUrls(
        sourceList, order)
    plaintext = pull_data.make_plaintext(sourceList)
    tokens = markov.tokenize(plaintext)
    weights = markov.makeWeights(tokens, order)

    # Keep the real titles of the source articles for attribution.
    artTitles = [item[1] for item in sourceList
                 if item[0] == pull_data.textType.TITLE]

    # Title: a single generated sentence; swap its final character
    # (the sentence terminator) for ":" so it reads as a heading.
    startWords = random.choice(titleStarts)
    text = markov.formatOutput(
        markov.makeText(weights, startWords, 1, True, order,
                        titles_rec_size, titles_min_words))
    text = text[:-1] + ":\n"

    # Numbered steps.
    for i in range(steps):
        startWords = random.choice(stepStarts)
        text += str(i + 1) + ": " + markov.formatOutput(
            markov.makeText(weights, startWords, steps_sen, True, order,
                            steps_rec_size, steps_min_words)) + "\n"
    text += '\n'
    return pull_data.remove_urls(text), artTitles
def findStartingWordsAndUrls(textList, ord=1):
    """Collect the first `ord` tokens of each entry, bucketed by entry type.

    URL entries are gathered separately; entries shorter than `ord`
    tokens are skipped.

    Returns:
        tuple: (titleStarts, abstractStarts, methodStarts,
                summaryStarts, textStarts, urls)
    """
    buckets = {
        textType.TITLE: [],
        textType.ABSTRACT: [],
        textType.METHOD: [],
        textType.SUMMARY: [],
        textType.TEXT: [],
    }
    urls = []
    for item in textList:
        words = markov.tokenize(item[1])
        # URLs are collected verbatim, not tokenized into start phrases.
        if item[0] == textType.URL:
            urls.append(item[1])
            continue
        # Skip entries too short to supply a full starting phrase.
        if len(words) < ord:
            continue
        bucket = buckets.get(item[0])
        if bucket is not None:
            bucket.append(tuple(words[:ord]))
    return (buckets[textType.TITLE], buckets[textType.ABSTRACT],
            buckets[textType.METHOD], buckets[textType.SUMMARY],
            buckets[textType.TEXT], urls)
def makeGuide():
    """Generate a full fake article: titles, abstracts, methods, summaries, steps.

    Builds one markov model over the pulled corpus and renders each
    section with its own start-phrase pool and generation parameters.

    Returns:
        tuple: (generated_text, source_urls, list_of_source_article_titles)
    """
    sourceList = pull_data.get_n_articles(articles)
    tStarts, aStarts, mStarts, sStarts, xStarts, urls = pull_data.findStartingWordsAndUrls(
        sourceList, order)
    plaintext = pull_data.make_plaintext(sourceList)
    weights = markov.makeWeights(markov.tokenize(plaintext), order)

    # Keep the real titles of the source articles for attribution.
    artTitles = [item[1] for item in sourceList
                 if item[0] == pull_data.textType.TITLE]

    text = ''
    text += _makeSection(weights, tStarts, titles, titles_sen,
                         titles_rec_size, titles_min_words)
    text += _makeSection(weights, aStarts, abstracts, abstracts_sen,
                         abstracts_rec_size, abstracts_min_words)
    # Methods and summaries each get a blank separator line after them.
    text += _makeSection(weights, mStarts, methods, methods_sen,
                         methods_rec_size, methods_min_words) + '\n'
    text += _makeSection(weights, sStarts, summaries, summaries_sen,
                         summaries_rec_size, summaries_min_words) + '\n'
    text += _makeSection(weights, xStarts, steps, steps_sen,
                         steps_rec_size, steps_min_words)
    return text, urls, artTitles


def _makeSection(weights, starts, count, sentences, rec_size, min_words):
    """Render `count` generated passages, each newline-terminated.

    Each passage begins from a randomly chosen start phrase out of
    `starts` and uses the module-level markov `order`.
    """
    parts = []
    for _ in range(count):
        startWords = random.choice(starts)
        parts.append(markov.formatOutput(
            markov.makeText(weights, startWords, sentences, True, order,
                            rec_size, min_words)) + "\n")
    return ''.join(parts)
def findStartingWordsAndUrls(textList, ord=1):
    """Collect the first `ord` tokens of each TITLE and TEXT entry.

    Entries shorter than `ord` tokens are skipped.

    Returns:
        tuple: (titleStarts, textStarts) — lists of start-phrase tuples.
    """
    titleStarts = []
    textStarts = []
    for item in textList:
        words = markov.tokenize(item[1])
        # Skip entries too short to supply a full starting phrase.
        if len(words) < ord:
            continue
        phrase = tuple(words[:ord])
        if item[0] == textType.TITLE:
            titleStarts.append(phrase)
        elif item[0] == textType.TEXT:
            textStarts.append(phrase)
    return titleStarts, textStarts
import markov

from flask import Flask, render_template

app = Flask(__name__)

# Model setup happens once at import time so every request reuses it.
source_text = "corpus.txt"
cleaned_text = markov.cleanup(source_text)
text_list = markov.tokenize(cleaned_text)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the index page with one freshly generated markov sentence."""
    final_sentence = markov.main(text_list)
    # NOTE: if generated sentences are ever cached in an in-memory
    # structure per request, it grows without bound over many page loads;
    # migrate to a database only if that actually becomes a problem.
    return render_template('index.html', final_sentence=final_sentence)


if __name__ == '__main__':
    # BUG FIX: the script previously called index() directly, which raises
    # RuntimeError because render_template needs an active Flask
    # request/application context. Start the development server instead.
    app.run()
def test_ignored(self):
    """Each ignored character splits the text and is dropped from the output."""
    for ch in markov.ignored_chars:
        sample = 'abc' + ch + 'xyz'
        self.assertEqual(markov.tokenize(sample), ['abc', 'xyz'])
def test_special(self):
    """Each special character splits the text and survives as its own token."""
    for ch in markov.special_chars:
        sample = 'abc' + ch + 'xyz'
        self.assertEqual(markov.tokenize(sample), ['abc', ch, 'xyz'])