Ejemplos de tokenize en Python, ejemplos de markov.tokenize en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: tests.py Proyecto: petermlm/Markov

 def test_basic(self):
     # Each pair is (raw input, token list expected from markov.tokenize).
     # The cases cover a single word, surrounding whitespace, mixed case,
     # and inputs with two and three words.
     cases = (
         ('simple', ['simple']),
         ('  simple  ', ['simple']),
         ('SiMpLe', ['simple']),
         ('two words', ['two', 'words']),
         ('one more word', ['one', 'more', 'word']),
     )
     for raw, expected in cases:
         self.assertEqual(markov.tokenize(raw), expected)

Ejemplo n.º 2

0

Mostrar archivo

def makeGuide():
    """Generate a titled, numbered guide from a Markov model of fetched articles.

    Reads the module-level settings ``articles``, ``order``, ``steps``,
    ``steps_sen``, ``titles_rec_size``, ``titles_min_words``,
    ``steps_rec_size`` and ``steps_min_words``.

    Returns:
        A 2-tuple ``(text, artTitles)``: ``text`` is the generated guide after
        being passed through ``pull_data.remove_urls``; ``artTitles`` is the
        list of ``item[1]`` values of every source item tagged
        ``pull_data.textType.TITLE``.

    Raises:
        IndexError: from ``random.choice`` when a needed list of start words
            is empty.
    """
    sourceList = pull_data.get_n_articles(articles)
    titleStarts, stepStarts = pull_data.findStartingWordsAndUrls(
        sourceList, order)
    tokens = markov.tokenize(pull_data.make_plaintext(sourceList))
    weights = markov.makeWeights(tokens, order)

    artTitles = [
        item[1] for item in sourceList
        if item[0] == pull_data.textType.TITLE
    ]

    # Title line: a single generated sentence whose last character is
    # replaced by a colon.
    title = markov.formatOutput(
        markov.makeText(weights, random.choice(titleStarts), 1, True, order,
                        titles_rec_size, titles_min_words))
    parts = [title[:-1] + ":\n"]

    # Numbered steps, one per line, counted from 1.
    for number in range(1, steps + 1):
        step = markov.formatOutput(
            markov.makeText(weights, random.choice(stepStarts), steps_sen,
                            True, order, steps_rec_size, steps_min_words))
        parts.append(str(number) + ": " + step + "\n")
    parts.append('\n')

    return pull_data.remove_urls(''.join(parts)), artTitles

Ejemplo n.º 3

0

Mostrar archivo

def findStartingWordsAndUrls(textList, ord=1):
    """Collect starting-word tuples per text type, plus all URLs.

    Args:
        textList: iterable of items where ``item[0]`` is a ``textType`` value
            and ``item[1]`` is the associated text (or the URL itself for
            ``textType.URL`` items).
        ord: number of leading tokens that make up one start tuple. Values
            of zero or less yield empty tuples.

    Returns:
        A 6-tuple ``(titleStarts, abstractStarts, methodStarts,
        summaryStarts, textStarts, urls)``. The first five are lists of
        tuples holding the first ``ord`` tokens of each matching item;
        ``urls`` is the list of raw URL strings. Items whose text has fewer
        than ``ord`` tokens are skipped.
    """
    titleStarts = []
    abstractStarts = []
    methodStarts = []
    summaryStarts = []
    textStarts = []
    urls = []
    for item in textList:
        kind, content = item[0], item[1]
        # URLs are stored verbatim; check first so they are never tokenized.
        if kind == textType.URL:
            urls.append(content)
            continue
        splitText = markov.tokenize(content)
        # Skip over very short starting phrases.
        if len(splitText) < ord:
            continue
        # Build the tuple unconditionally so it can never be unbound or left
        # over from a previous item (which happened when ord <= 0).
        startWords = tuple(splitText[:max(ord, 0)])
        if kind == textType.TITLE:
            titleStarts.append(startWords)
        elif kind == textType.ABSTRACT:
            abstractStarts.append(startWords)
        elif kind == textType.METHOD:
            methodStarts.append(startWords)
        elif kind == textType.SUMMARY:
            summaryStarts.append(startWords)
        elif kind == textType.TEXT:
            textStarts.append(startWords)
    return titleStarts, abstractStarts, methodStarts, summaryStarts, textStarts, urls

Ejemplo n.º 4

0

Mostrar archivo

def makeGuide():
    """Generate titles, abstracts, methods, summaries and steps from a Markov model.

    Reads the module-level settings ``articles`` and ``order``, and for each
    section (``titles``, ``abstracts``, ``methods``, ``summaries``,
    ``steps``) its count plus the matching ``*_sen``, ``*_rec_size`` and
    ``*_min_words`` values.

    Returns:
        A 3-tuple ``(text, urls, artTitles)``: ``text`` is the generated
        output with one passage per line (a blank line follows the methods
        section and another follows the summaries section); ``urls`` is the
        URL list returned by ``pull_data.findStartingWordsAndUrls``;
        ``artTitles`` is the list of ``item[1]`` values of every source item
        tagged ``pull_data.textType.TITLE``.

    Raises:
        IndexError: from ``random.choice`` when a section with a non-zero
            count has no start words.
    """
    sourceList = pull_data.get_n_articles(articles)
    tStarts, aStarts, mStarts, sStarts, xStarts, urls = pull_data.findStartingWordsAndUrls(
        sourceList, order)
    tokens = markov.tokenize(pull_data.make_plaintext(sourceList))
    weights = markov.makeWeights(tokens, order)

    artTitles = [
        item[1] for item in sourceList
        if item[0] == pull_data.textType.TITLE
    ]

    def _section(count, starts, sentences, rec_size, min_words):
        """Return ``count`` newline-terminated passages seeded from ``starts``."""
        return [
            markov.formatOutput(
                markov.makeText(weights, random.choice(starts), sentences,
                                True, order, rec_size, min_words)) + "\n"
            for _ in range(count)
        ]

    # Sections are generated in the same order as they appear in the output.
    parts = []
    parts += _section(titles, tStarts, titles_sen,
                      titles_rec_size, titles_min_words)
    parts += _section(abstracts, aStarts, abstracts_sen,
                      abstracts_rec_size, abstracts_min_words)
    parts += _section(methods, mStarts, methods_sen,
                      methods_rec_size, methods_min_words)
    parts.append('\n')
    parts += _section(summaries, sStarts, summaries_sen,
                      summaries_rec_size, summaries_min_words)
    parts.append('\n')
    parts += _section(steps, xStarts, steps_sen,
                      steps_rec_size, steps_min_words)

    return ''.join(parts), urls, artTitles

Ejemplo n.º 5

0

Mostrar archivo

def findStartingWordsAndUrls(textList, ord=1):
    """Collect starting-word tuples for title and text items.

    Args:
        textList: iterable of items where ``item[0]`` is a ``textType`` value
            and ``item[1]`` is the associated text.
        ord: number of leading tokens that make up one start tuple. Values
            of zero or less yield empty tuples.

    Returns:
        A 2-tuple ``(titleStarts, textStarts)`` of lists of tuples, each
        holding the first ``ord`` tokens of a matching item. Items whose
        text has fewer than ``ord`` tokens are skipped, and items of any
        other type are ignored.
    """
    titleStarts = []
    textStarts = []
    for item in textList:
        splitText = markov.tokenize(item[1])
        # Skip over very short starting phrases.
        if len(splitText) < ord:
            continue
        # Build the tuple unconditionally so it can never be unbound or left
        # over from a previous item (which happened when ord <= 0).
        startWords = tuple(splitText[:max(ord, 0)])
        if item[0] == textType.TITLE:
            titleStarts.append(startWords)
        elif item[0] == textType.TEXT:
            textStarts.append(startWords)
    return titleStarts, textStarts

Ejemplo n.º 6

0

Mostrar archivo

Archivo: app.py Proyecto: PhyllisWong/NotreDameTweetGen

# import sys
import markov
from flask import Flask, render_template

# Flask application object; the route below is registered on it.
app = Flask(__name__)

# The corpus is processed once, at import time, and the resulting token list
# is reused by every call to the view.
source_text = "corpus.txt"
# NOTE(review): presumably markov.cleanup reads and normalizes the file at
# this path -- confirm against the markov module.
cleaned_text = markov.cleanup(source_text)
text_list = markov.tokenize(cleaned_text)
# TODO: create a data structure to hold random_sentences

@app.route('/', methods=['GET', 'POST'])
def index():
    """Render ``index.html`` with a sentence obtained from ``markov.main``."""
    # Ideas noted by the author for a later revision:
    #   - rename this variable to random_sentences,
    #   - push each sentence into a shared data structure,
    #   - pick a random index from that structure and show that sentence.
    sentence = markov.main(text_list)

    '''NOTE: once your page has been loaded 1000's of times, your data structure
    of going to be huge. Time complexity is still O(1), but it could start to take
    up lots of space. If it gets to that, consider migrating your data to a
    database, and doing a fetch <~~~ don't optimize for this until you need to.
    '''
    return render_template('index.html', final_sentence=sentence)

if __name__ == '__main__':
    # Start Flask's development server. Calling the view function index()
    # directly would run render_template outside an application/request
    # context and fail instead of serving the page.
    app.run()

Ejemplo n.º 7

0

Mostrar archivo

Archivo: tests.py Proyecto: petermlm/Markov

 def test_ignored(self):
     # A character from markov.ignored_chars placed between two words is
     # expected to separate them without appearing in the token list.
     expected_tokens = ['abc', 'xyz']
     for ignored in markov.ignored_chars:
         tokens = markov.tokenize('abc{}xyz'.format(ignored))
         self.assertEqual(tokens, expected_tokens)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: tests.py Proyecto: petermlm/Markov

 def test_special(self):
     # A character from markov.special_chars placed between two words is
     # expected to come back as its own token between them.
     for special in markov.special_chars:
         tokens = markov.tokenize('abc{}xyz'.format(special))
         self.assertEqual(tokens, ['abc', special, 'xyz'])