Example #1
def w_count(text):
    """
    Raw word counts for the full document.
    """
    tokens = tm.tokenize(text, casefold=True)
    # count each distinct token; set() avoids recounting duplicates
    output = {token: tokens.count(token) for token in set(tokens)}
    return output
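# The per-token .count() call above rescans the token list for every distinct
# word; collections.Counter gives the same raw counts in a single pass.
# A minimal alternative sketch (assumes the same tm.tokenize as above):
from collections import Counter

def w_count_fast(text):
    """Raw word counts for the full document, computed in a single pass."""
    tokens = tm.tokenize(text, casefold=True)
    return dict(Counter(tokens))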
Example #2
import re

def get_para(filepath):
    # header and file read reconstructed; the original snippet starts mid-function
    # and is called below as get_para(filepath)
    paragraphs = []
    with open(filepath, 'r') as f:
        text = f.read()
        for s in text.split('\n\n'):
            if s:
                paragraph = s  #.lower()
                #paragraph = re.sub(r'\W',' ',paragraph)
                #paragraph = re.sub(r'\d',' ',paragraph)
                paragraph = re.sub(r'[^A-Za-z]', ' ', paragraph)  # keep letters only
                paragraph = re.sub(r' +', ' ', paragraph)  # collapse repeated spaces
                paragraphs.append(paragraph.rstrip())
    return paragraphs


paragraphs = get_para(filepath)
print paragraphs[0]

### parts of speech tagging
from nltk import pos_tag  # assumed: pos_tag with the universal tagset comes from NLTK
print pos_tag(tm.tokenize(paragraphs[100]), tagset='universal', lang='eng')

# monster tokenizer that keeps only tokens of a specific part of speech (nouns)
para_token = []
for i, paragraph in enumerate(paragraphs):
    print i  # progress indicator
    tokens = tm.tokenize(paragraph, length=1, casefold=False)  # keep case for tagging
    tagset = pos_tag(tokens, tagset='universal', lang='eng')
    tokens = [tag[0] for tag in tagset if tag[1] in ['NOUN']]  # retain nouns only
    tokens = [token.lower() for token in tokens]
    para_token.append(tokens)

print para_token[100]
# generate stopword list from text
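# A minimal frequency-based stoplist sketch for the noun tokens above;
# tm.gen_ls_stoplist (used in Example #3) serves the same purpose, though its
# exact behaviour may differ:
from collections import Counter

def gen_stoplist(tokenized_docs, n=100):
    """Return the n most frequent tokens across all tokenized documents."""
    counts = Counter(token for doc in tokenized_docs for token in doc)
    return [token for token, _ in counts.most_common(n)]

sw_paragraphs = gen_stoplist(para_token, 100)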
Example #3
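# sermons (a list of raw sermon strings) and date_reg are defined outside this
# snippet; a hypothetical definition, assuming 8-digit YYYYMMDD date stamps in
# the text, could look like this:
import re
date_reg = re.compile(r'\d{8}')  # hypothetical pattern: one YYYYMMDD stamp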
dates = []
for sermon in sermons:
    date = date_reg.findall(sermon)  # all date stamps found in this sermon
    dates.append(date)

month = []
for date in dates:
    try:
        month.append(date[0][4:6])  # with a YYYYMMDD stamp, characters 4:6 are the month
    except IndexError:  # no date stamp found for this sermon
        month.append('FALSE')

# Tokenize the list of sermons with tm.tokenize; we lowercase everything because we are not interested in all-caps.

tokenized_sermons = []
for sermon in sermons:
    tokenized_sermons.append(tm.tokenize(sermon.lower()))
    
#Use pruning to remove unwanted words:
prune = tm.prune_multi(tokenized_sermons, 50, 500)
    
#Alternatively you can use a stopword-list
#Create and apply stopword-list
sw = tm.gen_ls_stoplist(tokenized_sermons, 250)  # how many words we want to delete
sermons_nosw = []
for sermon in tokenized_sermons:
    # keep only the tokens that are not in the stopword list
    nosw_sermon = [token for token in sermon if token not in sw]
    sermons_nosw.append(nosw_sermon)  # add the cleaned sermon to sermons_nosw
    

Example #4
###

#### Section 1.2 #### 

df = pd.read_csv('fake_or_real_news_cleaned_sent.csv', encoding = 'utf-8')
print df.label.value_counts() #balanced dataset (approx 3000 of each)
print df.loc[1]


# MAKING A TOPIC MODEL DATAFRAME
# define a working df - change this when we want to work with all of the texts
tp_df = df
# insert the tokenized articles into a list
texts_tokenized = []
for text in tp_df['text_clean']:
    tokens = tm1.tokenize(text, length=1, casefold=False)  # casefold=False because pos_tag uses capitalization to categorize tokens
    tagset = pos_tag(tokens, tagset='universal', lang='eng')  # tag tokens with their word class
    tokens = [tag[0] for tag in tagset if tag[1] in ['NOUN']]  # only retain nouns
    tokens = [token.lower() for token in tokens]  # lowercase the tokens
    texts_tokenized.append(tokens)
print type(texts_tokenized[0][0])  # a single word (string)
print type(texts_tokenized[0])  # list of words in one text
print type(texts_tokenized)  # list of texts
# So it is a string within a list within a list: the outer list holds the texts,
# each inner list holds the nouns of one text, and each string is a noun
# make a stopword list
sw = tm1.gen_ls_stoplist(texts_tokenized, 40)
print sw  # these stopwords may say more about the period of the articles than about the topics
# for now let's just not use it

"""
#applying stopword list to all texts#
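# If the stoplist were applied, it would mirror Example #3; a minimal sketch:
texts_nosw = [[token for token in text if token not in sw]
              for text in texts_tokenized]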
Example #5
# use regex to identify the START and END markers of the Gutenberg boilerplate
pat1 = r'\*{3} START(.*?)\*{3}'  # e.g. '*** START OF THIS PROJECT GUTENBERG EBOOK ... ***'
pat2 = r'\*{3} END(.*?)\*{3}'    # e.g. '*** END OF THIS PROJECT GUTENBERG EBOOK ... ***'
start_idx = [(m.start(0), m.end(0)) for m in re.finditer(pat1, text)]
end_idx = [(m.start(0), m.end(0)) for m in re.finditer(pat2, text)]

# print start string of Gutenberg text
print(text[start_idx[0][0]:start_idx[0][1]])
idx1 = start_idx[0][1] + 1  # beginning of content
idx2 = end_idx[0][0]  # end of content

# extract text content and assign to variable
content = text[idx1:idx2]
print(content[:100])

tokens = tm.tokenize(content, lentoken=1)
print(tokens[:100])


def slice_tokens(tokens, n=100, cut_off=True):
    """Slice a token list into consecutive chunks of n tokens."""
    slices = []
    for i in range(0, len(tokens), n):
        slices.append(tokens[i:(i + n)])
    # optionally drop the last slice when it is shorter than n
    if cut_off and slices and len(slices[-1]) < n:
        del slices[-1]
    return slices
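# Example usage: split the Gutenberg content into fixed-size slices (the slice
# size of 1000 tokens is an arbitrary choice for illustration):
slices = slice_tokens(tokens, n=1000)
print(len(slices))     # number of slices
print(len(slices[0]))  # tokens per slice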

Example #6
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from __future__ import division
import os
import matplotlib.pyplot as plt
# set working directory
os.chdir(os.path.expanduser('~/Documents/tmgu17/scripts'))
import textminer as tm
## get data
text_ls, text_names = tm.read_dir_txt('data/')
text = text_ls[3]
tokens = tm.tokenize(text, casefold=True)
## tag cloud from tokenized text
from wordcloud import WordCloud
# help(WordCloud)# for more information


def tag_cloud(tokens, stop_set=None):
    wc = WordCloud(stopwords=stop_set).generate(' '.join(tokens))
    plt.figure(figsize=(12, 12), dpi=200)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    #plt.savefig('wordcloud.png',bbox_inches='tight')
    plt.show()
    plt.close()


# run
tag_cloud(tokens)
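# To filter common English words, the wordcloud package also ships a built-in
# STOPWORDS set that can be passed as stop_set:
from wordcloud import STOPWORDS
tag_cloud(tokens, stop_set=STOPWORDS)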