Example #1
import os, codecs
import tweetTokenizer as t  # assumed imports for this excerpt (see Example #3)

def doTokenization():
    o = t.Tokenizer(
        ignoreList=['email', 'url', 'ellipses', 'punct', 'unicodeEmoji'])
    files = ["./Books/" + x for x in os.listdir("./Books/")]
    fileOut = open('books.tokens', "w+")
    for f in files:
        fileIn = codecs.open(f, encoding='utf-8')
        tokensList = []
        for line in fileIn:
            o.tokenize(line.strip())
            tokens = o.getTokens()
            o.clearTokens()  # reset tokenizer state before the next line
            if len(tokens) != 0:
                tokensList.append(tokens)
        fileIn.close()
        for line in tokensList:
            fileOut.write(str(line))
            fileOut.write("\n")
    fileOut.close()
    return tokensList  # note: holds only the last file's token lists
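A minimal sketch of how this function might be driven, assuming the tweetTokenizer module from Example #3 is importable and ./Books/ contains UTF-8 text files:

if __name__ == '__main__':
    lastTokens = doTokenization()   # writes books.tokens as a side effect
    print(len(lastTokens))          # token lists from the last book processed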
Example #2
import codecs
import tweetTokenizer as t  # assumed imports for this excerpt (see Example #3)

def doTokenization():
    o = t.Tokenizer(
        ignoreList=['email', 'url', 'ellipses', 'punct', 'unicodeEmoji'])
    fileIn = codecs.open('twitter.dump', encoding='utf-8')
    fileOut = open('twitter.tokens', "w+")
    tokensList = []
    for line in fileIn:
        o.tokenize(line.strip())
        temp = o.getTokens()
        o.clearTokens()
        if len(temp) != 0:
            # wrap each tweet's tokens in sentence-boundary markers
            tokens = ["<s>"]
            tokens.extend(temp)
            tokens.append("</s>")
            tokensList.append(tokens)
    fileIn.close()
    for line in tokensList:
        fileOut.write(str(line))
        fileOut.write("\n")
    fileOut.close()
    return tokensList
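The <s> and </s> markers give downstream n-gram counts an explicit tweet boundary. A sketch (illustrative, not from the project) of counting bigrams over the returned lists:

from collections import Counter

def countBigrams(tokensList):
    counts = Counter()
    for tokens in tokensList:
        # pair each token with its successor; <s>/</s> anchor the tweet edges
        for a, b in zip(tokens, tokens[1:]):
            counts[(a, b)] += 1
    return counts

bigrams = countBigrams(doTokenization())
print(bigrams.most_common(10))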
Example #3
# -*- coding: utf-8 -*-

import tweetTokenizer as t

o = t.Tokenizer()
s = "I don’t Whats' should'ntI do, Boom-boom be happy with lies or be sad with the truth 😔😩🤔"
# s = "What should I do, be happy with lies or be s😔d with the truth 😔😩🤔"
# s = "This is@a_tweet@for12Testing#the#tokens#testing:'(with :)with:D#happy#face:PLOL"
# s = "This is@a_tweet@for12Testing#the#tokens#testing:'(with :)with:D#happy#face:PLOL http://go.co.in/asdaaSD23/43sedf_sad [email protected]"
# s = "Ignorance [email protected] is to cope man, ignorance is bliss, ignorance is love and I need that shit R.I.P"
# s = "I don’t know why we’re not leading by a lot."
# s = "F**k:(!!!!!"
print s  # Python 2 print statement; s is still a UTF-8 byte string here
s = s.decode('utf8')
print s  # after decoding, s is a unicode object
o.tokenize(s)
print o.getTokens()
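This test script targets Python 2 (print statement, str.decode). A minimal Python 3 equivalent, assuming the same tweetTokenizer interface:

import tweetTokenizer as t

o = t.Tokenizer()
s = "I don’t Whats' should'ntI do, Boom-boom be happy with lies or be sad with the truth 😔😩🤔"
print(s)  # Python 3 strings are already Unicode; no decode step is needed
o.tokenize(s)
print(o.getTokens())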
"""
python PlotZipf.py n director1 director2 director3 ...
n 			-> 	n in n-grams
director1	->	directory containing utf-8 books
"""

import sys, os, codecs, math
import pickle as p
from collections import OrderedDict
from matplotlib import pyplot as ppl

# For importing the Tokenizer.
sys.path.insert(0, '../2')

import tweetTokenizer as t
o = t.Tokenizer(
    ignoreList=['email', 'url', 'ellipses', 'punct', 'unicodeEmoji'])


def doTokenization(inpDir):
    if inpDir[-1] != '/':
        inpDir += '/'
    files = [inpDir + x for x in os.listdir(inpDir)]
    for f in files:
        fileIn = codecs.open(f, encoding='utf-8')
        tokensList = []
        lines = ""
        for line in fileIn:
            o.tokenize(line.strip())
            tokens = o.getTokens()
            o.clearTokens()
            if len(tokens) != 0: