import codecs
import os

import tweetTokenizer as t


def doTokenization():
    # Tokenize every book under ./Books/ and write one token list per line.
    o = t.Tokenizer(
        ignoreList=['email', 'url', 'ellipses', 'punct', 'unicodeEmoji'])
    files = ["./Books/" + x for x in os.listdir("./Books/")]
    fileOut = open('books.tokens', "w+")
    tokensList = []  # accumulate across all books, not just the last file
    for f in files:
        fileIn = codecs.open(f, encoding='utf-8')
        for line in fileIn:
            o.tokenize(line.strip())
            tokens = o.getTokens()
            o.clearTokens()
            if len(tokens) != 0:
                tokensList.append(tokens)
                fileOut.write(str(tokens))
                fileOut.write("\n")
        fileIn.close()
    fileOut.close()
    return tokensList
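# Usage sketch (an assumption, not part of the original script): each line of
# books.tokens is the str() of a Python list, so ast.literal_eval can parse
# the file back into token lists for later stages. loadTokens is a
# hypothetical helper name.
import ast

def loadTokens(path='books.tokens'):
    # Parse each written line back into its list of tokens.
    tokensList = []
    with open(path) as fileIn:
        for line in fileIn:
            line = line.strip()
            if line:
                tokensList.append(ast.literal_eval(line))
    return tokensList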
import codecs

import tweetTokenizer as t


def doTokenization():
    # Tokenize the Twitter dump, wrapping each tweet in sentence markers.
    o = t.Tokenizer(
        ignoreList=['email', 'url', 'ellipses', 'punct', 'unicodeEmoji'])
    fileIn = codecs.open('twitter.dump', encoding='utf-8')
    fileOut = open('twitter.tokens', "w+")
    tokensList = []
    for line in fileIn:
        o.tokenize(line.strip())
        temp = o.getTokens()
        o.clearTokens()
        if len(temp) != 0:
            tokens = ["<s>"]       # sentence-start marker
            tokens.extend(temp)
            tokens.append("</s>")  # sentence-end marker
            tokensList.append(tokens)
    for line in tokensList:
        fileOut.write(str(line))
        fileOut.write("\n")
    fileIn.close()
    fileOut.close()
    return tokensList
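# Why the <s>/</s> markers (a hedged aside, not from the original code):
# wrapping each tweet lets an n-gram model count sentence-boundary transitions
# such as ('<s>', first_word). A minimal bigram count over the returned lists:
from collections import Counter

def countBigrams(tokensList):
    # tokensList: list of token lists, each wrapped in <s> ... </s>.
    bigrams = Counter()
    for tokens in tokensList:
        for a, b in zip(tokens, tokens[1:]):
            bigrams[(a, b)] += 1
    return bigrams

# e.g. countBigrams([['<s>', 'hello', 'world', '</s>']])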
# -*- coding: utf-8 -*-
import tweetTokenizer as t

o = t.Tokenizer()

# Deliberately messy test inputs for the tokenizer.
s = "I don’t Whats' should'ntI do, Boom-boom be happy with lies or be sad with the truth 😔😩🤔"
# s = "What should I do, be happy with lies or be s😔d with the truth 😔😩🤔"
# s = "This is@a_tweet@for12Testing#the#tokens#testing:'(with :)with:D#happy#face:PLOL"
# s = "This is@a_tweet@for12Testing#the#tokens#testing:'(with :)with:D#happy#face:PLOL http://go.co.in/asdaaSD23/43sedf_sad [email protected]"
# s = "Ignorance [email protected] is to cope man, ignorance is bliss, ignorance is love and I need that shit R.I.P"
# s = "I don’t know why we’re not leading by a lot."
# s = "F**k:(!!!!!"

print(s)  # under Python 3 the literal is already unicode, so no .decode() is needed
o.tokenize(s)
print(o.getTokens())
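# Illustrative only: tweetTokenizer's internals are not shown in this section.
# Below is a minimal regex sketch of the kinds of patterns the test strings
# above exercise (URLs, emails, @mentions, #hashtags, emoticons); the real
# Tokenizer class differs, and TOKEN_RE / simpleTweetTokenize are hypothetical.
import re

TOKEN_RE = re.compile(r"""
    https?://\S+                # URLs
  | [\w.+-]+@[\w-]+\.[\w.]+     # email addresses
  | @\w+                        # mentions
  | \#\w+                       # hashtags
  | :'?\(|:\)|:D|:P             # a few emoticons
  | \w+(?:['’]\w+)?             # words, with an optional apostrophe clitic
""", re.VERBOSE | re.UNICODE)

def simpleTweetTokenize(s):
    return TOKEN_RE.findall(s)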
""" python PlotZipf.py n director1 director2 director3 ... n -> n in n-grams director1 -> directory containing utf-8 books """ import sys, os, codecs, math import pickle as p from collections import OrderedDict from matplotlib import pyplot as ppl # For importing the Tokenizer. sys.path.insert(0, '../2') import tweetTokenizer as t o = t.Tokenizer( ignoreList=['email', 'url', 'ellipses', 'punct', 'unicodeEmoji']) def doTokenization(inpDir): if inpDir[-1] != '/': inpDir += '/' files = [inpDir + x for x in os.listdir(inpDir)] for f in files: fileIn = codecs.open(f, encoding='utf-8') tokensList = [] lines = "" for line in fileIn: o.tokenize(line.strip()) tokens = o.getTokens() o.clearTokens() if len(tokens) != 0: