Example #1
def displayPageView(request):
    mycursor.execute('TRUNCATE table logs_c')
    filePath = request.GET['input-file']
    filePath = "C:/Users/Rhishabh/Documents/mithi hackathon/" + filePath
    log = readfile(filePath)

    line = log.readline()
    tk = SpaceTokenizer()
    tokens = tk.tokenize(line)
    while line:
        tokens = tk.tokenize(line)
        process(tokens)
        line = log.readline()

    mydb.commit()
    
    result1 = query_1()
    result2 = query2()
    result3 = query3()
    result4 = query4()
    result5 = query5()
    result7 = query7()

    # mydb.close()
    temp = [['test', 'test'], ['test', 'test']]
    test = 'sdsds'
    return render(request, 'display.htm', {'ipfile': filePath, 'result1': result1, 'result2': result2, 'result3': result3, 'result4': result4, 'result5': result5, 'result7': result7})
Example #2
def tokenization(corpus, stop_words=nltk.corpus.stopwords.words('portuguese')):
    '''Input : corpus is a Series of documents (sentences)
       Output : a list of lists of words

    stop_words : list of words that should be removed
    '''

    # Tokenization
    spacetok = SpaceTokenizer()
    corpus = [spacetok.tokenize(phrases) for phrases in corpus]

    # stop-word removal
    if stop_words is not None:
        tmp_corpus = list()
        tmp_words = list()

        for phrases in corpus:
            for word in phrases:
                if word not in stop_words:
                    tmp_words.append(word)
            tmp_corpus.append(tmp_words)
            tmp_words = list()

        corpus = tmp_corpus

    return corpus
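# Usage sketch for tokenization() above on a tiny made-up corpus (assumes the
# imports used above and that nltk.download('stopwords') has been run):
sample_corpus = ["o gato subiu no telhado", "as meninas jogam futebol"]
print(tokenization(sample_corpus))  # e.g. [['gato', 'subiu', 'telhado'], ...]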
class NLTKSpaceTokenizeBody(BaseEnrichment):
    def __init__(self):
        self.tokenizer = SpaceTokenizer()
    def enrichment_value(self,tweet):
        return self.tokenizer.tokenize(tweet['body'])
    def __repr__(self):
        return "Use the NLTK SpaceTokenizer to parse the Tweet body."
class NLTKSpaceTokenizeBody(BaseEnrichment):
    """Use the NLTK SpaceTokenizer to parse the Tweet body."""
    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])
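# BaseEnrichment is not defined in this listing. A minimal, hypothetical stand-in
# (just enough to exercise the enrichment idea on its own) could look like this;
# the real base class in the source project may differ.
from nltk.tokenize import SpaceTokenizer


class BaseEnrichmentSketch(object):
    """Hypothetical base: subclasses only supply enrichment_value(tweet)."""
    def enrich(self, tweet):
        return self.enrichment_value(tweet)


class SpaceTokenizeBodySketch(BaseEnrichmentSketch):
    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])


print(SpaceTokenizeBodySketch().enrich({'body': 'just setting up my twttr'}))
# ['just', 'setting', 'up', 'my', 'twttr']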
Example #5
def extract_name(tweet):
    token = SpaceTokenizer()
    toks = token.tokenize(tweet)
    pos = pos_tag(toks)
    chunked_nes = ne_chunk(pos)
    nes = [
        ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
        if isinstance(ne, nltk.tree.Tree)
    ]
    return nes
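# Usage sketch for extract_name() above (assumes nltk, pos_tag, ne_chunk and
# SpaceTokenizer are imported as in the snippet, and that the tagger and
# NE-chunker NLTK data packages are installed):
print(extract_name("Nelson Mandela met Barack Obama in Johannesburg"))
# e.g. ['Nelson Mandela', 'Barack Obama', 'Johannesburg']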
Example #6
def extract_entities(text):
    entities = []
    for sentence in sent_tokenize(text):
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(sentence)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        #model = {'_': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
        model = {'extn': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        #print pos
        chunks = ne_chunk(pos)
        #chunks = ne_chunk(pos_tag(word_tokenize(sentence)))
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities



#with open("D:/R/BOA/PySrc/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.csv", "r") as csvfile:
        datareader = csv.reader(csvfile,quotechar='"' ,lineterminator='\n',quoting=csv.QUOTE_ALL)
        csv_out = open('D:/R/BOA/Noun/FNoun.csv', 'wb')
	mywriter = csv.writer(csv_out)
	count=0
	for row in datareader:
				count = count + 1
				print "COUNT is :%d" % count
                                print row(''.join(row))
				#mywriter.writerow(extract_entities(''.join(row)))


	#csv_out.close()	
	
	file = open('D:/R/BOA/txtfiles/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.txt', 'r')
	print file.read()
	filew = open('D:/R/BOA/Noun/FNoun.txt', "w")
	for line in file:
                                print line
                                filew.write(extract_entities(line))
                                #filew.write("yeah its me")



        filew.close()
Example #7
# Chapter 3 Preprocessing - Tokenization: how to use NLTK's built-in tokenizers
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# LineTokenizer (splits on line breaks)
lTokenizer = LineTokenizer()
print(
    "Line tokenizer output :",
    lTokenizer.tokenize(
        "My name is " +
        "Maximus Decimus Meridius, commander of the Armies of the North, " +
        "General of the Felix Legions and loyal servant to the true emperor, " +
        "Marcus Aurelius. \nFather to a murdered son, husband to a murdered " +
        "wife. \nAnd I will have my vengeance, in this life or the next."))

# SpaceTokenizer (splits on the space character)
rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

# word_tokenize (splits into words and punctuation)
print("Word Tokenizer output :", word_tokenize(rawText))

# TweetTokenizer (useful when handling special characters)
tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",
      tTokenizer.tokenize("This is a coooool " + "#dummysmiley: :-) :-P <3"))
Example #8
# In[14]:

from nltk.tokenize import SpaceTokenizer
tm = SpaceTokenizer()
to_rank = []
key_words = []

for i in range(len(ranked_q)):
    yn = 0

    #ranked_q[i][yn]
    question[i] = untokenize(question[i])

    yy = "_____"
    to_rank.append(tm.tokenize(ranked_q[i][0]))
    print("Q:", question[i].replace(to_rank[i][len(to_rank[i]) // 2], yy))
    print('Ans - ', to_rank[i][len(to_rank[i]) // 2])
    #quita = question[i].index(to_rank[i][len(to_rank[i])//2])

    #key_words.append(question[i][quita])

#print(to_rank[0][len(to_rank[0])//2])

#question[0].remove(question[0][quita])

#question[0][quita] = to_rank[0][len(to_rank[0])//2]
#print(question[0][quita])

# In[ ]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from functools import partial
#from gensim import corpora
#from gensim.models import TfidfModel
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()
 
# define steps
pipeline = [lambda s: re.sub('[\n]', '', s),
            lambda s: re.sub('[^\w\s]', '', s),
            lambda s: re.sub('[\d\n]', '', s),
            lambda s: s.lower(),
            lambda s: ' '.join(filter(lambda s: not (s in stopwords.words('english')), tokenizer.tokenize(s))),
            lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))
           ]
 
# function that carries out the pipeline step-by-step
def preprocess_text(text, pipeline):
    if len(pipeline)==0:
        return text
    else:
        return preprocess_text(pipeline[0](text), pipeline[1:])
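# Usage sketch for preprocess_text() with the pipeline defined above
# (stop-word removal needs nltk.download('stopwords') the first time):
cleaned = preprocess_text("The 3 quick brown foxes jumped over 2 lazy dogs!", pipeline)
print(cleaned)  # lower-cased, digit- and stop-word-free, Porter-stemmed text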
 
#This section reads in documents from the selected corpus as real text

from nltk.corpus import reuters 

#This reads in all documents and finds all unique words
Example #10
from nltk.tokenize import SpaceTokenizer

os.chdir("D:/R/BOA/txtfiles")
for fileName in glob.glob("*.txt"):
    count=0
    file = open('D:/R/BOA/txtfiles/'+fileName, 'r')
    filew = open('D:/R/BOA/Noun/'+fileName, "wb")
    for line in file:
                                    count=count+1
                                    print count
                                    print line
                                    line = re.sub('\\f', '', line)
                                    #line = line.decode("utf-8")
                                    line = unicode(line, errors='ignore')
                                    tokenizer = SpaceTokenizer()
                                    toks = tokenizer.tokenize(line)
                                    default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
                                    model = {'Consumer': 'RB'}
                                    tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
                                    #pos = pos_tag(toks)
                                    pos=tagger.tag(toks)
                                    print pos
                                    chunked_nes = ne_chunk(pos) 
                                    nes = [' '.join(map(lambda x: x[0], ne.leaves()))
                                           for ne in chunked_nes
                                                 if isinstance(ne, nltk.tree.Tree)]
                                            #data.append(nes)
                                    print nes 
                                    filew.write((','.join(nes))+'\n')
                                    #filew.write("yeah its me")
    
Example #11
        status = tokens[6]
        size = tokens[7][:-1]
#     print(ip + ' ' + date_time + ' ' + method + ' ' + url + ' ' + protocol + ' ' + status + ' ' + size)

    val = (ip, date_time, method, url, protocol, status, size)
    mycursor.execute(sql, val)


# Type 0 -> Tab-separated (Server 1)
# Type 1 -> Space-separated (Server 2)

log = readfile("access_log")

line = log.readline()
tk = SpaceTokenizer()
tokens = tk.tokenize(line)

while line:
    tokens = tk.tokenize(line)
    process(tokens)
    line = log.readline()

mydb.commit()
print("records inserted.")

# Top client ip addresses by number of requests
sql = "SELECT IP, count(*) FROM logs_c GROUP BY IP ORDER BY count(*) DESC LIMIT 5"
mycursor.execute(sql)
results = mycursor.fetchall()

for x in results:
    print(x)  # minimal stand-in body; the original excerpt is truncated here

#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:37
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : tokenizer.py
# @Software   : PyCharm
# @Description: tokenization

# import the required libraries
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

text = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
       "loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. " \
       "\nAnd I will have my vengeance, in this life or the next. "
ITokenizer = LineTokenizer()
print("按照换行分词 ", ITokenizer.tokenize(text))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("按照空格符分词 :", sTokenizer.tokenize(rawText))  # 表达符号和单词连在一起
print("按照单词分词 :", word_tokenize(rawText))  # 表达符号和单词分开

tweet = "This is a cooool #dummysmiley: :-) :-P <3"
tTokenizer = TweetTokenizer()
print("处理特殊字符 ", tTokenizer.tokenize(tweet))
Example #13
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
                          " Software industry working \nfrom applications to products by using \n" \
                          " C, C++, Java, Javascript and databases "\
                          " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

lTokenizer = LineTokenizer()
print("Line tokenizer output: ", lTokenizer.tokenize(line))

sTokenizer = SpaceTokenizer()
print("Space Tokenizer output: ", sTokenizer.tokenize(line))

print("Word Tokenizer output: ", word_tokenize(line))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output: ",
      tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P <3"))
Example #14
import nltk.tag, nltk.data
from nltk.tokenize import SpaceTokenizer

with open("D:/R/email_Analysis/FINAL/pyhton_mssg.csv", "r") as csvfile:
    datareader = csv.reader(csvfile,
                            quotechar='"',
                            lineterminator='\n',
                            quoting=csv.QUOTE_ALL)
    csv_out = open('D:/R/email_Analysis/FINAL/Noun.csv.csv', 'wb')
    mywriter = csv.writer(csv_out)
    count = 0
    for row in datareader:
        count = count + 1
        print "COUNT is :%d" % count
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize((''.join(row)))
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {
            'Almost': 'RB',
            'shikha': 'NNP',
            'Lots': '',
            'bbnt': 'NNP',
            'Swati': 'NNP',
            'Sarkar': 'NNP',
            'Deepak': 'NNP',
            'Capgemini': 'NNP',
            'Swati': 'NNP',
            'Deepak Shete': 'NNP',
            'Melini': 'NNP',
            'Lots': 'RB',
            'Prashant Deshpande': 'NNP',
Example #15
from nltk.tokenize import SpaceTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from functools import partial
from gensim import corpora
from gensim.models import TfidfModel
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define each steps
pipeline = [
    lambda s: re.sub('[^\w\s]', '', s), lambda s: re.sub('[\d]', '', s),
    lambda s: s.lower(), lambda s: ' '.join(
        filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s))),
    lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))
]

# function that carries out the pipeline step-by-step


def preprocess_text(text, pipeline):
    if len(pipeline) == 0:
        return text
    else:
        return preprocess_text(pipeline[0](text), pipeline[1:])


# preprocessing
preprocessed_texts = map(partial(preprocess_text, pipeline=pipeline), texts)
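# Note: under Python 3, map() returns a lazy iterator; materialize it if the
# corpus will be iterated more than once, e.g.
# preprocessed_texts = list(map(partial(preprocess_text, pipeline=pipeline), texts))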
Example #16
def text_pre_processing(text,
                        remove_number=True,
                        stop_word=True,
                        stop_word_language='english',
                        remove_punctuation=True):
    # ---------------------------------------------
    # Patterns
    results_chunk = ''
    results_named_entity = ''

    patterns1 = r'@[A-Za-z0-9_]+'
    patterns2 = r'https?://[^ ]+'
    combined_patterns = r'|'.join((patterns1, patterns2))
    www_patterns = r'www.[^ ]+'
    negations_dic = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) +
                                   r')\b')

    # ---------------------------------------------
    # convert the input to a string (lower-casing happens below)
    results = str(text)

    # ---------------------------------------------
    # Text Cleaning
    results = re.sub(combined_patterns, '', results)
    results = re.sub(www_patterns, '', results)
    results = results.lower()
    results = negations_pattern.sub(lambda x: negations_dic[x.group()],
                                    results)
    results = re.sub("[^a-zA-Z]", " ", results)

    results = results.replace("(<br/>)", "")
    results = results.replace('(<a).*(>).*(</a>)', '')
    results = results.replace('(&amp)', '')
    results = results.replace('(&gt)', '')
    results = results.replace('(&lt)', '')
    results = results.replace('(\xa0)', ' ')

    # ---------------------------------------------
    if remove_number and (results != ''):
        results = re.sub(r'\d+', '', results)

    # ---------------------------------------------
    if remove_punctuation and (results != ''):
        translator = str.maketrans('', '', string.punctuation)
        results = results.translate(translator)

    # ---------------------------------------------
    # Remove whitespaces
    results = results.strip()

    # ---------------------------------------------
    # Line Tokenize
    if results != '':
        line_tokenizer = LineTokenizer()
        results = line_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]

    # ---------------------------------------------
    # Tab Tokenize
    if results != '':
        tab_tokenizer = TabTokenizer()
        results = tab_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]

    # ---------------------------------------------
    # Space Tokenizer
    if results != '':
        space_tokenizer = SpaceTokenizer()
        results = space_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = ' '.join(results)

    # -----------------------------------------------
    # Lemmatization using NLTK
    if results != '':
        lemmatizer_of_text = WordNetLemmatizer()
        word_list = word_tokenize(results)
        results = ' '.join([
            lemmatizer_of_text.lemmatize(w, get_word_net_pos_tag(w))
            for w in word_list
        ])

    # ---------------------------------------------
    # Stemming using NLTK
    if results != '':
        stemmer = PorterStemmer()
        if type(results) == list:
            results = ' '.join(str(w) for w in results)
        results = word_tokenize(str(results))
        results = [stemmer.stem(word) for word in results]
        results = ' '.join(str(w) for w in results)

    # ---------------------------------------------
    # Remove Stop Words
    if stop_word and (results != ''):
        nltk.download('stopwords')
        stop_words = set(stopwords.words(stop_word_language))
        word_tokens = word_tokenize(results)
        results = ' '.join(str(w) for w in word_tokens if not w in stop_words)

    # ---------------------------------------------
    # Chunking of the input; will be used for coloring of the text
    if results != '':
        result_str = TextBlob(results)
        reg_exp = 'NP: {<DT>?<JJ>*<NN>}'
        rp = nltk.RegexpParser(reg_exp)
        results_chunk = rp.parse(result_str.tags)
    # results_chunk.draw()

    # ---------------------------------------------
    # Named Entity Recognition
    if results != '':
        results_named_entity = ne_chunk(pos_tag(word_tokenize(results)))

    return results, results_chunk, results_named_entity
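# get_word_net_pos_tag() is referenced above but not defined in this listing; a
# common sketch of such a helper (an assumption, not necessarily the author's
# version), plus a usage sketch for text_pre_processing():
from nltk.corpus import wordnet


def get_word_net_pos_tag(word):
    # map the first letter of the Treebank tag to a WordNet POS constant
    tag = pos_tag([word])[0][1][0].upper()
    return {'J': wordnet.ADJ, 'N': wordnet.NOUN,
            'V': wordnet.VERB, 'R': wordnet.ADV}.get(tag, wordnet.NOUN)


# clean_text, chunks, named_entities = text_pre_processing("Barack Obama visited 3 cities!!!")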
Example #17
from nltk import pos_tag, ne_chunk
import nltk.tag, nltk.data
from nltk.tokenize import SpaceTokenizer


with open("D:/R/email_Analysis/FINAL/pyhton_mssg.csv", "r") as csvfile:
        datareader = csv.reader(csvfile,quotechar='"' ,lineterminator='\n',quoting=csv.QUOTE_ALL)
        csv_out = open('D:/R/email_Analysis/FINAL/Noun.csv.csv', 'wb')
	mywriter = csv.writer(csv_out)
	count=0
	for row in datareader:
				count = count + 1
				print "COUNT is :%d" % count
				tokenizer = SpaceTokenizer()
                                toks = tokenizer.tokenize((''.join(row)))
				default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
				model = {'Almost': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
				tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
                                #pos = pos_tag(toks)
				pos=tagger.tag(toks)
				print pos
                                chunked_nes = ne_chunk(pos) 

                                nes = [' '.join(map(lambda x: x[0], ne.leaves()))
                                       for ne in chunked_nes
                                             if isinstance(ne, nltk.tree.Tree)]
                                        #data.append(nes)
                                print nes
                                mywriter.writerow(nes)
                                
Example #18
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Line tokenizer
longSentence = 'My name is Maximus Decimus Meridius, Commander of the Armies '\
'of the North, General of the Felix Legions, loyal servant to '\
'the true emperor, Marcus Aurelius. Father to a murdered son, '\
'husband to a murdered wife. And I will have my vengeance, in '\
'this life or the next.'

lTokenizer = LineTokenizer()
sentenceTokens = lTokenizer.tokenize(longSentence)
print (sentenceTokens)

# Space tokenizer
sTokenizer = SpaceTokenizer()
spaceTokens = sTokenizer.tokenize(longSentence)
print (spaceTokens)

# Tweet tokenizer
tweet = 'This is a coool #dummysmiley: :-) :) :-P <3'
tTokenizer = TweetTokenizer()
tTokens = tTokenizer.tokenize(tweet)
print ('Tweet tokenizer output:')
print (tTokens)

# Word tokenizer
wTokenizer = word_tokenize(longSentence)
print (wTokenizer)

################
### Stemming ###
Example #19
indx ='\n'.join(res)
print ("\nThe sentences contaning '"+ inp +"'"+" are : \n" + indx)
#conversations containing input

con = re.findall(r'"(?:(?:(?!(?<!\\)").)*)"', str(res))
indx2 ='\n'.join(con)
print ("\nThe conversations contaning '"+ inp +"'"+" are : \n" + indx2)
#count of conversations
count = len(list(filter(lambda x: inp in x, con))) 
print ("\nThe count of conversations contaning '"+ inp +"'"+" are :\n"+str(count))
#All conversations in the excerpt
allconv = re.findall(r'"(.*?)"', str(token_text))
indx3 ='\n'.join(allconv)
print ("\nThe conversations in the excerpt are : \n" + indx3)

from nltk.tag import pos_tag
tagged_sent = pos_tag(text_string.split())
#propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
#print( propernouns)

from nltk.tree import Tree
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(text_string)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos) 
nes = [' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes if isinstance(ne,Tree)]
indx4 ='\n'.join(nes)
print("\n Proper nouns used in the excerpt are:\n", indx4)
Example #20
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer
sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos) 
print chunked_nes
nes = [' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]

print nes
'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
    
   ''' 
    
'''
d = cmudict.dict()
Example #21
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer()
print(
    "Line tokenizer output :",
    lTokenizer.tokenize(
        "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. \nAnd I will have my vengeance, in this life or the next."
    ))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

print("Word Tokenizer output :", word_tokenize(rawText))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",
      tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))
Example #22
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes
nes = [
    ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
    if isinstance(ne, nltk.tree.Tree)
]

print nes
'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
    
Example #23
from nltk.tokenize import SpaceTokenizer

os.chdir("D:/R/BOA/txtfiles")
for fileName in glob.glob("*.txt"):
    count = 0
    file = open('D:/R/BOA/txtfiles/' + fileName, 'r')
    filew = open('D:/R/BOA/Noun/' + fileName, "wb")
    for line in file:
        count = count + 1
        print count
        print line
        line = re.sub('\\f', '', line)
        #line = line.decode("utf-8")
        line = unicode(line, errors='ignore')
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(line)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {'Consumer': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [
            ' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes
            if isinstance(ne, nltk.tree.Tree)
        ]
        #data.append(nes)
        print nes
        filew.write((','.join(nes)) + '\n')
        #filew.write("yeah its me")
tokenizer = SpaceTokenizer()
stop_words = set(stopwords.words("english"))
#adding more stop_words based on initial analysis
stop_words.update(['new', 'use', 'would', '-', 'using'])
#print stop_words

while (itr < 100):
    try:

        if (messages[itr][0:1] == "`"):
            itr += 1
            #print "code found at: " + itr
        else:
            lowercased = messages[itr].lower()
            lemmatized_lowercased = lemmatizer.lemmatize(lowercased)
            tokenized = tokenizer.tokenize(lowercased)
            filtered_sentence = [
                words for words in tokenized if not words in stop_words
            ]
            tokenized_stopless_messages.append(filtered_sentence)
            #print "filtered_sentence added"

            #tokenized_messages.append(tokenizer.tokenize(messages[itr]))
            itr += 1
            #print itr
    except TypeError:
        #print "Skipped"
        itr += 1
        #print itr

        #print itr
Example #25
wiki_files = [
    "soccer_teams_wiki/resources/wikipedia_corinthians.txt",
    "soccer_teams_wiki/resources/wikipedia_palmeiras.txt",
    "soccer_teams_wiki/resources/wikipedia_portuguesa.txt",
    "soccer_teams_wiki/resources/wikipedia_santos.txt",
    "soccer_teams_wiki/resources/wikipedia_sao_paulo.txt"
]

for file in wiki_files:
    with open(file, "r") as wiki_file:
        wiki_text = wiki_file.readlines()

    # TODO text cleanup. Remove stop words and other text treatment for articles
    for line in wiki_text:
        phrase = [
            word.lower() for word in tokenizer.tokenize(line)
            if word not in stop_words
        ]
        wiki_tokenized.append(phrase)

our_model = Word2Vec(wiki_tokenized,
                     size=10,
                     window=15,
                     min_count=1,
                     workers=4)

while True:
    query_word = input('Type Word: ')
    query_word = query_word.strip().lower()
    if query_word in our_model:
        print(our_model.most_similar(query_word))
ml = len(messages)
print ml
#34467
itr = 0

tokenized_messages = []
tokenizer = SpaceTokenizer()

while (itr < 10):
    try:

        if (messages[itr][0:1] == "`"):
            itr += 1
            print "code found"
        else:
            tokenized_messages.append(tokenizer.tokenize(messages[itr]))
            itr += 1
            print itr
    except TypeError:
        print "Skipped"
        itr += 1
        print itr

        #print itr
        #error after 1741

print tokenized_messages[0]

eg_string = "This is a sample sentence, showing off the stop words filtration."
print eg_string[0:2]
# import all necessary libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import stopwords
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define each steps
pipeline1 = [lambda s: re.sub('[^\w\s]', '', s),     # remove special characters
             lambda s: re.sub('[\d]', '', s),        # remove numbers
             lambda s: s.lower(),                    # lower case
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s))),   # remove stop words
             lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))   # stem (using Porter stemmer)
             ]
pipeline2 = [lambda s: re.sub('[^\w\s]', '', s),
             lambda s: re.sub('[\d]', '', s),
             lambda s: s.lower(),
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))
             ]
stopword_removal_pipeline = [lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))]

# pipeline handling
def preprocess_text(text, pipeline):
    return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:])
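# Usage sketch for the three pipelines above (stop-word removal needs
# nltk.download('stopwords') beforehand):
sample = "The 2 cats are sitting on the mat!"
print(preprocess_text(sample, pipeline1))                  # stemmed, stop words removed
print(preprocess_text(sample, pipeline2))                  # stop words removed, no stemming
print(preprocess_text(sample, stopword_removal_pipeline))  # only stop words removed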
Example #28
REMOVE_PONCT = 3

LIST_OF_TASKS = [REMOVE_LINKS, REMOVE_ARROBA, REMOVE_PONCT]

#textMSG='testo com vários @davi @vito @maria https://www.google.com/ e links'
textMSG = 'será que vai cortar @davi . ; esse texto ? https://www.google.com vamos ver né?'
print(remove_ponctuation(textMSG))

print("Texto=", textMSG)

tk = SpaceTokenizer()
#s1 = tk.tokenize(textMSG)
#print(s1)

for task in LIST_OF_TASKS:
    s1 = tk.tokenize(textMSG)

    tam = len(s1)
    print("Num=", tam)

    # set the desired number of threads here
    numthreads = 10
    pedaco = int(tam / numthreads)
    threads = []

    for i in range(0, numthreads):
        inicio = i * pedaco
        if i == numthreads - 1:
            fim = tam
        else:
            fim = (i + 1) * pedaco