def number_of_different_words(self):
        # TODO: Stemming, then move to language specific classes
        tokenizer = WordPunctTokenizer()
        words = tokenizer.tokenize(self.text.strip())
        only_textual_words = filter(unicode.isalpha, words)

        return len(set(only_textual_words))
    def tokens(self):
        """Tokenize the text.
        """
        tokenizer = WordPunctTokenizer()

        # Get token character spans.
        spans = list(tokenizer.span_tokenize(self.text))

        # Materialize the token stream.
        tokens = [self.text[c1:c2] for c1, c2 in spans]

        tags = pos_tag(tokens)

        return [

            Token(
                token=token.lower(),
                char1=c1,
                char2=c2,
                pos=pos,
            )

            for (c1, c2), token, (_, pos) in
            zip(spans, tokens, tags)

        ]
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object

    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            # check for the closing tag before the inside_xml_privileges guard,
            # otherwise the flag would never be reset
            inside_xml_privileges = False
            continue
        elif inside_xml_privileges:
            continue

        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
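A hedged usage sketch of extract_nl_text(): MalwareSample is stood in for by a throwaway namedtuple with a .strings attribute (purely hypothetical), and the WordNet corpus must be available for wordnet.synsets().

from collections import namedtuple

FakeSample = namedtuple("FakeSample", ["strings"])  # hypothetical stand-in for MalwareSample
sample = FakeSample(strings=[
    "GetProcAddress",                     # no WordNet synsets -> string is dropped
    "error opening configuration file",   # contains recognizable English words -> kept
])
print(extract_nl_text(sample))            # e.g. [['error', 'opening', 'configuration', 'file']]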
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove message-number references like ">>123"
    message_text = re.sub(r">>\d+", "", message)
    message_text = message_text.lower()
    # Normalize Cyrillic 'ё' to 'е'; re.UNICODE must be passed as flags=,
    # otherwise it is silently interpreted as the 'count' argument.
    message_text = re.sub(u"ё", u"е", message_text, flags=re.UNICODE)
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return lemmas
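message_to_wordlist() relies on module-level helpers that are not shown here (clean_str, morph, stemmer, plus the stopwords/re imports). Below is a minimal sketch of plausible definitions, assuming pymorphy2 for the 'l' mode and a Russian Snowball stemmer for the 's' mode; the real project may define them differently.

import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
import pymorphy2

morph = pymorphy2.MorphAnalyzer()      # lemmatizer used when lemmas_bool == 'l'
stemmer = SnowballStemmer("russian")   # stemmer used when lemmas_bool == 's'

def clean_str(s):
    # placeholder cleaner (assumption): keep word characters and whitespace only
    return re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)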
Exemple #5
def TextProcessor(src, tgt, low=True, num=True):

    print "processing "+src
    if low==True:
        print "lowercasing.."
    if num==True:
        print "removing numeric.."

    srcfile = codecs.open(src,"r","utf-8")
    tgtfile = codecs.open(tgt,"w","utf-8")

    word_punct_tokenizer = WordPunctTokenizer()

    linecount=0
    for line in srcfile:
        linecount+=1
        line = word_punct_tokenizer.tokenize(line)
        if low==True:
            for i in range(0,len(line)):
                line[i] = line[i].lower()
        if num==True:
            for i in range(0,len(line)):
                if line[i].isnumeric()==True:
                    line[i] = "<number>"

        tgtfile.write(listtostring(line))

    srcfile.close()
    tgtfile.close()
    print "done processing "+str(linecount)+" lines!!"
def tokenize_words(sentence):
    """
    :param sentence:
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
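For reference, WordPunctTokenizer splits on the pattern \w+|[^\w\s]+, so apostrophes and other punctuation become separate tokens:

print(tokenize_words("Don't stop!"))
# -> ['Don', "'", 't', 'stop', '!']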
Exemple #7
def class1():
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	docId = request.args.get('d')
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	featuresets = []
	tagSet = set()
	for d in collection.find():	
		bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
		if 'tags' not in d: continue
		for tag in d['tags']:
			featuresets.append((bagOfWords, tag))
			tagSet.add(tag)
	classifier = nltk.NaiveBayesClassifier.train(featuresets)

	d = collection.find_one({'_id' : ObjectId(docId)})

	#classifier.show_most_informative_features(100)
	cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
	probs = []
	for tag in tagSet:
		probs.append((tag, round(cl.prob(tag)*100) ))
	classifier.show_most_informative_features(n=20)
	probs = sorted(probs, key = lambda x : x[1],  reverse = True)
	return render_template('class1.html', probs = probs, d=d)
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        word = word.replace('&amp;','&').replace('&lt;','<').replace('&gt;','>').replace('&quot;','"').replace('&#39;',"'")
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        word = word.encode('ascii', 'ignore')

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',word) is not None:
            word = 'GENERIC_HTTP'

        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()

    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))

                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',word) is not None:
            word = 'GENERIC_HTTP'

        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

            data.append([cleaned_sentences, label])

    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
Exemple #10
def tfIdf():
	TFIDF_MIN_SCORE = 100
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	tfidf = []
	idfMap = create_idf_map()
	docs = collection.find()
	for d in docs:
		words = tokenizer.tokenize(d['content'].lower())
		# Count raw term frequencies over the full token list (not the set),
		# so repeated words are actually counted.
		tfMap = {}
		for word in words:
			if word not in tfMap:
				tfMap[word] = 1
			else:
				tfMap[word] += 1
		tfIdfValues = []
		for word in set(words):
			score = tfMap[word] * 1000 / idfMap[word]
			if score > TFIDF_MIN_SCORE:
				tfIdfValues.append((word, score))
		tfIdfValues = sorted(tfIdfValues, key = lambda x : x[1], reverse = True)
		d['tfidf'] = tfIdfValues
		tfidf.append({'d' : d,
					  'tfidf' : tfIdfValues})
		collection.save(d)


	genFreq = generaral_frequency(idfMap)
	return render_template("tfidf.html", documents = tfidf)
Exemple #11
def tokenize(text):
    """Tokenize a raw text.

    Args:
        text (str)

    Returns: list of {token, char1, char2, pos}
    """
    tokenizer = WordPunctTokenizer()

    # Get token character spans.
    spans = list(tokenizer.span_tokenize(text))

    # Materialize the token stream.
    tokens = [text[c1:c2] for c1, c2 in spans]

    # Tag parts-of-speech.
    tags = pos_tag(tokens)

    return [

        dict(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )

        for (c1, c2), token, (_, pos) in
        zip(spans, tokens, tags)

    ]
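A rough illustration of the return value of tokenize(); the POS tags depend on the tagger model installed, so treat them as indicative only.

# >>> tokenize("The cat sat.")
# [{'token': 'the', 'char1': 0, 'char2': 3, 'pos': 'DT'},
#  {'token': 'cat', 'char1': 4, 'char2': 7, 'pos': 'NN'},
#  {'token': 'sat', 'char1': 8, 'char2': 11, 'pos': 'VBD'},
#  {'token': '.', 'char1': 11, 'char2': 12, 'pos': '.'}]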
Exemple #12
	def words(self, fileid=None):
		"""
		Returns all of the words and punctuation symbols in the specified file
		that were in 'section//p' text nodes.
		"""
		elt = self.xml(fileid).iterfind('.//section//p')
		word_tokenizer = WordPunctTokenizer()
		return [
			word
			for node_text in (''.join(el.itertext()) for el in elt)
			for word in word_tokenizer.tokenize(node_text)
		]
Exemple #13
 def get_words_without_stopwords(self, text):
     stopwords = nltk.corpus.stopwords.words('english')
     stopwords.extend(string.punctuation)
     stopwords.append('')
     tokenizer = WordPunctTokenizer()
     tokens = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(text) \
               if token.lower().strip(string.punctuation) not in stopwords]
     return tokens
def extract_words(text):
    stemmer = PorterStemmer()

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)

    result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Exemple #15
def getBigram(haystack):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(haystack)
    bcf = BigramCollocationFinder.from_words(words)
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf.apply_word_filter(filter_stops)

    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
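A hedged usage sketch for getBigram(); it assumes the imports used above (BigramCollocationFinder, BigramAssocMeasures, stopwords, WordPunctTokenizer) and the NLTK stopwords corpus. "corpus.txt" is a hypothetical input file.

with open("corpus.txt") as f:
    print(getBigram(f.read()))   # the four top likelihood-ratio bigrams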
def get_tokens(sentence):
    """
    Tokenizes a single sentence
    :param sentence: a sentence string
    :return: list of tokens in the sentence
    """

    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
def change_db2(text, origin_dict, id):
    print origin_dict
    tokens_ar = []
    word_punct_tokenizer = WordPunctTokenizer()
    for token in word_punct_tokenizer.span_tokenize(origin_dict):
        tokens_ar.append(token)
    for line in text.split("\n"):
        markup_error_line = line.split(';')
        print "MARKUP", markup_error_line
        convert_coord_2dbformat(markup_error_line, tokens_ar, id)
Exemple #18
def tokenize(text): 
	tokens = tokenizer.tokenize(text)
	wordtokenizer = WordPunctTokenizer()
	wlist =[]
	for token in tokens:
		wtoken = wordtokenizer.tokenize(token)
		wlist = wlist+wtoken

	stems = stem_tokens(wlist, stemmer)
	return stems
def extract_words(text):
 stemmer = PorterStemmer()
 tokenizer = WordPunctTokenizer()
 tokens = tokenizer.tokenize(text)
 bigram_finder = BigramCollocationFinder.from_words(tokens)
 bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
 for bigram_tuple in bigrams:
  x = "%s %s" % bigram_tuple
  tokens.append(x)
 result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
 return result 
Exemple #20
def you_collocations(raw):

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(raw)

    bigrams = [(tokens[i], tokens[i +1]) for i in range(len(tokens)-1)]
    collocations = [(t1, t2) for (t1, t2) in bigrams if t1 == "you" or t1 == 'your']

    trigrams = [(tokens[i], tokens[i +1], tokens[i+2]) for i in range(len(tokens)-2)]
    trilocations = [(t1, t2, t3) for (t1, t2, t3) in trigrams if t1 == "you" or t1 == 'your']

    return collocations, trilocations
def extract_bigrams(text):
    text = remove_stopwords(text)
    tokenizer = WordPunctTokenizer()
    tokens = [token for token in set(tokenizer.tokenize(text)) if
              not is_number(token) and (is_valid_token(token) or is_name(token))]
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.dice, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [x.lower() for x in tokens if x not in stopwords.words("english") and len(x) > 3]
    return result
Exemple #22
def get_bigrams(text):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = []
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)

    return tokens
Exemple #23
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
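A sketch of the input this build_word_dictionary() expects: a JSON array of records with 'content' and 'abstract' string fields (the field names come from the code above; the file names and record contents below are hypothetical).

import json

records = [
    {"content": "Deep learning improves text classification.",
     "abstract": "We study deep learning for text."},
]
with open("papers.json", "w") as f:     # hypothetical input file
    json.dump(records, f)

build_word_dictionary("papers.json", "dictionary.json")
# words seen fewer than 5 times are dropped; 'PADDING' and 'UNKNOWN' are appended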
Exemple #24
def extract_words(text):
 	
    stemmer = PorterStemmer()
    if type(text) == str:
        text = unicode(text, "utf-8", errors="ignore")
    else:
        text = unicode(text)
     
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
 
    result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Exemple #25
def analyze(tweets):
    classifier = cache.get('classifier')
    if classifier is None:
        classifier = train_classifier()
        cache.set('classifier', classifier, None)
    tokenizer = WordPunctTokenizer()
    analyzed_tweets = []
    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.lower())
        featureset = word_feats(tokens)
        sentiment = classifier.prob_classify(featureset)
        analyzed_tweets.append(AnalyzedTweet(tweet, round(sentiment.prob('pos'),2), round(sentiment.prob('neg'),2)))
    return analyzed_tweets
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    # dictionary = list(sorted(w for w,c in dictionary.most_common(3000))) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Exemple #27
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with gzip.open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            # dictionary.update(text.split())
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 3)) + ["PADDING", "UNKNOWN"]

    with open(output_file_name, "w") as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def OnButtonClick ():
        file = tkFileDialog.askopenfile(parent=root,mode='rb',title='Select a file')
        if file != None:
            print "Initializing... Please Wait"
            ini_db()
            
            file_list=file.readlines()

            for line in file_list:
                
                line=line.strip()
                fp1=open(line,"r")
                document_count()
                text=fp1.read()    
                #dictionary to store word frequency in the text (temporary)
                doc_word_freq={}
                #Tokenize 
                from nltk.tokenize import WordPunctTokenizer
                tokenizer = WordPunctTokenizer()
                text2=tokenizer.tokenize(text)
            



                #removing stopwords
                from nltk.corpus import stopwords
                eng_stop=set(stopwords.words('english'))
                text3=[word for word in text2 if word not in eng_stop]

                #pos tag
                import nltk
                text4=nltk.pos_tag(text3)
                text5=filter_for_tags(text4)


                #calculate frequency of word in the text
                for word in text5:
                    if word in doc_word_freq:
                        doc_word_freq[word] += 1
                    else:
                        if(word != "'"):
                            doc_word_freq[word] = 1

                #update occurrence of word in global table
                for (word,freq) in doc_word_freq.items():
                    if (check(word)):
                        update_record(word)
                    else:
                        add_new_word(word)
            print "Initialization Done...\n\n"
            file.close()
Exemple #29
def convert(sgm_path, apf_path, bio_path=None):
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except:
        print 'Something went wrong when opening/parsing the XML file or opening the output file'
        return
    
    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
    
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)
    
    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)
        
    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print 'entity mention head span not found', span, apf_path
    
    for t in ts:
        #print t.text, t.span
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
Exemple #30
 def word_tokenizePT(self, text, tokenizer):
     """ tokenize a Portuguese sentence into words
     @input params: text - a sentence or phrase (string)
                    tokenizer - "TB" for TreebankWordTokenizer
                                "WP" for WordPunctTokenizer
     @returns list of words, or an error message """
     if tokenizer == "TB":
         tokenizerTB = TreebankWordTokenizer()
         return tokenizerTB.tokenize(text)
     elif tokenizer == "WP":
         tokenizerWP = WordPunctTokenizer()
         return tokenizerWP.tokenize(text)
     else:
         return "tokenizer error: not found" 
Exemple #31
	def wordtokenizer(sentence):
	    words=WordPunctTokenizer().tokenize(sentence)
	    return words
Exemple #32
 def __init__(self):
     self.tokenizer = WordPunctTokenizer()
     # Load the pre-trained word2vec model (GoogleNews vectors)
     self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)
Exemple #33
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not","weren't":"were not",
"haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not", "wouldn't":"would not",
"don't":"do not", "doesn't":"does not","didn't":"did not", "can't":"can not","couldn't":"could not",
"shouldn't":"should not","mightn't":"might not", "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    # join the remaining tokens back into a cleaned string
    return (" ".join(words)).strip()
def main():
    tokenizer = WordPunctTokenizer()

    posts = []

    # 'questions-textonly.txt'
    with open(sys.argv[1], 'r') as f:
        for line in f:
            line = line[:-1]
            posts.append(line)

    # 'answers-textonly.txt'
    with open(sys.argv[2], 'r') as f:
        for line in f:
            line = line[:-1]
            posts.append(line)

    #print(len(posts))

    posts_lengths = []

    for post in posts:
        tokens = tokenizer.tokenize(post)
        tokenCount = len(tokens)
        posts_lengths.append(tokenCount)

    #print(len(posts_lengths))

    posts_lengths.sort()

    posts_lengths_unique = set(posts_lengths)

    #print(len(posts_lengths_unique))

    posts_lengths_unique_list = list(posts_lengths_unique)

    posts_lengths_count = []

    prevCount = posts_lengths[0]
    currCount = posts_lengths[0]
    n = 0

    for i in range(len(posts_lengths)):
        currCount = posts_lengths[i]
        if (currCount == prevCount):
            n += 1
        else:
            posts_lengths_count.append(n)
            n = 1
        prevCount = currCount

    posts_lengths_count.append(n)

    #print(len(posts_lengths_count))

    #posts_lengths_unique_list.index(21)
    #posts_lengths_unique_list.index(101)
    #posts_lengths_unique_list.index(502)

    posts_lengths_counts = np.array(posts_lengths_count)
    posts_lengths_Counts = np.array([])
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[0])
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[1:21].sum())
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[21:101].sum())
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[101:387].sum())
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[387:].sum())

    posts_labels = np.char.array(['0', '1~20', '21~100', '101~500', '>500'])
    percents = 100. * posts_lengths_Counts / posts_lengths_Counts.sum()
    labels = [
        '{0} : {1:1.2f} % '.format(label, percentage)
        for label, percentage in zip(posts_labels, percents)
    ]

    patches, texts = plt.pie(posts_lengths_Counts, shadow=True, startangle=90)
    plt.legend(patches,
               labels,
               bbox_to_anchor=(0.2, 0.27),
               loc=1,
               fontsize='medium',
               borderaxespad=1.0)
    plt.title('Distribution of posts having X number of tokens')
    plt.savefig('posts_distribution.png')
    plt.show()
Exemple #35
import flair
import torch
from flair.models import SequenceTagger
from nltk.tokenize import WordPunctTokenizer, PunktSentenceTokenizer

flair.device = torch.device('cpu')

word_tokenizer = WordPunctTokenizer()
tagger = SequenceTagger.load('fr-ner')
sent_tokenizer = PunktSentenceTokenizer(
    "nltk_data/tokenizers/punkt/french.pickle")
class Pipeline:
    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                          max_features=50000)
        # self.classifier = LinearSVC(random_state=seed)
        self.classifier = LogisticRegression(random_state=seed,
                                             multi_class='multinomial')
        # self.classifier = RidgeClassifier(random_state=seed)
        # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=1)
        # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=3)
        # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=5)
        self.classifier = Perceptron(random_state=seed)
        # Raw file
        self.train_file = "raw/train_tweets.txt"
        self.test_file = "raw/test_tweets_unlabeled.txt"
        # Cleaned file
        self.train_file_cleaned = "data/train_tweets_cleaned.txt"
        self.test_file_cleaned = "data/test_tweets_cleaned.txt"
        self.total_file_cleaned = "data/total_tweets_cleaned.txt"
        # Vector File
        self.train_vector = "vector/train.vec"
        self.test_vector = "vector/test.vec"
        # Label File
        self.train_label = "label/train_label.txt"
        self.test_label = "label/test_label.csv"

    def tokenize(self):
        print("Tokenizing...")
        train_file_cleaned = open(self.train_file_cleaned, 'w')
        test_file_cleaned = open(self.test_file_cleaned, 'w')
        total_file_cleaned = open(self.total_file_cleaned, 'w')
        train_label = open(self.train_label, 'w')
        with open(self.train_file) as train_data:
            for line in train_data:
                label, tweet = line.strip().split('\t', 1)[:2]
                train_label.write(label + '\n')
                tokenized_tweet = " ".join(self.tokenizer.tokenize(tweet))
                train_file_cleaned.write(tokenized_tweet + '\n')
                total_file_cleaned.write(tokenized_tweet + '\n')
        with open(self.test_file) as test_data:
            for line in test_data:
                tokenized_tweet = " ".join(self.tokenizer.tokenize(line))
                test_file_cleaned.write(tokenized_tweet + '\n')
                total_file_cleaned.write(tokenized_tweet + '\n')

    def vectorize(self):
        print("Fitting vectorizer...")
        self.vectorizer.fit(open(self.total_file_cleaned))
        print("Vectorizing train file...")
        train_vector = self.vectorizer.transform(open(self.train_file_cleaned))
        print("Train vector: ", train_vector.shape)
        print("Vectorizing test file...")
        test_vector = self.vectorizer.transform(open(self.test_file_cleaned))
        print("Test vector: ", test_vector.shape)
        print("Saving...")
        pickle.dump(train_vector, open(self.train_vector, 'wb'))
        pickle.dump(test_vector, open(self.test_vector, 'wb'))

    def evaluate(self):
        train_vector = pickle.load(open(self.train_vector, 'rb'))
        train_label = []
        with open(self.train_label) as file:
            for line in file:
                train_label.append(int(line))
        print("Total Data: ", train_vector.shape)
        X_train, X_evl, y_train, y_evl = train_test_split(train_vector,
                                                          train_label,
                                                          test_size=0.5,
                                                          random_state=seed)
        _, X_train, _, y_train = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.1,
                                                  random_state=seed)
        _, X_evl, _, y_evl = train_test_split(X_evl,
                                              y_evl,
                                              test_size=0.1,
                                              random_state=seed)
        print(
            "Training set has {} instances. Test set has {} instances.".format(
                X_train.shape[0], X_evl.shape[0]))
        start = time.time()
        print("Training Classifier...")
        self.classifier.fit(X_train, y_train)
        pred_labels = self.classifier.predict(X_evl)
        print("Training successfully in %s seconds " %
              int(time.time() - start))
        print("Evaluate Accuracy: %0.2f" %
              (accuracy_score(y_evl, pred_labels) * 100))

    def classify(self):
        train_vector = pickle.load(open(self.train_vector, 'rb'))
        train_label = []
        with open(self.train_label) as file:
            for line in file:
                train_label.append(int(line))
        print("Total Data: ", train_vector.shape)
        start = time.time()
        print("Training Classifier...")
        self.classifier.fit(train_vector, train_label)
        print("Training successfully in %s seconds " %
              int(time.time() - start))
        print("Predicting...")
        test_vector = pickle.load(open(self.test_vector, 'rb'))
        test_label = self.classifier.predict(test_vector)
        df = pd.DataFrame(test_label, columns=['Predicted'])
        df.index += 1
        df.index.name = 'Id'
        df.to_csv(self.test_label)
Exemple #37
from polyglot.text import Text
import nltk
import re
import spacy
import pymongo
from tqdm import tqdm
import json
from joblib import Parallel, delayed
from fuzzywuzzy import process
import gensim
import threading
import gensim.corpora as corpora
import pandas as pd
import config
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

tok = WordPunctTokenizer()
sent_detector = nltk.tokenize.punkt.PunktSentenceTokenizer()

nlp = spacy.load('en_core_web_sm')
nlp_spacy = spacy.load('en_core_web_sm')
from config import nlp_corenlp

nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


class AsyncNLPProcess(threading.Thread):
    def __init__(self, Task_Complete):
        super().__init__()
        self.Task_Complete = Task_Complete
def run():
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split()  # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj

        obj = {
            "title":
            x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link":
            x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published":
            x.get("properties", {}).get("published",
                                        {}).get("stringValue", ""),
            "description":
            x.get("properties", {}).get("description",
                                        {}).get("stringValue", ""),
            "content":
            x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    def get_named_entities(mdl, tokens):
        stemmer = TurkishStemmer()
        res = mdl.analyze(tokens)
        entities = []
        for entity in res["entities"]:
            for entity2 in entity["text"].split(", "):
                ne = stemmer.stem(entity2).split("'")[0]
                entities.append((entity["type"], ne, entity["score"]))
        return entities

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >>
        ReadFromText("news.json",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >>
        beam.Map(removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )

    tokens = tokens_1gram

    def process_tokens_last(doc, tokens):
        return (doc, get_named_entities(tokens))

    doc_named_entities = (
        tokens
        | beam.GroupByKey()
        #     | beam.Map(lambda (doc, tokens): process_tokens_last(mdl, tokens))
    )

    (doc_named_entities | "Write Results" >> WriteToText("doc_tokens"))

    p.run()
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer, TreebankWordTokenizer
# text_to_word_sequence is assumed to come from Keras here
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from konlpy.tag import Okt, Kkma

tokenizer = TreebankWordTokenizer()

okt = Okt()

kkma = Kkma()

print(
    word_tokenize(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # everything is tokenized

print(WordPunctTokenizer().tokenize(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))  # the apostrophe is split off as its own token

print(
    text_to_word_sequence(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # "don't" is recognized as a single token

text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."  # "home-based" is kept as one token and "doesn't" is split into "does"/"n't" --> same as the standard word tokenizer

print(tokenizer.tokenize(text))

sentence = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to mae sure no one was near."
print(sent_tokenize(sentence))

korean_sentence = "딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?"
 def get_tokensForBigData(self,text):
     tokens = WordPunctTokenizer().tokenize(text)
     words = [x for x in tokens if x not in string.punctuation
              and x not in ['.','."', '".', '?"', '!"', '%"', '%.','@']]
     return words
__author__ = 'mdenil'
 def get_tokens(self,text):
     textLow = text.lower()
     tokens = WordPunctTokenizer().tokenize(textLow)
     words = [x for x in tokens if x not in string.punctuation and x not in ['."', '".', '?"', '!"', '%"', '%.']]
     return words
Exemple #43
    open('../data/parent', 'rb')
)  # parent is a dict(), which stores the ids of each query's duplicate questions

querys = read_data.methods_to_classes(read_data.read_querys_from_file())
#querys = querys[0:100]
print 'loading data finished'

mrr = 0.0
map = 0.0

for item in querys:

    query = item[0]
    true_apis = item[1]

    query_words = WordPunctTokenizer().tokenize(query.lower())
    query_words = [
        SnowballStemmer('english').stem(word) for word in query_words
    ]

    query_matrix = similarity.init_doc_matrix(query_words, w2v)
    query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

    top_questions = recommendation.get_topk_questions(query, query_matrix,
                                                      query_idf_vector,
                                                      questions, 50, parent)
    recommended_api = recommendation.recommend_api_class(
        query_matrix, query_idf_vector, top_questions, questions, javadoc,
        javadoc_dict_classes, -1)
    #recommended_api = recommendation.recommend_api_class_baseline(query_matrix,query_idf_vector,javadoc,-1)
            re.compile('^' + l.strip() + '$')
            for l in open(os.path.join(LIWC_dir, '%s' % (c)), 'r')
            if l.strip() not in stopwords
        ]
        for c in LIWC_categories
    }
    # replace positive/negative affect
    LIWC_categories += ['positive', 'negative']
    LIWC_categories.remove('positive_affect')
    LIWC_categories.remove('negative_affect')
    LIWC_category_wordlists['positive'] = LIWC_category_wordlists.pop(
        'positive_affect')
    LIWC_category_wordlists['negative'] = LIWC_category_wordlists.pop(
        'negative_affect')

    TKNZR = WordPunctTokenizer()
    full_slice_list = set(range(N_SLICES))
    # we count either the total number of tokens
    # or the number of unique tokens
    # count_option = 'total'
    count_option = 'unique'
    data = pd.read_csv(sub_file, sep='\t', index_col=False)
    data.sort_values('slice', ascending=True)
    fname = os.path.basename(sub_file).replace('.tsv', '')
    out_dir = os.path.dirname(sub_file)
    empty_slices = full_slice_list - set(data['slice'].unique())
    if (len(empty_slices) > 0):
        print('filling %s with empty slices %s' % (e_name, empty_slices))
        empty_slice_rows = pd.DataFrame([{
            'slice': c,
            'dialogue': ''
Exemple #45
def run():
    import pickle
    import sys

    import math

    import numpy as np
    import apache_beam as beam

    reload(sys)
    sys.setdefaultencoding('utf8')

    import argparse
    import simplejson
    from gensim.models import KeyedVectors

    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.textio import ReadFromText, WriteToText
    import nltk.data
    from nltk.tokenize import WordPunctTokenizer
    import re
    import uuid
    import perceptron
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj

        obj = {
            "title":
            x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link":
            x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published":
            x.get("properties", {}).get("published",
                                        {}).get("stringValue", ""),
            "description":
            x.get("properties", {}).get("description",
                                        {}).get("stringValue", ""),
            "content":
            x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >>
        ReadFromText("news.json",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >>
        beam.Map(removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )
    """
    tokens_2gram = (pairs
            | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
        )
    """

    tokens = tokens_1gram
    """
    vocabulary = (tokens
            | "Get words only" >> beam.Values()
            | "Remove duplicate words" >> beam.RemoveDuplicates()
        )
    vocabulary_size = (vocabulary
            | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
        )

    doc_total_words = (tokens
            | "Count Words of Doc" >> beam.combiners.Count.PerKey()
    )
    """

    tokens_paired_with_1 = (
        tokens
        | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1)))
    """
    token_counts_per_doc = (tokens_paired_with_1
            | "Group by Doc,Word" >> beam.GroupByKey()
            | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts))))
            | "Group by Doc" >> beam.GroupByKey()
        )



    num_docs = (token_counts_per_doc
            | "Get Docs" >> beam.Keys()
            | "Count Docs" >> beam.combiners.Count.Globally()
    )


    word_tf_pre = (
        { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc }
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']

        [tokens_total] = count['total_tokens']

        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)


    doc_word_tf = (word_tf_pre
        | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
        )

    word_occurrences = (tokens
        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
        | "Group by Word" >> beam.GroupByKey()
        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
    )

    token_df = (
        word_occurrences
        | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs)))

    token_tf_df = (
        { 'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
      [df] = tfdf['document_frequency']
      for doc, tf in tfdf['term_frequency']:
        yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
        | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
    )
    """

    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):

        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)

        return x

    def analyze_sentiment(x):

        res = perceptron.f(x, w, b)

        return res

    doc_sentiment = (
        tokens_paired_with_1
        | "Create Word2Vec Vector" >> beam.Map(lambda ((doc, token), cnt):
                                               (doc, get_vec(word2vec, token)))
        | "Group Word2Vec Vectors By Document" >> beam.CombinePerKey(sum)
        | "Sum Word2Vec Vectors" >> beam.Map(lambda (doc, vec):
                                             (doc, analyze_sentiment(vec)[0])))

    result = (doc_sentiment | "Format  Results" >>
              beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens)))

    (result | "Write Results" >> WriteToText("sentiments"))

    p.run()
Exemple #46
class TextDatasetReader(DatasetReader):
    """
    Reads raw text, finds replaceable words, generates instance: word with context
    """
    @classmethod
    def read_dict(cls, file_path, limit_words=-1, limit_freq=0):
        word_dict = {}
        with open(file_path) as fd:
            for idx, line in enumerate(fd):
                word, *freq = line.strip().split()

                if idx == limit_words:
                    break

                if len(freq) > 0:
                    freq = freq[0]
                    freq = int(freq)
                    if freq < limit_freq:
                        break
                else:
                    freq = 1

                word_dict[word] = freq

        return word_dict

    def __init__(self,
                 dict_path,
                 limit_words=-1,
                 limit_freq=0,
                 max_context_size: int = 4,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 target_indexers: Dict[str, TokenIndexer] = None):
        """

        :param dict_path: path to the dict of acceptable words to change
        :param limit_words: max number of words read from the dictionary
        :param limit_freq: minimum frequency of words to keep
        :param max_context_size: max number of context tokens on each side
        """
        super().__init__(lazy=True)
        self.max_context_size = max_context_size
        self.word_dict = self.read_dict(dict_path, limit_words, limit_freq)

        self.tokenizer = WordPunctTokenizer()
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.target_indexer = target_indexers or {
            "target":
            SingleIdTokenIndexer(namespace='target', lowercase_tokens=True),
            "tokens":
            SingleIdTokenIndexer()
        }

        self.left_padding = 'BOS'
        self.right_padding = 'EOS'

    def text_to_instance(self, tokens, idx) -> Instance:

        target_word = tokens[idx]

        left_context, right_context = self.get_context(tokens, idx,
                                                       self.max_context_size)

        if len(left_context) < self.max_context_size:
            left_context = [self.left_padding] + left_context
        if len(right_context) < self.max_context_size:
            right_context = right_context + [self.right_padding]

        left_context = TextField([Token(token) for token in left_context],
                                 self.token_indexers)
        right_context = TextField([Token(token) for token in right_context],
                                  self.token_indexers)

        target_token_field = TextField([Token(target_word)],
                                       self.target_indexer)

        return Instance({
            "left_context": left_context,
            "right_context": right_context,
            "word": target_token_field
        })

    @classmethod
    def get_context(cls, tokens, idx, size):
        """
        >>> TextDatasetReader.get_context([1,2,3,4,5,7], 1, 2)
        ([1], [3, 4])

        >>> TextDatasetReader.get_context([1,2,3,4,5,7], 4, 2)
        ([3, 4], [7])


        :param tokens:
        :param idx:
        :param size:
        :return:
        """
        return tokens[max(idx - size, 0):idx], tokens[idx + 1:idx + size + 1]

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as fd:
            for line in fd:
                tokens = self.tokenizer.tokenize(line)
                for idx, token in enumerate(tokens):
                    if token in self.word_dict:
                        yield self.text_to_instance(tokens, idx)
import time
import re
from string import punctuation
# from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

from conf.configure import Configure
from utils import data_utils
from utils.text.preprocessor import TextPreProcessor
from utils import jobs
from optparse import OptionParser

# english_stopwords = set(stopwords.words('english'))
word_tokenize = WordPunctTokenizer().tokenize
preprocessor = TextPreProcessor()
stop_words = ['the', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'what', 'which', 'this', 'that', 'these',
              'those', 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while',
              'during', 'to', 'What', 'Which', 'Is', 'If', 'While', 'This']


def get_unigram_words(que):
    """
    Get the unigram tokens, excluding stop words.
    """
    return [word for word in word_tokenize(que.lower()) if word not in stop_words]
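For example (the output depends only on the WordPunctTokenizer pattern and the custom stop_words list above; punctuation survives because it is not in that list):

print(get_unigram_words("What is the best way to learn NLP?"))
# -> ['best', 'way', 'learn', 'nlp', '?']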


def generate_unigram_words_features(df):
    df['unigrams_ques1'] = df['question1'].apply(lambda x: get_unigram_words(str(x)))
# The WordPunctTokenizer below splits every punctuation symbol off as a separate token.
import sys
import argparse
from nltk.tokenize import WordPunctTokenizer, TreebankWordTokenizer

# word tokenizing for English with NLTK library
# Written by Ye Kyaw Thu, LST, NECTEC, Thailand
# Date: 12 July 2021
# Reference: Python 3 Text Processing with NLTK 3 Cookbook
# The NLTK book also discusses PunktWordTokenizer, but later NLTK versions no longer include it...
# Reference: https://stackoverflow.com/questions/44238864/importerror-cannot-import-name-punktwordtokenizer/53923708

# How to run:
# $ echo "Don't do it! I can't stand it!" | python ./en-tokenization-on-punctuation.py

parser = argparse.ArgumentParser()
parser.add_argument('inputFile',
                    default=sys.stdin,
                    type=argparse.FileType('r'),
                    nargs='?')

args = parser.parse_args()
textLines = args.inputFile.readlines()

tb_tokenizer = TreebankWordTokenizer()
wp_tokenizer = WordPunctTokenizer()

count = 0
for line in textLines:
    count += 1
    print("Treebank: ", tb_tokenizer.tokenize(line))
    print("WordPunct", wp_tokenizer.tokenize(line))
    "wouldn't": "would not",
    "aren't": "are not",
    "haven't": "have not",
    "doesn't": "does not",
    "didn't": "did not",
    "don't": "do not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "mightn't": "might not",
    "mustn't": "must not"
}
negation_pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')

from nltk.tokenize import WordPunctTokenizer
tokenizer1 = WordPunctTokenizer()
tokenizer2 = WordPunctTokenizer()

corpus_summary = []
for i in range(0, 3000):
    stripped = re.sub(combined_pat, '', dataset2['summary'][i])
    stripped = re.sub(www_pat, '', stripped)
    cleantags = re.sub(html_tag, '', stripped)
    #lower_case = cleantags.lower()
    neg_handled = negation_pattern.sub(lambda x: negations_[x.group()],
                                       cleantags)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    tokens = tokenizer1.tokenize(letters_only)
    tokens = ' '.join(tokens)
    corpus_summary.append(tokens)
Exemple #50
import torch
import nltk
from nltk import tokenize
from nltk.tokenize import TweetTokenizer
import json
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from tqdm import *
from collections import defaultdict
import operator
import random
from nltk.tokenize import WordPunctTokenizer
import h5py

wpt = WordPunctTokenizer()

min_context_len = 20
max_context_len = 350
min_question_len = 2
max_question_len = 30
max_answer_len = 30

def helper(data_path,voc_path, number_data = None):
    
    data = json.load(open(data_path))
    voc = json.load(open(voc_path))

    p_set = []
    p_len_set = []
    p_c_s_e_set = []
Exemple #51
def nltkSplit(testDt_List):
    seg_Dt1_List = []
    # iterate over every element; range(len(...) - 1) would skip the last one
    for i in range(0, len(testDt_List)):
        seg_Dt1 = WordPunctTokenizer().tokenize(testDt_List[i])
        seg_Dt1_List.append(seg_Dt1)
    return seg_Dt1_List
Exemple #52
def train_Bayes():

    ripple = pd.read_table('ripple_train.csv', sep=',')
    btc = pd.read_table('btc_train.csv', sep=',')
    bitcoin = pd.read_table('bitcoin_train.csv', sep=',')
    cryptocurrency = pd.read_table('cryptocurrency_train.csv', sep=',')
    cryptomarkets = pd.read_table('cryptomarkets_train.csv', sep=',')
    ethereum = pd.read_table('ethereum_train.csv', sep=',')
    iota = pd.read_table('iota_train.csv', sep=',')
    litecoin = pd.read_table('litecoin_train.csv', sep=',')
    neo = pd.read_table('neo_train.csv', sep=',')
    stellar = pd.read_table('stellar_train.csv', sep=',')

    # pandas Series.append() returns a new Series rather than modifying in
    # place, so build the combined Series explicitly with pd.concat.
    frames = [ripple, btc, bitcoin, cryptocurrency, cryptomarkets,
              ethereum, iota, litecoin, neo, stellar]
    headlines = pd.concat([f['headline'] for f in frames], ignore_index=True)
    labels = pd.concat([f['label'] for f in frames], ignore_index=True)

    reformat = token_format(headlines)

    train = list(zip(reformat, labels))

    dictionary = set(word.lower() for passage in train
                     for word in WordPunctTokenizer().tokenize(passage[0]))

    print("First couple of titles and their associated values:")
    print(train[0])
    print(train[1])
    print(train[2])
    print(train[3])

    t = [({
        word: (word in WordPunctTokenizer().tokenize(x[0]))
        for word in dictionary
    }, x[1]) for x in train]

    classifier = nltk.NaiveBayesClassifier.train(t)

    model = open('bayes_model.pickle', 'wb')
    words = open('dictionary.pickle', 'wb')
    pickle.dump(classifier, model)
    pickle.dump(dictionary, words)
    model.close()
    words.close()
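
# A minimal usage sketch (not from the original source), assuming train_Bayes()
# has already written bayes_model.pickle and dictionary.pickle; classify_headline
# is a hypothetical helper that rebuilds bag-of-words features over the saved
# dictionary for a new headline and classifies it.
def classify_headline(headline):
    with open('bayes_model.pickle', 'rb') as model_file:
        classifier = pickle.load(model_file)
    with open('dictionary.pickle', 'rb') as words_file:
        dictionary = pickle.load(words_file)
    tokens = set(WordPunctTokenizer().tokenize(headline.lower()))
    features = {word: (word in tokens) for word in dictionary}
    return classifier.classify(features)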
#coding=utf-8
import numpy as np
import json
import pickle
import nltk
from nltk.tokenize import WordPunctTokenizer
from collections import defaultdict

# Use NLTK's sentence splitter and word tokenizer
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#print (type(sent_tokenizer))
word_tokenizer = WordPunctTokenizer()
#print (type(word_tokenizer))

# Record each word and how often it occurs
word_freq = defaultdict(int)
#print (word_freq)

# Read the dataset, tokenize each review, count how often every word occurs, and store the counts in word_freq
with open('yelp_academic_dataset_review.json', 'rb') as f:
    for line in f:
        review = json.loads(line.decode('utf-8'))
        words = word_tokenizer.tokenize(review['text'])
        #print (type(words))  #list
        #print (len(words))  # includes punctuation marks
        for word in words:
            word_freq[word] += 1
    #print (review)
    #print (type(review))
    print (word_freq[','])
    print (word_freq['.'])
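
# A hedged follow-up sketch (not in the original snippet): sort word_freq by
# frequency, build a word -> id vocabulary ('UNK' is an assumed placeholder for
# out-of-vocabulary words), and persist the raw counts with pickle.
sorted_words = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)
vocab = {'UNK': 0}
for word, _count in sorted_words:
    vocab[word] = len(vocab)
with open('word_freq.pickle', 'wb') as out_f:
    pickle.dump(word_freq, out_f)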
Exemple #54
import gzip
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer


def parse(path):
    # Stream the gzipped JSON-lines review file, yielding one record per line.
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)


def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')


df = getDF('reviews_Musical_Instruments_5.json.gz')
df.columns
i = 0
word_punct_tokenizer = WordPunctTokenizer()
punct = list(string.punctuation)
stopword_list = stopwords.words('english') + punct + ['rt', 'via']

filt3 = []
uniq = []
i = 0
for index, rows in df.iterrows():
    reviews = rows['reviewText']
    reviews = reviews.lower()
    tokens = nltk.word_tokenize(reviews)
    tokens2 = word_punct_tokenizer.tokenize(reviews)
    # Drop stopwords, punctuation and noise tokens using the precomputed stopword_list.
    filtered_tokens = [word for word in tokens2 if word not in stopword_list]
    post = nltk.pos_tag(filtered_tokens)
parser.add_argument("--normquotes",
                    help="Normalize any quotes to quote single type.",
                    default=1)
parser.add_argument("--wptokenizer",
                    help="Additionally apply treebank tokenizer.",
                    default=1)

pa = parser.parse_args()
sentid = int(pa.sentid)
normquotes = int(pa.normquotes)
wptokenizer = int(pa.wptokenizer)

if __name__ == "__main__":

    st = PunktSentenceTokenizer()
    wtw = WordPunctTokenizer() if wptokenizer == 1 else None
    wtt = TreebankWordTokenizer()

    for line in sys.stdin:

        line = line.decode("utf-8")

        if sentid == 1:
            m = textid_re.search(line)
            if m:
                sys.stdout.write(u".\n{{{%s}}}!!!\n" % m.group(1))
                continue
            if line == "\n":
                continue

        if normquotes == 1:
def tokenize_text(text, punct=False):
    text = WordPunctTokenizer().tokenize(text)
    text = [word for word in text if punct or word.isalnum()]
    text = ' '.join(text)
    text = text.strip()
    return text
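
# A brief usage sketch (not from the original source): punctuation tokens are
# dropped unless punct=True is passed.
tokenize_text("Hello, world!")               # -> 'Hello world'
tokenize_text("Hello, world!", punct=True)   # -> 'Hello , world !'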
Exemple #57
def init_word_tokenizer():
    global word_tokenizer
    if word_tokenizer is None:
        word_tokenizer = WordPunctTokenizer()
Exemple #58
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--treebank",
                    help="Additionally apply treebank tokenizer.",
                    default=1)

pa = parser.parse_args()
treebank = int(pa.treebank)

if __name__ == "__main__":

    st = PunktSentenceTokenizer()
    wtw = TreebankWordTokenizer()
    wtt = WordPunctTokenizer()

    for line in sys.stdin:

        if line[0:7] == "TEXTID(":
            sys.stdout.write(line)
            continue

        if line == "\n":
            sys.stdout.write(line)
            continue

        if treebank == 0:
            line = line.replace("«", " ' ")
            line = line.replace("»", " ' ")
            line = line.replace("“", " ' ")
class SQUADSupportingFactsProcessor(JiantSupportingFactsProcessor):
    DOC_ID = "squad_sup_facts"

    word_tokenizer = WordPunctTokenizer()
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def process_file(self) -> List:
        """
        Converts a SQuAD dataset file into samples for the Supporting Facts Probing task in Jiant format
        :return: A list of samples in jiant edge probing format.
        """
        squad_data = self.json_from_file(self.input_path)['data']
        samples = []

        for article in squad_data:
            pars = article["paragraphs"]
            for par in pars:
                context = par["context"]

                tokenized_context = self.word_tokenizer.tokenize(context)
                sentences = list(
                    self.sentence_tokenizer.tokenize(context.strip()))

                # There must be at least two sentences in the paragraph.
                if len(sentences) < 2:
                    continue

                for qa in par["qas"]:
                    targets = []
                    answer = qa["answers"][0]
                    question = qa["question"]
                    question_id = qa["id"]

                    tokenized_question = self.word_tokenizer.tokenize(question)
                    question_length = len(tokenized_question)
                    sample_text = " ".join(tokenized_question) + " "

                    answer_char_position = answer["answer_start"]
                    answer_sentence_index = self.get_sentence_index_from_char_position(
                        answer_char_position, sentences)

                    found_answer_sentence_in_context = False

                    # go through all sentences in context
                    for sentence_index, sentence in enumerate(sentences):

                        tokenized_sentence = self.word_tokenizer.tokenize(
                            sentence)
                        sample_text += " ".join(tokenized_sentence) + " "

                        # get token start position for sentence in context
                        sentence_pos = self.find_sentence_position_in_context(
                            tokenized_context, tokenized_sentence)

                        if sentence_pos is None:
                            continue

                        # define sentence token span for jiant target
                        start_index = sentence_pos + question_length
                        end_index = start_index + len(tokenized_sentence)
                        sentence_span = [start_index, end_index]

                        # if sentence contains answer, set label to "1"
                        if sentence_index == answer_sentence_index:
                            label = "1"
                            found_answer_sentence_in_context = True
                        else:
                            label = "0"

                        targets.append(
                            self.create_target(question_length, sentence_span,
                                               label))

                    if not found_answer_sentence_in_context:
                        # could not find answer in context, skip this example
                        continue

                    sample = {
                        "info": {
                            "doc_id": self.DOC_ID,
                            "q_id": question_id
                        },
                        "text": sample_text.strip(),
                        "targets": targets
                    }

                    samples.append(sample)

        return samples

    @staticmethod
    def find_sentence_position_in_context(context: List,
                                          sentence_tokens: List) -> int:
        """
        Goes through a list of context tokens and tries to find the sentence tokens. If sentence tokens are found, the
        start index is returned.

        :param context: List of tokens in a context document.
        :param sentence_tokens: List of tokens in a sentence, that is supposed to be within the context.
        :return: The start token position of the sentence in the context. If not found returns None.
        """
        for token_index, token in enumerate(context):
            # check if current token equals the first sentence token
            if token == sentence_tokens[0]:

                match = True
                # go through all sentence tokens to see if they match with the following context tokens
                for i in range(1, len(sentence_tokens)):
                    if (len(context) > token_index + i and
                            context[token_index + i] == sentence_tokens[i]):
                        continue

                    match = False
                    break
                if match:
                    return token_index

    @staticmethod
    def get_sentence_index_from_char_position(char_pos: int,
                                              sentences: List) -> int:
        """
        Gets a list of sentences from a paragraph and returns the index of the sentence that contains a certain
        character position.
        :param char_pos: Character position in paragraph
        :param sentences: List of paragraph sentences
        :return: Index of the sentence that contains the character
        """
        char_count = 0
        for sentence_index, sentence in enumerate(sentences):
            char_count += len(sentence)
            if char_count >= char_pos:
                return sentence_index
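

# A brief illustrative check (not from the original source): the static helper
# returns the token index at which the sentence starts inside the context.
print(SQUADSupportingFactsProcessor.find_sentence_position_in_context(
    ["The", "sky", "is", "blue", ".", "Grass", "is", "green", "."],
    ["Grass", "is", "green", "."]))
# -> 5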
Exemple #60
import re
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from utilites import dump, load
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import WordPunctTokenizer
from pymorphy2 import MorphAnalyzer

pst = PunktSentenceTokenizer()
wpt = WordPunctTokenizer()
ma = MorphAnalyzer()


def find_stop_words(text):
    rez = []
    for word in wpt.tokenize(text):
        tags = ma.parse(word)[0].tag
        if 'UNKN' in tags or \
           'LATN' in tags or \
           'PNCT' in tags or \
           'NUMB' in tags or \
           'ROMN' in tags:
            rez += [word]
    return rez
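
# A quick illustrative call (not from the original source): tokens that pymorphy2
# tags as Latin script, numerals or punctuation are collected as "stop words".
print(find_stop_words("Сегодня в 10:00 встреча с John!"))
# expected output along the lines of: ['10', ':', '00', 'John', '!']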


try:
    word_normal_form = load('word_normal_form.json')
except FileNotFoundError: