def number_of_different_words(self):
        # TODO: Stemming, then move to language specific classes
        tokenizer = WordPunctTokenizer()
        words = tokenizer.tokenize(self.text.strip())
        only_textual_words = filter(unicode.isalpha, words)

        return len(set(only_textual_words))
    def tokens(self):
        """Tokenize the text.
        """
        tokenizer = WordPunctTokenizer()

        # Get token character spans.
        spans = list(tokenizer.span_tokenize(self.text))

        # Materialize the token stream.
        tokens = [self.text[c1:c2] for c1, c2 in spans]

        tags = pos_tag(tokens)

        return [

            Token(
                token=token.lower(),
                char1=c1,
                char2=c2,
                pos=pos,
            )

            for (c1, c2), token, (_, pos) in
            zip(spans, tokens, tags)

        ]
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object

    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            # check for the closing tag before the inside_xml_privileges guard,
            # otherwise the flag would never be reset
            inside_xml_privileges = False
            continue
        elif inside_xml_privileges:
            continue

        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
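A hedged usage sketch of extract_nl_text(): MalwareSample is stood in for by a throwaway namedtuple with a .strings attribute (purely hypothetical), and the WordNet corpus must be available for wordnet.synsets().

from collections import namedtuple

FakeSample = namedtuple("FakeSample", ["strings"])  # hypothetical stand-in for MalwareSample
sample = FakeSample(strings=[
    "GetProcAddress",                     # no WordNet synsets -> string is dropped
    "error opening configuration file",   # contains recognizable English words -> kept
])
print(extract_nl_text(sample))            # e.g. [['error', 'opening', 'configuration', 'file']]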
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove message-number references like ">>123"
    message_text = re.sub(r">>\d+", "", message)
    message_text = message_text.lower()
    # Normalize Cyrillic 'ё' to 'е'; re.UNICODE must be passed as flags=,
    # otherwise it is silently interpreted as the 'count' argument.
    message_text = re.sub(u"ё", u"е", message_text, flags=re.UNICODE)
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return lemmas
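message_to_wordlist() relies on module-level helpers that are not shown here (clean_str, morph, stemmer, plus the stopwords/re imports). Below is a minimal sketch of plausible definitions, assuming pymorphy2 for the 'l' mode and a Russian Snowball stemmer for the 's' mode; the real project may define them differently.

import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
import pymorphy2

morph = pymorphy2.MorphAnalyzer()      # lemmatizer used when lemmas_bool == 'l'
stemmer = SnowballStemmer("russian")   # stemmer used when lemmas_bool == 's'

def clean_str(s):
    # placeholder cleaner (assumption): keep word characters and whitespace only
    return re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)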
Exemple #5
def TextProcessor(src, tgt, low=True, num=True):

    print "processing "+src
    if low==True:
        print "lowercasing.."
    if num==True:
        print "removing numeric.."

    srcfile = codecs.open(src,"r","utf-8")
    tgtfile = codecs.open(tgt,"w","utf-8")

    word_punct_tokenizer = WordPunctTokenizer()

    linecount=0
    for line in srcfile:
        linecount+=1
        line = word_punct_tokenizer.tokenize(line)
        if low==True:
            for i in range(0,len(line)):
                line[i] = line[i].lower()
        if num==True:
            for i in range(0,len(line)):
                if line[i].isnumeric()==True:
                    line[i] = "<number>"

        tgtfile.write(listtostring(line))

    srcfile.close()
    tgtfile.close()
    print "done processing "+str(linecount)+" lines!!"
def tokenize_words(sentence):
    """
    :param sentence:
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
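For reference, WordPunctTokenizer splits on the pattern \w+|[^\w\s]+, so apostrophes and other punctuation become separate tokens:

print(tokenize_words("Don't stop!"))
# -> ['Don', "'", 't', 'stop', '!']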
Exemple #7
def class1():
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	docId = request.args.get('d')
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	featuresets = []
	tagSet = set()
	for d in collection.find():	
		bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
		if 'tags' not in d: continue
		for tag in d['tags']:
			featuresets.append((bagOfWords, tag))
			tagSet.add(tag)
	classifier = nltk.NaiveBayesClassifier.train(featuresets)

	d = collection.find_one({'_id' : ObjectId(docId)})

	#classifier.show_most_informative_features(100)
	cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
	probs = []
	for tag in tagSet:
		probs.append((tag, round(cl.prob(tag)*100) ))
	classifier.show_most_informative_features(n=20)
	probs = sorted(probs, key = lambda x : x[1],  reverse = True)
	return render_template('class1.html', probs = probs, d=d)
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        word = word.replace('&amp;','&').replace('&lt;','<').replace('&gt;','>').replace('&quot;','"').replace('&#39;',"'")
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        word = word.encode('ascii', 'ignore')

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',word) is not None:
            word = 'GENERIC_HTTP'

        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()

    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))

                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',word) is not None:
            word = 'GENERIC_HTTP'

        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

            data.append([cleaned_sentences, label])

    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
Exemple #10
def tfIdf():
	TFIDF_MIN_SCORE = 100
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	tfidf = []
	idfMap = create_idf_map()
	docs = collection.find()
	for d in docs:
		words = tokenizer.tokenize(d['content'].lower())
		# Count raw term frequencies over the full token list (not the set),
		# so repeated words are actually counted.
		tfMap = {}
		for word in words:
			if word not in tfMap:
				tfMap[word] = 1
			else:
				tfMap[word] += 1
		tfIdfValues = []
		for word in set(words):
			score = tfMap[word] * 1000 / idfMap[word]
			if score > TFIDF_MIN_SCORE:
				tfIdfValues.append((word, score))
		tfIdfValues = sorted(tfIdfValues, key = lambda x : x[1], reverse = True)
		d['tfidf'] = tfIdfValues
		tfidf.append({'d' : d,
					  'tfidf' : tfIdfValues})
		collection.save(d)


	genFreq = generaral_frequency(idfMap)
	return render_template("tfidf.html", documents = tfidf)
Exemple #11
def tokenize(text):
    """Tokenize a raw text.

    Args:
        text (str)

    Returns: list of {token, char1, char2, pos}
    """
    tokenizer = WordPunctTokenizer()

    # Get token character spans.
    spans = list(tokenizer.span_tokenize(text))

    # Materialize the token stream.
    tokens = [text[c1:c2] for c1, c2 in spans]

    # Tag parts-of-speech.
    tags = pos_tag(tokens)

    return [

        dict(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )

        for (c1, c2), token, (_, pos) in
        zip(spans, tokens, tags)

    ]
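A rough illustration of the return value of tokenize(); the POS tags depend on the tagger model installed, so treat them as indicative only.

# >>> tokenize("The cat sat.")
# [{'token': 'the', 'char1': 0, 'char2': 3, 'pos': 'DT'},
#  {'token': 'cat', 'char1': 4, 'char2': 7, 'pos': 'NN'},
#  {'token': 'sat', 'char1': 8, 'char2': 11, 'pos': 'VBD'},
#  {'token': '.', 'char1': 11, 'char2': 12, 'pos': '.'}]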
Exemple #12
	def words(self, fileid=None):
		"""
		Returns all of the words and punctuation symbols in the specified file
		that were in 'section//p' text nodes.
		"""
		elt = self.xml(fileid).iterfind('.//section//p')
		word_tokenizer = WordPunctTokenizer()
		return [
			word
			for node_text in (''.join(el.itertext()) for el in elt)
			for word in word_tokenizer.tokenize(node_text)
		]
Exemple #13
 def get_words_without_stopwords(self, text):
     stopwords = nltk.corpus.stopwords.words('english')
     stopwords.extend(string.punctuation)
     stopwords.append('')
     tokenizer = WordPunctTokenizer()
     tokens = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(text) \
               if token.lower().strip(string.punctuation) not in stopwords]
     return tokens
def extract_words(text):
    stemmer = PorterStemmer()

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)

    result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Exemple #15
def getBigram(haystack):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(haystack)
    bcf = BigramCollocationFinder.from_words(words)
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf.apply_word_filter(filter_stops)

    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
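A hedged usage sketch for getBigram(); it assumes the imports used above (BigramCollocationFinder, BigramAssocMeasures, stopwords, WordPunctTokenizer) and the NLTK stopwords corpus. "corpus.txt" is a hypothetical input file.

with open("corpus.txt") as f:
    print(getBigram(f.read()))   # the four top likelihood-ratio bigrams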
def get_tokens(sentence):
    """
    Tokenizes a single sentence
    :param sentence: a sentence string
    :return: list of tokens in the sentence
    """

    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
def change_db2(text, origin_dict, id):
    print origin_dict
    tokens_ar = []
    word_punct_tokenizer = WordPunctTokenizer()
    for token in word_punct_tokenizer.span_tokenize(origin_dict):
        tokens_ar.append(token)
    for line in text.split("\n"):
        markup_error_line = line.split(';')
        print "MARKUP", markup_error_line
        convert_coord_2dbformat(markup_error_line, tokens_ar, id)
Exemple #18
def tokenize(text): 
	tokens = tokenizer.tokenize(text)
	wordtokenizer = WordPunctTokenizer()
	wlist =[]
	for token in tokens:
		wtoken = wordtokenizer.tokenize(token)
		wlist = wlist+wtoken

	stems = stem_tokens(wlist, stemmer)
	return stems
def extract_words(text):
 stemmer = PorterStemmer()
 tokenizer = WordPunctTokenizer()
 tokens = tokenizer.tokenize(text)
 bigram_finder = BigramCollocationFinder.from_words(tokens)
 bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
 for bigram_tuple in bigrams:
  x = "%s %s" % bigram_tuple
  tokens.append(x)
 result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
 return result 
Exemple #20
def you_collocations(raw):

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(raw)

    bigrams = [(tokens[i], tokens[i +1]) for i in range(len(tokens)-1)]
    collocations = [(t1, t2) for (t1, t2) in bigrams if t1 == "you" or t1 == 'your']

    trigrams = [(tokens[i], tokens[i +1], tokens[i+2]) for i in range(len(tokens)-2)]
    trilocations = [(t1, t2, t3) for (t1, t2, t3) in trigrams if t1 == "you" or t1 == 'your']

    return collocations, trilocations
def extract_bigrams(text):
    text = remove_stopwords(text)
    tokenizer = WordPunctTokenizer()
    tokens = [token for token in set(tokenizer.tokenize(text)) if
              not is_number(token) and (is_valid_token(token) or is_name(token))]
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.dice, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [x.lower() for x in tokens if x not in stopwords.words("english") and len(x) > 3]
    return result
Exemple #22
def get_bigrams(text):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = []
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)

    return tokens
Exemple #23
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
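A sketch of the input this build_word_dictionary() expects: a JSON array of records with 'content' and 'abstract' string fields (the field names come from the code above; the file names and record contents below are hypothetical).

import json

records = [
    {"content": "Deep learning improves text classification.",
     "abstract": "We study deep learning for text."},
]
with open("papers.json", "w") as f:     # hypothetical input file
    json.dump(records, f)

build_word_dictionary("papers.json", "dictionary.json")
# words seen fewer than 5 times are dropped; 'PADDING' and 'UNKNOWN' are appended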
Exemple #24
def extract_words(text):
 	
    stemmer = PorterStemmer()
    if type(text) == str:
        text = unicode(text, "utf-8", errors="ignore")
    else:
        text = unicode(text)
     
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
 
    result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Exemple #25
def analyze(tweets):
    classifier = cache.get('classifier')
    if classifier is None:
        classifier = train_classifier()
        cache.set('classifier', classifier, None)
    tokenizer = WordPunctTokenizer()
    analyzed_tweets = []
    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.lower())
        featureset = word_feats(tokens)
        sentiment = classifier.prob_classify(featureset)
        analyzed_tweets.append(AnalyzedTweet(tweet, round(sentiment.prob('pos'),2), round(sentiment.prob('neg'),2)))
    return analyzed_tweets
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    # dictionary = list(sorted(w for w,c in dictionary.most_common(3000))) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Exemple #27
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with gzip.open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            # dictionary.update(text.split())
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 3)) + ["PADDING", "UNKNOWN"]

    with open(output_file_name, "w") as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
def OnButtonClick ():
        file = tkFileDialog.askopenfile(parent=root,mode='rb',title='Select a file')
        if file != None:
            print "Initializing... Please Wait"
            ini_db()
            
            file_list=file.readlines()

            for line in file_list:
                
                line=line.strip()
                fp1=open(line,"r")
                document_count()
                text=fp1.read()    
                #dictionary to store word frequency in the text (temporary)
                doc_word_freq={}
                #Tokenize 
                from nltk.tokenize import WordPunctTokenizer
                tokenizer = WordPunctTokenizer()
                text2=tokenizer.tokenize(text)
            



                #removing stopwords
                from nltk.corpus import stopwords
                eng_stop=set(stopwords.words('english'))
                text3=[word for word in text2 if word not in eng_stop]

                #pos tag
                import nltk
                text4=nltk.pos_tag(text3)
                text5=filter_for_tags(text4)


                #calculate frequency of word in the text
                for word in text5:
                    if word in doc_word_freq:
                        doc_word_freq[word] += 1
                    else:
                        if(word != "'"):
                            doc_word_freq[word] = 1

                #update occurrence of word in global table
                for (word,freq) in doc_word_freq.items():
                    if (check(word)):
                        update_record(word)
                    else:
                        add_new_word(word)
            print "Initialization Done...\n\n"
            file.close()
Exemple #29
def convert(sgm_path, apf_path, bio_path=None):
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except:
        print 'Something went wrong when opening/parsing the XML file or opening the output file'
        return
    
    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
    
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)
    
    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)
        
    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print 'entity mention head span not found', span, apf_path
    
    for t in ts:
        #print t.text, t.span
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
Exemple #30
 def word_tokenizePT(self, text, tokenizer):
     """ tokenize a Portuguese sentence into words
     @input params: text - a sentence or phrase (string)
                    tokenizer - "TB" for TreebankWordTokenizer
                                "WP" for WordPunctTokenizer
     @returns list of words, or an error message """
     if tokenizer == "TB":
         tokenizerTB = TreebankWordTokenizer()
         return tokenizerTB.tokenize(text)
     elif tokenizer == "WP":
         tokenizerWP = WordPunctTokenizer()
         return tokenizerWP.tokenize(text)
     else:
         return "tokenizer error: not found" 
Exemple #31
	def wordtokenizer(sentence):
	    words=WordPunctTokenizer().tokenize(sentence)
	    return words
Exemple #32
 def __init__(self):
     self.tokenizer = WordPunctTokenizer()
     # Load the pre-trained word2vec model (GoogleNews vectors)
     self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)
Exemple #33
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not","weren't":"were not",
"haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not", "wouldn't":"would not",
"don't":"do not", "doesn't":"does not","didn't":"did not", "can't":"can not","couldn't":"could not",
"shouldn't":"should not","mightn't":"might not", "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    # join the remaining tokens back into a cleaned string
    return (" ".join(words)).strip()
def main():
    tokenizer = WordPunctTokenizer()

    posts = []

    # 'questions-textonly.txt'
    with open(sys.argv[1], 'r') as f:
        for line in f:
            line = line[:-1]
            posts.append(line)

    # 'answers-textonly.txt'
    with open(sys.argv[2], 'r') as f:
        for line in f:
            line = line[:-1]
            posts.append(line)

    #print(len(posts))

    posts_lengths = []

    for post in posts:
        tokens = tokenizer.tokenize(post)
        tokenCount = len(tokens)
        posts_lengths.append(tokenCount)

    #print(len(posts_lengths))

    posts_lengths.sort()

    posts_lengths_unique = set(posts_lengths)

    #print(len(posts_lengths_unique))

    posts_lengths_unique_list = list(posts_lengths_unique)

    posts_lengths_count = []

    prevCount = posts_lengths[0]
    currCount = posts_lengths[0]
    n = 0

    for i in range(len(posts_lengths)):
        currCount = posts_lengths[i]
        if (currCount == prevCount):
            n += 1
        else:
            posts_lengths_count.append(n)
            n = 1
        prevCount = currCount

    posts_lengths_count.append(n)

    #print(len(posts_lengths_count))

    #posts_lengths_unique_list.index(21)
    #posts_lengths_unique_list.index(101)
    #posts_lengths_unique_list.index(502)

    posts_lengths_counts = np.array(posts_lengths_count)
    posts_lengths_Counts = np.array([])
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[0])
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[1:21].sum())
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[21:101].sum())
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[101:387].sum())
    posts_lengths_Counts = np.append(posts_lengths_Counts,
                                     posts_lengths_counts[387:].sum())

    posts_labels = np.char.array(['0', '1~20', '21~100', '101~500', '>500'])
    percents = 100. * posts_lengths_Counts / posts_lengths_Counts.sum()
    labels = [
        '{0} : {1:1.2f} % '.format(label, percentage)
        for label, percentage in zip(posts_labels, percents)
    ]

    patches, texts = plt.pie(posts_lengths_Counts, shadow=True, startangle=90)
    plt.legend(patches,
               labels,
               bbox_to_anchor=(0.2, 0.27),
               loc=1,
               fontsize='medium',
               borderaxespad=1.0)
    plt.title('Distribution of posts having X number of tokens')
    plt.savefig('posts_distribution.png')
    plt.show()
Exemple #35
import flair
import torch
from flair.models import SequenceTagger
from nltk.tokenize import WordPunctTokenizer, PunktSentenceTokenizer

flair.device = torch.device('cpu')

word_tokenizer = WordPunctTokenizer()
tagger = SequenceTagger.load('fr-ner')
sent_tokenizer = PunktSentenceTokenizer(
    "nltk_data/tokenizers/punkt/french.pickle")
class Pipeline:
    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                          max_features=50000)
        # self.classifier = LinearSVC(random_state=seed)
        self.classifier = LogisticRegression(random_state=seed,
                                             multi_class='multinomial')
        # self.classifier = RidgeClassifier(random_state=seed)
        # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=1)
        # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=3)
        # self.classifier = KNeighborsClassifier(n_jobs=4, n_neighbors=5)
        self.classifier = Perceptron(random_state=seed)
        # Raw file
        self.train_file = "raw/train_tweets.txt"
        self.test_file = "raw/test_tweets_unlabeled.txt"
        # Cleaned file
        self.train_file_cleaned = "data/train_tweets_cleaned.txt"
        self.test_file_cleaned = "data/test_tweets_cleaned.txt"
        self.total_file_cleaned = "data/total_tweets_cleaned.txt"
        # Vector File
        self.train_vector = "vector/train.vec"
        self.test_vector = "vector/test.vec"
        # Label File
        self.train_label = "label/train_label.txt"
        self.test_label = "label/test_label.csv"

    def tokenize(self):
        print("Tokenizing...")
        train_file_cleaned = open(self.train_file_cleaned, 'w')
        test_file_cleaned = open(self.test_file_cleaned, 'w')
        total_file_cleaned = open(self.total_file_cleaned, 'w')
        train_label = open(self.train_label, 'w')
        with open(self.train_file) as train_data:
            for line in train_data:
                label, tweet = line.strip().split('\t', 1)[:2]
                train_label.write(label + '\n')
                tokenized_tweet = " ".join(self.tokenizer.tokenize(tweet))
                train_file_cleaned.write(tokenized_tweet + '\n')
                total_file_cleaned.write(tokenized_tweet + '\n')
        with open(self.test_file) as test_data:
            for line in test_data:
                tokenized_tweet = " ".join(self.tokenizer.tokenize(line))
                test_file_cleaned.write(tokenized_tweet + '\n')
                total_file_cleaned.write(tokenized_tweet + '\n')

    def vectorize(self):
        print("Fitting vectorizer...")
        self.vectorizer.fit(open(self.total_file_cleaned))
        print("Vectorizing train file...")
        train_vector = self.vectorizer.transform(open(self.train_file_cleaned))
        print("Train vector: ", train_vector.shape)
        print("Vectorizing test file...")
        test_vector = self.vectorizer.transform(open(self.test_file_cleaned))
        print("Test vector: ", test_vector.shape)
        print("Saving...")
        pickle.dump(train_vector, open(self.train_vector, 'wb'))
        pickle.dump(test_vector, open(self.test_vector, 'wb'))

    def evaluate(self):
        train_vector = pickle.load(open(self.train_vector, 'rb'))
        train_label = []
        with open(self.train_label) as file:
            for line in file:
                train_label.append(int(line))
        print("Total Data: ", train_vector.shape)
        X_train, X_evl, y_train, y_evl = train_test_split(train_vector,
                                                          train_label,
                                                          test_size=0.5,
                                                          random_state=seed)
        _, X_train, _, y_train = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.1,
                                                  random_state=seed)
        _, X_evl, _, y_evl = train_test_split(X_evl,
                                              y_evl,
                                              test_size=0.1,
                                              random_state=seed)
        print(
            "Training set has {} instances. Test set has {} instances.".format(
                X_train.shape[0], X_evl.shape[0]))
        start = time.time()
        print("Training Classifier...")
        self.classifier.fit(X_train, y_train)
        pred_labels = self.classifier.predict(X_evl)
        print("Training successfully in %s seconds " %
              int(time.time() - start))
        print("Evaluate Accuracy: %0.2f" %
              (accuracy_score(y_evl, pred_labels) * 100))

    def classify(self):
        train_vector = pickle.load(open(self.train_vector, 'rb'))
        train_label = []
        with open(self.train_label) as file:
            for line in file:
                train_label.append(int(line))
        print("Total Data: ", train_vector.shape)
        start = time.time()
        print("Training Classifier...")
        self.classifier.fit(train_vector, train_label)
        print("Training successfully in %s seconds " %
              int(time.time() - start))
        print("Predicting...")
        test_vector = pickle.load(open(self.test_vector, 'rb'))
        test_label = self.classifier.predict(test_vector)
        df = pd.DataFrame(test_label, columns=['Predicted'])
        df.index += 1
        df.index.name = 'Id'
        df.to_csv(self.test_label)
Exemple #37
from polyglot.text import Text
import nltk
import re
import spacy
import pymongo
from tqdm import tqdm
import json
from joblib import Parallel, delayed
from fuzzywuzzy import process
import gensim
import threading
import gensim.corpora as corpora
import pandas as pd
import config
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

tok = WordPunctTokenizer()
sent_detector = nltk.tokenize.punkt.PunktSentenceTokenizer()

nlp = spacy.load('en_core_web_sm')
nlp_spacy = spacy.load('en_core_web_sm')
from config import nlp_corenlp

nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


class AsyncNLPProcess(threading.Thread):
    def __init__(self, Task_Complete):
        super().__init__()
        self.Task_Complete = Task_Complete
def run():
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split()  # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj

        obj = {
            "title":
            x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link":
            x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published":
            x.get("properties", {}).get("published",
                                        {}).get("stringValue", ""),
            "description":
            x.get("properties", {}).get("description",
                                        {}).get("stringValue", ""),
            "content":
            x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    def get_named_entities(mdl, tokens):
        stemmer = TurkishStemmer()
        res = mdl.analyze(tokens)
        entities = []
        for entity in res["entities"]:
            for entity2 in entity["text"].split(", "):
                ne = stemmer.stem(entity2).split("'")[0]
                entities.append((entity["type"], ne, entity["score"]))
        return entities

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >>
        ReadFromText("news.json",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >>
        beam.Map(removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )

    tokens = tokens_1gram

    def process_tokens_last(doc, tokens):
        return (doc, get_named_entities(tokens))

    doc_named_entities = (
        tokens
        | beam.GroupByKey()
        #     | beam.Map(lambda (doc, tokens): process_tokens_last(mdl, tokens))
    )

    (doc_named_entities | "Write Results" >> WriteToText("doc_tokens"))

    p.run()
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer, TreebankWordTokenizer
# text_to_word_sequence is assumed to come from Keras here
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from konlpy.tag import Okt, Kkma

tokenizer = TreebankWordTokenizer()

okt = Okt()

kkma = Kkma()

print(
    word_tokenize(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # everything is tokenized

print(WordPunctTokenizer().tokenize(
    "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
))  # the apostrophe is split off as its own token

print(
    text_to_word_sequence(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
    ))  # "don't" is recognized as a single token

text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."  # "home-based" is kept as one token and "doesn't" is split into "does"/"n't" --> same as the standard word tokenizer

print(tokenizer.tokenize(text))

sentence = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to mae sure no one was near."
print(sent_tokenize(sentence))

korean_sentence = "딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?"
 def get_tokensForBigData(self,text):
     tokens = WordPunctTokenizer().tokenize(text)
     words = [x for x in tokens if x not in string.punctuation
              and x not in ['.','."', '".', '?"', '!"', '%"', '%.','@']]
     return words
__author__ = 'mdenil'
 def get_tokens(self,text):
     textLow = text.lower()
     tokens = WordPunctTokenizer().tokenize(textLow)
     words = [x for x in tokens if x not in string.punctuation and x not in ['."', '".', '?"', '!"', '%"', '%.']]
     return words
Exemple #43
    open('../data/parent', 'rb')
)  # parent is a dict(), which stores the ids of each query's duplicate questions

querys = read_data.methods_to_classes(read_data.read_querys_from_file())
#querys = querys[0:100]
print 'loading data finished'

mrr = 0.0
map = 0.0

for item in querys:

    query = item[0]
    true_apis = item[1]

    query_words = WordPunctTokenizer().tokenize(query.lower())
    query_words = [
        SnowballStemmer('english').stem(word) for word in query_words
    ]

    query_matrix = similarity.init_doc_matrix(query_words, w2v)
    query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

    top_questions = recommendation.get_topk_questions(query, query_matrix,
                                                      query_idf_vector,
                                                      questions, 50, parent)
    recommended_api = recommendation.recommend_api_class(
        query_matrix, query_idf_vector, top_questions, questions, javadoc,
        javadoc_dict_classes, -1)
    #recommended_api = recommendation.recommend_api_class_baseline(query_matrix,query_idf_vector,javadoc,-1)
            re.compile('^' + l.strip() + '$')
            for l in open(os.path.join(LIWC_dir, '%s' % (c)), 'r')
            if l.strip() not in stopwords
        ]
        for c in LIWC_categories
    }
    # replace positive/negative affect
    LIWC_categories += ['positive', 'negative']
    LIWC_categories.remove('positive_affect')
    LIWC_categories.remove('negative_affect')
    LIWC_category_wordlists['positive'] = LIWC_category_wordlists.pop(
        'positive_affect')
    LIWC_category_wordlists['negative'] = LIWC_category_wordlists.pop(
        'negative_affect')

    TKNZR = WordPunctTokenizer()
    full_slice_list = set(range(N_SLICES))
    # we count either the total number of tokens
    # or the number of unique tokens
    # count_option = 'total'
    count_option = 'unique'
    data = pd.read_csv(sub_file, sep='\t', index_col=False)
    data.sort_values('slice', ascending=True)
    fname = os.path.basename(sub_file).replace('.tsv', '')
    out_dir = os.path.dirname(sub_file)
    empty_slices = full_slice_list - set(data['slice'].unique())
    if (len(empty_slices) > 0):
        print('filling %s with empty slices %s' % (e_name, empty_slices))
        empty_slice_rows = pd.DataFrame([{
            'slice': c,
            'dialogue': ''
Exemple #45
def run():
    import pickle
    import sys

    import math

    import numpy as np
    import apache_beam as beam

    reload(sys)
    sys.setdefaultencoding('utf8')

    import argparse
    import simplejson
    from gensim.models import KeyedVectors

    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.textio import ReadFromText, WriteToText
    import nltk.data
    from nltk.tokenize import WordPunctTokenizer
    import re
    import uuid
    import perceptron
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj

        obj = {
            "title":
            x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link":
            x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published":
            x.get("properties", {}).get("published",
                                        {}).get("stringValue", ""),
            "description":
            x.get("properties", {}).get("description",
                                        {}).get("stringValue", ""),
            "content":
            x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >>
        ReadFromText("news.json",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >>
        beam.Map(removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )
    """
    tokens_2gram = (pairs
            | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
        )
    """

    tokens = tokens_1gram
    """
    vocabulary = (tokens
            | "Get words only" >> beam.Values()
            | "Remove duplicate words" >> beam.RemoveDuplicates()
        )
    vocabulary_size = (vocabulary
            | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
        )

    doc_total_words = (tokens
            | "Count Words of Doc" >> beam.combiners.Count.PerKey()
    )
    """

    tokens_paired_with_1 = (
        tokens
        | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1)))
    """
    token_counts_per_doc = (tokens_paired_with_1
            | "Group by Doc,Word" >> beam.GroupByKey()
            | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts))))
            | "Group by Doc" >> beam.GroupByKey()
        )



    num_docs = (token_counts_per_doc
            | "Get Docs" >> beam.Keys()
            | "Count Docs" >> beam.combiners.Count.Globally()
    )


    word_tf_pre = (
        { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc }
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']

        [tokens_total] = count['total_tokens']

        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)


    doc_word_tf = (word_tf_pre
        | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
        )

    word_occurrences = (tokens
        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
        | "Group by Word" >> beam.GroupByKey()
        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
    )

    token_df = (
        word_occurrences
        | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs)))

    token_tf_df = (
        { 'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
      [df] = tfdf['document_frequency']
      for doc, tf in tfdf['term_frequency']:
        yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
        | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
    )
    """

    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):

        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)

        return x

    def analyze_sentiment(x):

        res = perceptron.f(x, w, b)

        return res

    doc_sentiment = (
        tokens_paired_with_1
        | "Create Word2Vec Vector" >> beam.Map(lambda ((doc, token), cnt):
                                               (doc, get_vec(word2vec, token)))
        | "Group Word2Vec Vectors By Document" >> beam.CombinePerKey(sum)
        | "Sum Word2Vec Vectors" >> beam.Map(lambda (doc, vec):
                                             (doc, analyze_sentiment(vec)[0])))

    result = (doc_sentiment | "Format  Results" >>
              beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens)))

    (result | "Write Results" >> WriteToText("sentiments"))

    p.run()
Exemple #46
class TextDatasetReader(DatasetReader):
    """
    Reads raw text, finds replaceable words, generates instance: word with context
    """
    @classmethod
    def read_dict(cls, file_path, limit_words=-1, limit_freq=0):
        word_dict = {}
        with open(file_path) as fd:
            for idx, line in enumerate(fd):
                word, *freq = line.strip().split()

                if idx == limit_words:
                    break

                if len(freq) > 0:
                    freq = freq[0]
                    freq = int(freq)
                    if freq < limit_freq:
                        break
                else:
                    freq = 1

                word_dict[word] = freq

        return word_dict

    def __init__(self,
                 dict_path,
                 limit_words=-1,
                 limit_freq=0,
                 max_context_size: int = 4,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 target_indexers: Dict[str, TokenIndexer] = None):
        """

        :param dict_path: path to the dict of acceptable words to change
        :param limit_words: max number of words read from the dictionary
        :param limit_freq: minimum frequency of words to keep
        :param max_context_size: max number of context tokens on each side
        """
        super().__init__(lazy=True)
        self.max_context_size = max_context_size
        self.word_dict = self.read_dict(dict_path, limit_words, limit_freq)

        self.tokenizer = WordPunctTokenizer()
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.target_indexer = target_indexers or {
            "target":
            SingleIdTokenIndexer(namespace='target', lowercase_tokens=True),
            "tokens":
            SingleIdTokenIndexer()
        }

        self.left_padding = 'BOS'
        self.right_padding = 'EOS'

    def text_to_instance(self, tokens, idx) -> Instance:

        target_word = tokens[idx]

        left_context, right_context = self.get_context(tokens, idx,
                                                       self.max_context_size)

        if len(left_context) < self.max_context_size:
            left_context = [self.left_padding] + left_context
        if len(right_context) < self.max_context_size:
            right_context = right_context + [self.right_padding]

        left_context = TextField([Token(token) for token in left_context],
                                 self.token_indexers)
        right_context = TextField([Token(token) for token in right_context],
                                  self.token_indexers)

        target_token_field = TextField([Token(target_word)],
                                       self.target_indexer)

        return Instance({
            "left_context": left_context,
            "right_context": right_context,
            "word": target_token_field
        })

    @classmethod
    def get_context(cls, tokens, idx, size):
        """
        >>> TextDatasetReader.get_context([1,2,3,4,5,7], 1, 2)
        ([1], [3, 4])

        >>> TextDatasetReader.get_context([1,2,3,4,5,7], 4, 2)
        ([3, 4], [7])


        :param tokens:
        :param idx:
        :param size:
        :return:
        """
        return tokens[max(idx - size, 0):idx], tokens[idx + 1:idx + size + 1]

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as fd:
            for line in fd:
                tokens = self.tokenizer.tokenize(line)
                for idx, token in enumerate(tokens):
                    if token in self.word_dict:
                        yield self.text_to_instance(tokens, idx)
import time
import re
from string import punctuation
# from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

from conf.configure import Configure
from utils import data_utils
from utils.text.preprocessor import TextPreProcessor
from utils import jobs
from optparse import OptionParser

# english_stopwords = set(stopwords.words('english'))
word_tokenize = WordPunctTokenizer().tokenize
preprocessor = TextPreProcessor()
stop_words = ['the', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'what', 'which', 'this', 'that', 'these',
              'those', 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while',
              'during', 'to', 'What', 'Which', 'Is', 'If', 'While', 'This']


def get_unigram_words(que):
    """
    Get the unigram tokens, excluding stop words.
    """
    return [word for word in word_tokenize(que.lower()) if word not in stop_words]
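For example (the output depends only on the WordPunctTokenizer pattern and the custom stop_words list above; punctuation survives because it is not in that list):

print(get_unigram_words("What is the best way to learn NLP?"))
# -> ['best', 'way', 'learn', 'nlp', '?']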


def generate_unigram_words_features(df):
    df['unigrams_ques1'] = df['question1'].apply(lambda x: get_unigram_words(str(x)))
# The WordPunctTokenizer below splits every punctuation symbol off as a separate token.
import sys
import argparse
from nltk.tokenize import WordPunctTokenizer, TreebankWordTokenizer

# word tokenizing for English with NLTK library
# Written by Ye Kyaw Thu, LST, NECTEC, Thailand
# Date: 12 July 2021
# Reference: Python 3 Text Processing with NLTK 3 Cookbook
# The NLTK book also discusses PunktWordTokenizer, but later NLTK versions no longer include it...
# Reference: https://stackoverflow.com/questions/44238864/importerror-cannot-import-name-punktwordtokenizer/53923708

# How to run:
# $ echo "Don't do it! I can't stand it!" | python ./en-tokenization-on-punctuation.py

parser = argparse.ArgumentParser()
parser.add_argument('inputFile',
                    default=sys.stdin,
                    type=argparse.FileType('r'),
                    nargs='?')

args = parser.parse_args()
textLines = args.inputFile.readlines()

tb_tokenizer = TreebankWordTokenizer()
wp_tokenizer = WordPunctTokenizer()

count = 0
for line in textLines:
    count += 1
    print("Treebank: ", tb_tokenizer.tokenize(line))
    print("WordPunct", wp_tokenizer.tokenize(line))
    "wouldn't": "would not",
    "aren't": "are not",
    "haven't": "have not",
    "doesn't": "does not",
    "didn't": "did not",
    "don't": "do not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "mightn't": "might not",
    "mustn't": "must not"
}
negation_pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')

from nltk.tokenize import WordPunctTokenizer
tokenizer1 = WordPunctTokenizer()
tokenizer2 = WordPunctTokenizer()

corpus_summary = []
for i in range(0, 3000):
    stripped = re.sub(combined_pat, '', dataset2['summary'][i])
    stripped = re.sub(www_pat, '', stripped)
    cleantags = re.sub(html_tag, '', stripped)
    #lower_case = cleantags.lower()
    neg_handled = negation_pattern.sub(lambda x: negations_[x.group()],
                                       cleantags)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    tokens = tokenizer1.tokenize(letters_only)
    tokens = ' '.join(tokens)
    corpus_summary.append(tokens)
Exemple #50
import torch
import nltk
from nltk import tokenize
from nltk.tokenize import TweetTokenizer
import json
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from tqdm import *
from collections import defaultdict
import operator
import random
from nltk.tokenize import WordPunctTokenizer
import h5py

wpt = WordPunctTokenizer()

min_context_len = 20
max_context_len = 350
min_question_len = 2
max_question_len = 30
max_answer_len = 30

def helper(data_path,voc_path, number_data = None):
    
    data = json.load(open(data_path))
    voc = json.load(open(voc_path))

    p_set = []
    p_len_set = []
    p_c_s_e_set = []
Exemple #51
def nltkSplit(testDt_List):
    seg_Dt1_List = []
    # iterate over every element; range(len(...) - 1) would skip the last one
    for i in range(0, len(testDt_List)):
        seg_Dt1 = WordPunctTokenizer().tokenize(testDt_List[i])
        seg_Dt1_List.append(seg_Dt1)
    return seg_Dt1_List
Exemple #52
def train_Bayes():

    ripple = pd.read_table('ripple_train.csv', sep=',')
    btc = pd.read_table('btc_train.csv', sep=',')
    bitcoin = pd.read_table('bitcoin_train.csv', sep=',')
    cryptocurrency = pd.read_table('cryptocurrency_train.csv', sep=',')
    cryptomarkets = pd.read_table('cryptomarkets_train.csv', sep=',')
    ethereum = pd.read_table('ethereum_train.csv', sep=',')
    iota = pd.read_table('iota_train.csv', sep=',')
    litecoin = pd.read_table('litecoin_train.csv', sep=',')
    neo = pd.read_table('neo_train.csv', sep=',')
    stellar = pd.read_table('stellar_train.csv', sep=',')

    # pandas Series.append() returns a new Series rather than modifying in
    # place, so build the combined Series explicitly with pd.concat.
    frames = [ripple, btc, bitcoin, cryptocurrency, cryptomarkets,
              ethereum, iota, litecoin, neo, stellar]
    headlines = pd.concat([f['headline'] for f in frames], ignore_index=True)
    labels = pd.concat([f['label'] for f in frames], ignore_index=True)

    reformat = token_format(headlines)

    train = list(zip(reformat, labels))

    dictionary = set(word.lower() for passage in train
                     for word in WordPunctTokenizer().tokenize(passage[0]))

    print("First couple of titles and their associated values:")
    print(train[0])
    print(train[1])
    print(train[2])
    print(train[3])

    t = [({
        word: (word in WordPunctTokenizer().tokenize(x[0]))
        for word in dictionary
    }, x[1]) for x in train]

    classifier = nltk.NaiveBayesClassifier.train(t)

    model = open('bayes_model.pickle', 'wb')
    words = open('dictionary.pickle', 'wb')
    pickle.dump(classifier, model)
    pickle.dump(dictionary, words)
    model.close()
    words.close()
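
# A minimal usage sketch (not from the original source), assuming train_Bayes()
# has already written bayes_model.pickle and dictionary.pickle; classify_headline
# is a hypothetical helper that rebuilds bag-of-words features over the saved
# dictionary for a new headline and classifies it.
def classify_headline(headline):
    with open('bayes_model.pickle', 'rb') as model_file:
        classifier = pickle.load(model_file)
    with open('dictionary.pickle', 'rb') as words_file:
        dictionary = pickle.load(words_file)
    tokens = set(WordPunctTokenizer().tokenize(headline.lower()))
    features = {word: (word in tokens) for word in dictionary}
    return classifier.classify(features)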
#coding=utf-8
import numpy as np
import json
import pickle
import nltk
from nltk.tokenize import WordPunctTokenizer
from collections import defaultdict

# Use NLTK's sentence splitter and word tokenizer
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#print (type(sent_tokenizer))
word_tokenizer = WordPunctTokenizer()
#print (type(word_tokenizer))

# Record each word and how often it occurs
word_freq = defaultdict(int)
#print (word_freq)

# Read the dataset, tokenize each review, count how often every word occurs, and store the counts in word_freq
with open('yelp_academic_dataset_review.json', 'rb') as f:
    for line in f:
        review = json.loads(line.decode('utf-8'))
        words = word_tokenizer.tokenize(review['text'])
        #print (type(words))  #list
        #print (len(words))  # includes punctuation marks
        for word in words:
            word_freq[word] += 1
    #print (review)
    #print (type(review))
    print (word_freq[','])
    print (word_freq['.'])
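
# A hedged follow-up sketch (not in the original snippet): sort word_freq by
# frequency, build a word -> id vocabulary ('UNK' is an assumed placeholder for
# out-of-vocabulary words), and persist the raw counts with pickle.
sorted_words = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)
vocab = {'UNK': 0}
for word, _count in sorted_words:
    vocab[word] = len(vocab)
with open('word_freq.pickle', 'wb') as out_f:
    pickle.dump(word_freq, out_f)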
Exemple #54
import gzip
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer


def parse(path):
    # Stream the gzipped JSON-lines review file, yielding one record per line.
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)


def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')


df = getDF('reviews_Musical_Instruments_5.json.gz')
df.columns
i = 0
word_punct_tokenizer = WordPunctTokenizer()
punct = list(string.punctuation)
stopword_list = stopwords.words('english') + punct + ['rt', 'via']

filt3 = []
uniq = []
i = 0
for index, rows in df.iterrows():
    reviews = rows['reviewText']
    reviews = reviews.lower()
    tokens = nltk.word_tokenize(reviews)
    tokens2 = word_punct_tokenizer.tokenize(reviews)
    # Drop stopwords, punctuation and noise tokens using the precomputed stopword_list.
    filtered_tokens = [word for word in tokens2 if word not in stopword_list]
    post = nltk.pos_tag(filtered_tokens)
parser.add_argument("--normquotes",
                    help="Normalize any quotes to quote single type.",
                    default=1)
parser.add_argument("--wptokenizer",
                    help="Additionally apply treebank tokenizer.",
                    default=1)

pa = parser.parse_args()
sentid = int(pa.sentid)
normquotes = int(pa.normquotes)
wptokenizer = int(pa.wptokenizer)

if __name__ == "__main__":

    st = PunktSentenceTokenizer()
    wtw = WordPunctTokenizer() if wptokenizer == 1 else None
    wtt = TreebankWordTokenizer()

    for line in sys.stdin:

        line = line.decode("utf-8")

        if sentid == 1:
            m = textid_re.search(line)
            if m:
                sys.stdout.write(u".\n{{{%s}}}!!!\n" % m.group(1))
                continue
            if line == "\n":
                continue

        if normquotes == 1:
def tokenize_text(text, punct=False):
    text = WordPunctTokenizer().tokenize(text)
    text = [word for word in text if punct or word.isalnum()]
    text = ' '.join(text)
    text = text.strip()
    return text
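
# A brief usage sketch (not from the original source): punctuation tokens are
# dropped unless punct=True is passed.
tokenize_text("Hello, world!")               # -> 'Hello world'
tokenize_text("Hello, world!", punct=True)   # -> 'Hello , world !'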
Exemple #57
def init_word_tokenizer():
    global word_tokenizer
    if word_tokenizer is None:
        word_tokenizer = WordPunctTokenizer()
Exemple #58
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--treebank",
                    help="Additionally apply treebank tokenizer.",
                    default=1)

pa = parser.parse_args()
treebank = int(pa.treebank)

if __name__ == "__main__":

    st = PunktSentenceTokenizer()
    wtw = TreebankWordTokenizer()
    wtt = WordPunctTokenizer()

    for line in sys.stdin:

        if line[0:7] == "TEXTID(":
            sys.stdout.write(line)
            continue

        if line == "\n":
            sys.stdout.write(line)
            continue

        if treebank == 0:
            line = line.replace("«", " ' ")
            line = line.replace("»", " ' ")
            line = line.replace("“", " ' ")
class SQUADSupportingFactsProcessor(JiantSupportingFactsProcessor):
    DOC_ID = "squad_sup_facts"

    word_tokenizer = WordPunctTokenizer()
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def process_file(self) -> List:
        """
        Converts a SQuAD dataset file into samples for the Supporting Facts Probing task in Jiant format
        :return: A list of samples in jiant edge probing format.
        """
        squad_data = self.json_from_file(self.input_path)['data']
        samples = []

        for article in squad_data:
            pars = article["paragraphs"]
            for par in pars:
                context = par["context"]

                tokenized_context = self.word_tokenizer.tokenize(context)
                sentences = list(
                    self.sentence_tokenizer.tokenize(context.strip()))

                # There must be at least two sentences in the paragraph.
                if len(sentences) < 2:
                    continue

                for qa in par["qas"]:
                    targets = []
                    answer = qa["answers"][0]
                    question = qa["question"]
                    question_id = qa["id"]

                    tokenized_question = self.word_tokenizer.tokenize(question)
                    question_length = len(tokenized_question)
                    sample_text = " ".join(tokenized_question) + " "

                    answer_char_position = answer["answer_start"]
                    answer_sentence_index = self.get_sentence_index_from_char_position(
                        answer_char_position, sentences)

                    found_answer_sentence_in_context = False

                    # go through all sentences in context
                    for sentence_index, sentence in enumerate(sentences):

                        tokenized_sentence = self.word_tokenizer.tokenize(
                            sentence)
                        sample_text += " ".join(tokenized_sentence) + " "

                        # get token start position for sentence in context
                        sentence_pos = self.find_sentence_position_in_context(
                            tokenized_context, tokenized_sentence)

                        if sentence_pos is None:
                            continue

                        # define sentence token span for jiant target
                        start_index = sentence_pos + question_length
                        end_index = start_index + len(tokenized_sentence)
                        sentence_span = [start_index, end_index]

                        # if sentence contains answer, set label to "1"
                        if sentence_index == answer_sentence_index:
                            label = "1"
                            found_answer_sentence_in_context = True
                        else:
                            label = "0"

                        targets.append(
                            self.create_target(question_length, sentence_span,
                                               label))

                    if not found_answer_sentence_in_context:
                        # could not find answer in context, skip this example
                        continue

                    sample = {
                        "info": {
                            "doc_id": self.DOC_ID,
                            "q_id": question_id
                        },
                        "text": sample_text.strip(),
                        "targets": targets
                    }

                    samples.append(sample)

        return samples

    @staticmethod
    def find_sentence_position_in_context(context: List,
                                          sentence_tokens: List) -> int:
        """
        Goes through a list of context tokens and tries to find the sentence tokens. If sentence tokens are found, the
        start index is returned.

        :param context: List of tokens in a context document.
        :param sentence_tokens: List of tokens in a sentence, that is supposed to be within the context.
        :return: The start token position of the sentence in the context. If not found returns None.
        """
        for token_index, token in enumerate(context):
            # check if current token equals the first sentence token
            if token == sentence_tokens[0]:

                match = True
                # go through all sentence tokens to see if they match with the following context tokens
                for i in range(1, len(sentence_tokens)):
                    if (len(context) > token_index + i and
                            context[token_index + i] == sentence_tokens[i]):
                        continue

                    match = False
                    break
                if match:
                    return token_index

    @staticmethod
    def get_sentence_index_from_char_position(char_pos: int,
                                              sentences: List) -> int:
        """
        Gets a list of sentences from a paragraph and returns the index of the sentence that contains a certain
        character position.
        :param char_pos: Character position in paragraph
        :param sentences: List of paragraph sentences
        :return: Index of the sentence that contains the character
        """
        char_count = 0
        for sentence_index, sentence in enumerate(sentences):
            char_count += len(sentence)
            if char_count >= char_pos:
                return sentence_index
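

# A brief illustrative check (not from the original source): the static helper
# returns the token index at which the sentence starts inside the context.
print(SQUADSupportingFactsProcessor.find_sentence_position_in_context(
    ["The", "sky", "is", "blue", ".", "Grass", "is", "green", "."],
    ["Grass", "is", "green", "."]))
# -> 5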
Exemple #60
import re
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from utilites import dump, load
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import WordPunctTokenizer
from pymorphy2 import MorphAnalyzer

pst = PunktSentenceTokenizer()
wpt = WordPunctTokenizer()
ma = MorphAnalyzer()


def find_stop_words(text):
    rez = []
    for word in wpt.tokenize(text):
        tags = ma.parse(word)[0].tag
        if 'UNKN' in tags or \
           'LATN' in tags or \
           'PNCT' in tags or \
           'NUMB' in tags or \
           'ROMN' in tags:
            rez += [word]
    return rez
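
# A quick illustrative call (not from the original source): tokens that pymorphy2
# tags as Latin script, numerals or punctuation are collected as "stop words".
print(find_stop_words("Сегодня в 10:00 встреча с John!"))
# expected output along the lines of: ['10', ':', '00', 'John', '!']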


try:
    word_normal_form = load('word_normal_form.json')
except FileNotFoundError: