def search(file, query):
    # Set up a Solr instance. The timeout is optional.
    solr = pysolr.Solr('http://localhost:8983/solr/gettingstarted/', timeout=10)

    with open(file, "r", encoding="utf8") as f:
        text = f.read()

    # Split the text into paragraphs, then each paragraph into sentences,
    # and flatten into a single list of sentences.
    paragraphs = blankline_tokenize(text)
    for i in range(len(paragraphs)):
        paragraphs[i] = sent_tokenize(paragraphs[i])
    sentences = list(itertools.chain(*paragraphs))
    del paragraphs

    # How you'd index data.
    for i in range(len(sentences)):
        index = {'id': str(i), "_text_": sentences[i]}
        solr.add([index])
    # Note that the add method has commit=True by default, so this is
    # immediately committed to your index.

    # Later, searching is easy. In the simple case, a plain Lucene-style
    # query is fine.
    results = solr.search(query)

    # The ``Results`` object stores the total number of results found,
    # by default the top ten most relevant results, and any additional data
    # like facets/highlighting/spelling/etc.
    print("Saw {0} result(s).".format(len(results)))

    # Just loop over it to access the results.
    for result in results:
        print("The title is '{0}'.".format(result['id']))
        print(sentences[int(result['id'])])
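# Hedged usage sketch for search(): the file name and query below are
# illustrative placeholders, a Solr core running at the URL above is assumed,
# and pysolr, itertools, blankline_tokenize and sent_tokenize are assumed to
# be imported at module level.
if __name__ == "__main__":
    search("corpus.txt", "memory management")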
def tokenize_pos_tag(phrase, lang_code="eng"):
    init_line_tokenizer()
    # Add a period to blank-line-separated blocks to simulate "sentences".
    phrase = ".\n".join(blankline_tokenize(phrase))
    sents = sent_tokenize(phrase)
    sents_tokens = [word_tokenize(s) for s in sents]
    sents_tags = pos_tag_sents(sents_tokens, lang=lang_code)
    print("tokens:", sents_tokens)
    print("tags:", sents_tags)
    word_tags = []
    for s in sents_tags:
        word_tags.extend(s)
    return word_tags
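# Minimal usage sketch for tokenize_pos_tag(). Assumptions: the NLTK "punkt"
# and "averaged_perceptron_tagger" data are downloaded, and
# init_line_tokenizer() is defined elsewhere in the original module.
if __name__ == "__main__":
    sample = "The quick brown fox jumps over the lazy dog.\n\nIt was quick."
    for token, tag in tokenize_pos_tag(sample):
        print(token, tag)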
def index(fname, query, solr):
    # Clear the existing index before re-indexing.
    solr.delete(q='*:*')

    with open(fname, "r", encoding="utf8") as f:
        text = f.read()

    # Split into paragraphs, then flatten into a list of sentences.
    paragraphs = blankline_tokenize(text)
    for i in range(len(paragraphs)):
        paragraphs[i] = sent_tokenize(paragraphs[i])
    sentences = list(itertools.chain(*paragraphs))
    del paragraphs

    for i in range(len(sentences)):
        index = {'id': str(i), "_text_": sentences[i]}
        solr.add([index])
    return sentences
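# Hedged usage sketch: wiring index() to a pysolr client and then querying the
# core directly. The core URL, file name, and query string are illustrative
# placeholders; a running Solr instance is assumed.
solr_client = pysolr.Solr('http://localhost:8983/solr/gettingstarted/', timeout=10)
indexed_sentences = index("corpus.txt", "readability", solr_client)
for hit in solr_client.search("readability"):
    print(indexed_sentences[int(hit['id'])])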
def some():
    cont = request.form['cont']
    cont_tokens = word_tokenize(cont)

    # Strip punctuation and digits from each token.
    punctuation = re.compile(r'[-.?!,:;()|0-9]')
    post_punc = []
    for i in cont_tokens:
        j = punctuation.sub("", i)
        if len(j) > 0:
            post_punc.append(j)

    # Remove English stopwords.
    stop_words = stopwords.words('english')
    post_punc1 = [word for word in post_punc if word not in stop_words]
    size = len(post_punc1)

    # Stem the remaining tokens.
    pst = PorterStemmer()
    post_punc_stem = []
    for i in post_punc1:
        post_punc_stem.append(pst.stem(i))

    # Count blank-line-separated paragraphs in the raw input.
    blank1 = blankline_tokenize(cont)
    lb = len(blank1)

    # Frequency distribution over the stemmed tokens.
    fdist = FreqDist()
    for i in post_punc_stem:
        fdist[i.lower()] += 1

    df = pd.DataFrame(list(fdist.items()), columns=["Word", "Absolute Frequency"])
    df['Relative Frequency'] = df["Absolute Frequency"] / size
    df1 = df.sort_values(by='Absolute Frequency', ascending=False)
    df1 = df1.head()

    # Plotly bar charts rendered as HTML <div> snippets.
    f1 = plot([Bar(x=df1["Word"], y=df1["Absolute Frequency"])], output_type='div')
    f2 = plot([Bar(x=df["Word"], y=df["Absolute Frequency"])], output_type='div')

    return render_template("index.html",
                           df=df.to_html(classes="table table-striped"),
                           f1=Markup(f1), f2=Markup(f2), blank1=lb)
# ================================================================================
# ======================== Tokenization ==========================================
# ================================================================================
x_token = word_tokenize(x)
f = FreqDist()
print(x_token)
print("Number of tokens in the string: ", len(x_token))

for word in x_token:
    f[word.lower()] = f[word.lower()] + 1
print(f)
print("The 10 most occurring tokens are:\n", f.most_common(10))

x_blank = blankline_tokenize(x)
print("Number of blank-line-separated blocks within the string: ", len(x_blank))

x_bigrams = list(nltk.bigrams(x_token))
print(x_bigrams)
x_trigrams = list(nltk.trigrams(x_token))
print(x_trigrams)
x_ngrams = list(nltk.ngrams(x_token, 4))
print(x_ngrams)

# ================================================================================

# ================================================================================
# ======================== Stemming ==============================================
# ================================================================================
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

s = "Hi Everyone ! hola gr8"
ss = '''
Hi Everyone ! hola gr8

from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize'''

print(s.split())                           # ['Hi', 'Everyone', '!', 'hola', 'gr8']
print(word_tokenize(s))                    # ['Hi', 'Everyone', '!', 'hola', 'gr8']
print(regexp_tokenize(s, pattern=r'\w+'))  # ['Hi', 'Everyone', 'hola', 'gr8']
print(regexp_tokenize(s, pattern=r'\d+'))  # ['8']
print(wordpunct_tokenize(s))               # ['Hi', 'Everyone', '!', 'hola', 'gr8']
print(blankline_tokenize(s))               # ['Hi Everyone ! hola gr8']
print(blankline_tokenize(ss))              # ['\nHi Everyone ! hola gr8',
                                           #  'from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize']
>>> tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()  # sentence tokenizer
>>> s = "Hi Everyone ! hola gr8"
>>> print(s.split())                      # simplest tokenizer: split on whitespace
>>> from nltk.tokenize import word_tokenize
>>> word_tokenize(s)
>>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
>>> regexp_tokenize(s, pattern=r'\w+')
>>> regexp_tokenize(s, pattern=r'\d+')
>>> wordpunct_tokenize(s)
>>> blankline_tokenize(s)
# Stemmers
>>> from nltk.stem import PorterStemmer            # import the Porter stemmer
>>> from nltk.stem.lancaster import LancasterStemmer
>>> from nltk.stem.snowball import SnowballStemmer
>>> pst = PorterStemmer()                          # create a PorterStemmer object
>>> lst = LancasterStemmer()                       # create a LancasterStemmer object
>>> lst.stem("eating")
>>> pst.stem("shopping")
# Lemmatizer
>>> from nltk.stem import WordNetLemmatizer
>>> wlem = WordNetLemmatizer()
>>> wlem.lemmatize("ate", pos="v")                 # part of speech is needed to get 'eat'
from nltk.tokenize import sent_tokenize

with open('sentence1.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')

sentences = sent_tokenize(data, language="german")
for s in sentences:
    print(s)

first_sentence = sentences[0]
print(first_sentence.split())

from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize
print(word_tokenize(first_sentence))
print(regexp_tokenize(first_sentence, pattern=r'\w+'))
print(wordpunct_tokenize(first_sentence))
print(blankline_tokenize(first_sentence))
fdist[word.lower()] += 1
fdist

# In[11]:

fdist_top10 = fdist.most_common(10)  # the 10 most frequent words
fdist_top10

# In[12]:

from nltk.tokenize import blankline_tokenize
AI_blank = blankline_tokenize(AI)
len(AI_blank)  # number of paragraphs, where paragraphs are separated by a blank line

# ## Tokenization Types:
# 1. Bigrams: tokens of two consecutive written words
# 2. Trigrams: tokens of three consecutive written words
# 3. Ngrams: tokens of any number of consecutive written words

# In[13]:

from nltk.util import bigrams, trigrams, ngrams
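# A minimal sketch of the three n-gram types listed above, on a made-up
# sentence (the sentence and variable names are illustrative additions, and
# the NLTK "punkt" data is assumed to be installed):
from nltk.tokenize import word_tokenize
demo_tokens = word_tokenize("Tokenization splits text into smaller units")
print(list(bigrams(demo_tokens)))    # pairs of consecutive tokens
print(list(trigrams(demo_tokens)))   # triples of consecutive tokens
print(list(ngrams(demo_tokens, 4)))  # 4-token windows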
# sentence tokenize
sent = sent_tokenize(data)

# word tokenize
words = word_tokenize(data)

from nltk.probability import FreqDist
fdist = FreqDist()
for word in words:
    fdist[word.lower()] += 1
print(fdist["space"])

fdist_most = fdist.most_common(10)
print(fdist_most)

from nltk.tokenize import blankline_tokenize
AIblank = blankline_tokenize(data)
print(len(AIblank))
print(AIblank[2])

from nltk.util import bigrams, trigrams, ngrams
string = "i have to write any code on my own, cause of by this way it is not helpful."
quarter_tokens = nltk.word_tokenize(string)
print("<<<<<< word tokenize >>>>>\n ", quarter_tokens)

bigram_list = list(nltk.bigrams(quarter_tokens))
print("<<<<<< bigrams >>>>>>>\n", bigram_list)

trigram_list = list(nltk.trigrams(quarter_tokens))
print("<<<<< trigrams >>>>>>>\n", trigram_list)

ngram_list = list(nltk.ngrams(quarter_tokens, 4))
# -*- coding: utf-8 -*-
"""
Created on 2018/6/17

@author: Samuel
@Desc:
@dependence: Nothing
"""

input_str = "Hi everyone! Hola gr8 &*$"
print(input_str.split())

from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize

output_str = word_tokenize(input_str)
print('word_tokenize: ')
print(output_str)

output_str = regexp_tokenize(input_str, pattern=r'\w+')
print('regexp_tokenize: ')
print(output_str)

output_str = regexp_tokenize(input_str, pattern=r'\d+')
print('regexp_tokenize: ')
print(output_str)

output_str = wordpunct_tokenize(input_str)
print('wordpunct_tokenize: ')
print(output_str)

output_str = blankline_tokenize(input_str)
print('blankline_tokenize: ')
print(output_str)
def main():
    start_time = time.time()

    #stopwords_txt = set(open(config.general['stopwords']).read().split())
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(set(STOPWORDS))

    pickle_book_file = config.general["pickle_path"] / "book_list.pickle"
    # Try to load the pickle.
    book_list = jn.pickle_load(pickle_book_file)
    # If there is no pickle, extract the corpus.
    if book_list is None:
        if not jn.create_dir(config.general["pickle_path"]):
            return 1
        book_list = extract_books(config.general["corpus_path"])

    # Cut title, add paragraphs and sentences.
    for book in book_list:
        book["original"]["tokens"] = word_tokenize(book["original"]["text"])
        book["original"]["paragraphs"] = blankline_tokenize(book["original"]["text"])
        book["original"]["sentences"] = sent_tokenize(book["original"]["text"])

    # Add tokens, bigrams and trigrams.
    for book in book_list:
        token_list, bigram_list, trigram_list = [], [], []
        for sntc in book["original"]["sentences"]:
            tokens = word_tokenize(sntc)
            bigrams = list(nltk.bigrams(tokens))
            trigrams = list(nltk.trigrams(tokens))
            token_list.append(tokens)
            bigram_list.append(bigrams)
            trigram_list.append(trigrams)
        book["original"]["token_list"] = token_list
        book["original"]["bigram_list"] = bigram_list
        book["original"]["trigram_list"] = trigram_list

    #for word in book.lower().split():

    # Preprocessed text
    punctuation = re.compile(r'[.,?!:;()|0-9]')
    for book in book_list:
        preprocessed_sentences = []
        preprocessed_token_list, preprocessed_bigram_list, preprocessed_trigram_list = [], [], []
        cleaned_token_list = []
        new_text = re.sub(r'[^\w\s]', '', book["original"]["text"])
        preprocessed_text = new_text.lower()
        for sntc_tokenized in book["original"]["sentences"]:
            new_sentence = re.sub(r'[^\w\s]', '', sntc_tokenized)
            new_sentence = new_sentence.lower()
            new_tokens = word_tokenize(new_sentence)
            new_bigrams = list(nltk.bigrams(new_tokens))
            new_trigrams = list(nltk.trigrams(new_tokens))
            cleaned_tokens = []
            for word in new_tokens:
                if word not in stopwords:
                    cleaned_tokens.append(word)
            preprocessed_sentences.append(new_sentence)
            preprocessed_token_list.append(new_tokens)
            preprocessed_bigram_list.append(new_bigrams)
            preprocessed_trigram_list.append(new_trigrams)
            cleaned_token_list.append(cleaned_tokens)
        book["original"]["tokens"] = word_tokenize(book["original"]["text"])
        book["preprocess"] = {
            "text": preprocessed_text,
            "tokens": word_tokenize(preprocessed_text),
            "sentences": preprocessed_sentences,
            "token_list": preprocessed_token_list,
            "bigram_list": preprocessed_bigram_list,
            "trigram_list": preprocessed_trigram_list,
            "cleaned_tokens": cleaned_token_list,
        }

    # Add word frequencies.
    for book in book_list:
        fdist_original = FreqDist(word for word in word_tokenize(book["original"]["text"]))
        book["original"]["token_frequency"] = dict(fdist_original.items())
        fdist_preprocess = FreqDist(word.lower() for word in word_tokenize(book["preprocess"]["text"]))
        book["preprocess"]["token_frequency"] = dict(fdist_preprocess.items())

    jn.pickle_save(book_list, pickle_book_file)

    # TF-IDF
    pickle_tfidf_file = config.general["pickle_path"] / "tf_idf_dictionary.pickle"
    # Try to load the pickle.
    tf_idf_dictionary = jn.pickle_load(pickle_tfidf_file)
    # If there is no pickle, compute it from the corpus.
    if tf_idf_dictionary is None:
        token_set = [book["preprocess"]["tokens"] for book in book_list]
        tf_idf_dictionary = tf_idf.get_tf_idf(token_set)
        jn.pickle_save(tf_idf_dictionary, pickle_tfidf_file)

    # TF-IDF without stopwords
    pickle_tfidf_nsw_file = config.general["pickle_path"] / "tf_idf_dictionary_nsw.pickle"
    # Try to load the pickle.
    tf_idf_dictionary_nsw = jn.pickle_load(pickle_tfidf_nsw_file)
    # If there is no pickle, compute it from the corpus.
    if tf_idf_dictionary_nsw is None:
        token_set = [book["preprocess"]["tokens"] for book in book_list]
        token_set_nsw = []
        for tokens in token_set:
            tokens_nsw = []
            for word in tokens:
                if word not in stopwords:
                    tokens_nsw.append(word)
            token_set_nsw.append(tokens_nsw)
        tf_idf_dictionary_nsw = tf_idf.get_tf_idf(token_set_nsw)
        jn.pickle_save(tf_idf_dictionary_nsw, pickle_tfidf_nsw_file)

    print("--- Preprocessing lasts %s seconds ---" % (time.time() - start_time))

    atmom = book_list[56]
    dagon = book_list[1]
    asd = tf_idf_dictionary_nsw[1]
    asd = {k: v for k, v in asd.items() if v > 0.002}

    text = dagon["original"]["text"]
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    displacy.serve(doc, style="ent")
    # return 1

    tkn = atmom["preprocess"]["token_list"][0]
    # POS
    hey = nltk.pos_tag(tkn)
    # NER
    hoy = ne_chunk(hey)

    tkn2 = atmom["original"]["token_list"][0]
    # POS
    hey2 = nltk.pos_tag(tkn2)
    # NER
    hoy2 = ne_chunk(hey2)

    print(hey)
    print(hoy)
    print(hey2)
    print(hoy2)
    asd = 1

    #pst = LancasterStemmer()
    #print(atmom["sentences"][0])
    #print(pst.stem(atmom["sentences"][0]))

    q1 = "The big cat ate the little mouse who was after fresh cheese"
    nw_tk = nltk.pos_tag(word_tokenize(q1))
    print(nw_tk)

    # Chunk noun phrases with a simple regular-expression grammar.
    grammar_np = r"NP: {<DT>?<JJ>*<NN>}"
    chunk_parser = nltk.RegexpParser(grammar_np)
    chunk_result = chunk_parser.parse(nw_tk)
    print(chunk_result)

    return 1

    #data = [
    #    [(word.replace(",", "")
    #      .replace(".", "")
    #      .replace("(", "")
    #      .replace(")", ""))
    #     for word in row[2].lower().split()]
    #    for row in reader]
    ## Removes header
    #data = data[1:]

    all_sentences = ""
    all_preprocessed_sentences = ""
    for book in book_list:
        for sntc in book["original"]["sentences"]:
            all_sentences = all_sentences + "\n" + sntc
        for sntc in book["preprocess"]["sentences"]:
            all_preprocessed_sentences = all_preprocessed_sentences + "\n" + sntc

    print("There are {} characters in the combination of all reviews.".format(len(all_sentences)))

    # Create and generate a word cloud image:
    #wordcloud = WordCloud().generate(text)
    #wordcloud = WordCloud(max_words=30, background_color="white", collocations=False).generate(text)
    #wordcloud.to_file("img/first_review.png")
    #plt.imshow(wordcloud, interpolation='bilinear')
    #plt.axis("off")
    #plt.show()

    wordcloud = WordCloud(stopwords=stopwords, max_words=50, background_color="white",
                          collocations=False).generate(all_sentences)
    wordcloud.to_file("img/review.png")

    # Display the generated image:
    #plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    wordcloud = WordCloud(stopwords=stopwords, max_words=50, background_color="white",
                          collocations=False).generate(all_preprocessed_sentences)
    wordcloud.to_file("img/refined_review.png")

    # Display the generated image:
    #plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
from nltk.tokenize import blankline_tokenize

python_data = '''Python is an interpreted high-level programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, notably using significant whitespace. It provides constructs that enable clear programming on both small and large scales.[27] In July 2018, Van Rossum stepped down as the leader in the language community after 30 years.[28][29]

Python features a dynamic type system and automatic memory management. It supports multiple programming paradigms, including object-oriented, imperative, functional and procedural, and has a large and comprehensive standard library.[30]

Python interpreters are available for many operating systems. CPython, the reference implementation of Python, is open source software[31] and has a community-based development model, as do nearly all of Python's other implementations. Python and CPython are managed by the non-profit Python Software Foundation'''

python_tokens = blankline_tokenize(python_data)

# Number of blank-line-separated paragraphs
print(len(python_tokens))

# Print each paragraph returned by blankline_tokenize
for item in python_tokens:
    print(item)
)

# ngrams
from nltk import ngrams
bigrams = ngrams(vocab_wo_punctuation, 2)
print(list(bigrams))

print("_____________________Regex tokenizer____________________________________")
# Different tokenizer forms
from nltk import regexp_tokenize
s2 = "Alas, it has not rained today. When, do you think, will it rain again?"
print(regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False))
print(regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True))
print(nltk.word_tokenize(s2))

s3 = "<p>Although this is <b>not</b> the case here, we must not relax our vigilance!</p>"
print(regexp_tokenize(s3, r'</?(b|p)>', gaps=False))
print(regexp_tokenize(s3, r'</?(b|p)>', gaps=True))

s4 = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
print(regexp_tokenize(s4, pattern=r'\w+|\$[\d\.]+|\S+'))
print(wordpunct_tokenize(s4))
print(blankline_tokenize(s4))
al = """The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar . Welcome Horatio , welcome good Marcellus Mar . What , ha ' s this thing appear ' d againe to night Bar . I haue seene nothing Mar . Horatio saies , ' tis but our Fantasie , And will not let beleefe take hold of him Touching this dreaded sight , twice seene of vs , Therefore I haue intreated him along With vs , to watch the minutes of this Night , That if againe this Apparition come , He may approue our eyes , and speake to it Hor . Tush , tush , ' twill not appeare Bar . Sit downe a - while , And let vs once againe assaile your eares , That are so fortified against our Story , What we two Nights haue seene Hor . Well , sit we downe , And let vs heare Barnardo speake of this Barn . Last night of all , When yond same Starre that ' s Westward from the Pole Had made his course t ' illume that part of Heauen Where now it burnes , Marcellus and my selfe , The Bell then beating one Mar . Peace , breake thee of : Enter the Ghost . Looke where it comes againe Barn . In the same figure , like the King that ' s dead Mar . Thou art a Scholler ; speake to it Horatio Barn . Lookes it not like the King ? Marke it Horatio Hora . Most like : It harrowes me with fear & wonder Barn . It would be spoke too Mar . Question it Horatio Hor . What art """

type(al)

al_token = word_tokenize(al)
len(al_token)

from nltk.probability import FreqDist
freqdist = FreqDist()
for word in al_token:
    freqdist[word.lower()] += 1
freqdist

fdist_top10 = freqdist.most_common(10)

from nltk.tokenize import blankline_tokenize
al_blank = blankline_tokenize(al)
len(al_blank)

from nltk.util import bigrams, trigrams, ngrams
strings = "Marke it Horatio Hora . Most like : It harrowes me with fear & wonder Barn . It would be spoke too Mar . Question it Horatio Hor . What art "
quotes_token = nltk.word_tokenize(strings)
quotes_bigrams = list(nltk.bigrams(quotes_token))
quotes_trigrams = list(nltk.trigrams(quotes_token))
quotes_ngrams = list(nltk.ngrams(quotes_token, 5))

# Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
ps.stem("speaking")

words_to_stem = ["give", "given", "gave"]
for word in words_to_stem:
    print(word + ":" + ps.stem(word))
len(Ram1)

from nltk.probability import FreqDist
fdist = FreqDist()
fdist
for word in Ram1:
    fdist[word.lower()] += 1
fdist

mostcommon = fdist.most_common(10)
mostcommon

from nltk.tokenize import blankline_tokenize
Ram2 = blankline_tokenize(Ram1)
Ram2

from nltk.util import bigrams, trigrams, ngrams
kavya = 'Kavya is born on 1996, and now she is working in capgemini'
kavya1 = nltk.word_tokenize(kavya)
kavya1
len(kavya1)

kavya2 = list(nltk.bigrams(kavya1))
kavya2
kavya2 = list(nltk.trigrams(kavya1))
kavya2
def paragraphs_get(text):
    from nltk.tokenize import blankline_tokenize
    return blankline_tokenize(text)
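# Hedged usage sketch for paragraphs_get(); the sample string is an
# illustrative addition, not taken from the original code.
sample_text = "First paragraph, possibly several sentences.\n\nSecond paragraph."
for para in paragraphs_get(sample_text):
    print(para)
# Prints the two paragraphs separately, since blankline_tokenize splits on
# runs of blank lines.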
__author__ = 'Mohammed Shokr <*****@*****.**>'

sent = "Hi Everyone ! How do you do ?"

print("# split() built-in string function")
print(sent.split())
# -----------------------------------------------------#
print("# word_tokenize")
from nltk.tokenize import word_tokenize
print(word_tokenize(sent))
# -----------------------------------------------------#
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
print("# regexp_tokenize : split text by a RegEx pattern")
print(regexp_tokenize(sent, pattern=r'\w+'))
print("# wordpunct_tokenize : split text into word and punctuation tokens")
print(wordpunct_tokenize(sent))
print("# blankline_tokenize : split text on blank lines")
print(blankline_tokenize(sent))
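# Since `sent` contains no blank line, blankline_tokenize returns it as a
# single chunk. A small follow-up on a multi-paragraph string (the string is
# an illustrative addition, not part of the original script):
para_text = "Hi Everyone !\nHow do you do ?\n\nSee you tomorrow ."
print(blankline_tokenize(para_text))
# ['Hi Everyone !\nHow do you do ?', 'See you tomorrow .']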
    'http://localhost:9998/')
# print(result)
content, metadata = result['content'], result['metadata']
# print(content)
# print(file_path)
# print(content)
length = len(content) if content is not None else content
fbar.set_description(f'{file_path}: {length}')

# TYPE
if content is None:
    texts = ['']
elif text_type == 'full':
    texts = [content]
elif text_type == 'parablank':
    texts = []
    for p in blankline_tokenize(content):
        texts.append(p)
elif text_type == 'paraline':
    texts = []
    for p in line_tokenize(content):
        texts.append(p)
else:
    raise NotImplementedError(text_type)

# NORM
if norm_type == 'stem':
    texts = [
        ' '.join(
            snow.stem(x) for x in word_tokenize(y)
            if x.isalnum() and x.lower() not in stop)
        for y in texts
    ]
print("LENGTH OF THE TOKENS(words):", len(txt_tokens) ) #total number of items in the string is counted by the LEN method print("") print("THE NO.OF OCCURANCES OF WORD 'and':", fdist['and']) #no of occurances of a paarticular word EG:and print("") fdist_top5 = fdist.most_common(5) print("TOP 5 MOST USED WORDS or SYMBOLS:", fdist_top5) #the top most used words or symbols print("") from nltk.tokenize import blankline_tokenize txt_blank = blankline_tokenize(txt) print("NO.OF PARAGRAPHS:", len(txt_blank)) print("") #the no.of paragraphs are differentiated by BLANKLINES from nltk.util import bigrams, trigrams, ngrams txt_bigrams = list(nltk.bigrams(txt_tokens)) print("DOUBLE TOKENS:") for i in txt_bigrams: print(i) print("") #tokens are seperated uniquely in 1st method...now it is seperated DUALLY #FOR triple aand multi seperated TOKENS UNCOMMAND the below 10LINES