Example no. 1
import itertools

import pysolr
from nltk.tokenize import blankline_tokenize, sent_tokenize


def search(file, query):
    # Set up a Solr instance. The timeout is optional.
    solr = pysolr.Solr('http://localhost:8983/solr/gettingstarted/',
                       timeout=10)
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
        paragraphs = blankline_tokenize(text)
        for i in range(len(paragraphs)):
            paragraphs[i] = sent_tokenize(paragraphs[i])
        sentences = list(itertools.chain(*paragraphs))
        del paragraphs
    # How you'd index data.
    for i in range(len(sentences)):
        index = {'id': str(i), "_text_": sentences[i]}
        solr.add([index])
    # Note that the add method has commit=True by default, so this is
    # immediately committed to your index.

    # Later, searching is easy. In the simple case, just a plain Lucene-style
    # query is fine.
    results = solr.search(query)
    # The ``Results`` object stores total results found, by default the top
    # ten most relevant results and any additional data like
    # facets/highlighting/spelling/etc.
    print("Saw {0} result(s).".format(len(results)))

    # Just loop over it to access the results.
    for result in results:
        print("The title is '{0}'.".format(result['id']))
        print(sentences[int(result['id'])])
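
The comments above mention the total hit count and the extra data carried by the ``Results`` object. A minimal hedged sketch of reading those fields with pysolr (``results.hits`` and ``results.docs`` are pysolr's documented attributes; ``rows`` is a standard Solr parameter passed through as a keyword argument):

results = solr.search(query, rows=20)      # ask Solr for up to 20 documents
print("Total matches in the index:", results.hits)
for doc in results.docs:                   # the same dicts the loop above iterates over
    print(doc['id'], doc.get('_text_'))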
Example no. 2
def tokenize_pos_tag(phrase, lang_code="eng"):
    init_line_tokenizer()
    phrase = ".\n".join(blankline_tokenize(
        phrase))  # add period to blank lines to simulate "sentences"
    sents = sent_tokenize(phrase)
    sents_tokens = [word_tokenize(s) for s in sents]
    sents_tags = pos_tag_sents(sents_tokens, lang=lang_code)
    print("sents:", sents_tokens)
    print("tokens:", sents_tags)
    word_tags = []
    for s in sents_tags:
        word_tags.extend(s)
    return word_tags
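
A quick usage sketch for the function above, assuming the NLTK imports (blankline_tokenize, sent_tokenize, word_tokenize, pos_tag_sents) and the module's own init_line_tokenizer helper are available as in the original source; the sample phrase is illustrative:

tags = tokenize_pos_tag("NLTK makes tokenization easy.\n\nIt also does POS tagging.")
print(tags)  # e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...]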
Example no. 3
import itertools

from nltk.tokenize import blankline_tokenize, sent_tokenize


def index(fname, query, solr):
    solr.delete(q='*:*')
    with open(fname, "r", encoding="utf-8") as f:
        text = f.read()
        paragraphs = blankline_tokenize(text)
        for i in range(len(paragraphs)):
            paragraphs[i] = sent_tokenize(paragraphs[i])
        sentences = list(itertools.chain(*paragraphs))
        del paragraphs
    for i in range(len(sentences)):
        index = {'id': str(i), "_text_": sentences[i]}
        solr.add([index])
    return sentences
Example no. 4
def some():
    cont = request.form['cont']
    cont_tokens = word_tokenize(cont)

    punctuation = re.compile(r'[-.?!,:;()|0-9]')
    post_punc = []
    for i in cont_tokens:
        j = punctuation.sub("", i)
        if (len(j) > 0):
            post_punc.append(j)
    stop_words = stopwords.words('english')
    post_punc1 = [word for word in post_punc if word not in stop_words]
    size = len(post_punc1)

    pst = PorterStemmer()
    post_punc_stem = []
    for i in post_punc1:
        post_punc_stem.append(pst.stem(i))
    blank1 = blankline_tokenize(cont)
    lb = len(blank1)
    fdist = FreqDist()
    for i in post_punc_stem:
        fdist[i.lower()] += 1
    # Build the frequency table once, after all stems have been counted.
    df = pd.DataFrame(list(fdist.items()),
                      columns=["Word", "Absolute Frequency"])
    df['Relative Frequency'] = df["Absolute Frequency"] / size
    df1 = df.sort_values(by='Absolute Frequency', ascending=False)
    df1 = df1.head()
    f1 = plot([Bar(x=df1["Word"], y=df1["Absolute Frequency"])],
              output_type='div')

    f2 = plot([Bar(x=df["Word"], y=df["Absolute Frequency"])],
              output_type='div')

    return render_template("index.html",
                           df=df.to_html(classes="table table-striped"),
                           f1=Markup(f1),
                           f2=Markup(f2),
                           blank1=lb)
Example no. 5
# ================================================================================
# ======================== Tokenization ==========================================
# ================================================================================
x_token = word_tokenize(x)
f = FreqDist()

print(x_token)
print("Number of tokens in the string: ", len(x_token))

for word in x_token:
    f[word.lower()] += 1
print(f)
print("The 10 most occurring tokens are:\n", f.most_common(10))

x_blank = blankline_tokenize(x)
print("Number of blank lines within the string: ", len(x_blank))

x_bigrams = list(nltk.bigrams(x_token))
print(x_bigrams)

x_trigrams = list(nltk.trigrams(x_token))
print(x_trigrams)

x_ngrams = list(nltk.ngrams(x_token, 4))
print(x_ngrams)
# =======================================================================

# ========================================================================
# ======================== Stemming ======================================
# ========================================================================
Example no. 6
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

s = "Hi Everyone ! hola gr8"
ss = '''
Hi Everyone ! hola gr8

from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

'''

print(s.split()) # ['Hi', 'Everyone', '!', 'hola', 'gr8']

print(word_tokenize(s)) # ['Hi', 'Everyone', '!', 'hola', 'gr8']

print(regexp_tokenize(s, pattern=r'\w+')) # ['Hi', 'Everyone', 'hola', 'gr8']

print(regexp_tokenize(s, pattern=r'\d+')) # ['8']

print(wordpunct_tokenize(s)) # ['Hi', 'Everyone', '!', 'hola', 'gr8']

print(blankline_tokenize(s)) # ['Hi Everyone ! hola gr8']

print(blankline_tokenize(ss))
# ['\nHi Everyone ! hola gr8', 'from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize']

Example no. 7
>>> tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# word tokenizer
>>> s = "Hi Everyone !    hola gr8" # simplest tokenizer
>>> print(s.split())

>>> from nltk.tokenize import word_tokenize
>>> word_tokenize(s)

>>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
>>> regexp_tokenize(s, pattern=r'\w+')

>>> regexp_tokenize(s, pattern=r'\d+')

>>> wordpunct_tokenize(s)
>>> blankline_tokenize(s)

# Porter stemmer
>>> from nltk.stem import PorterStemmer # import the Porter stemmer
>>> from nltk.stem.lancaster import LancasterStemmer
>>> from nltk.stem.snowball import SnowballStemmer
>>> pst = PorterStemmer()   # create a PorterStemmer object
>>> lst = LancasterStemmer() # create a LancasterStemmer object
>>> lst.stem("eating")
>>> pst.stem("shopping")

# Lemmatizer
>>> from nltk.stem import WordNetLemmatizer
>>> wlem = WordNetLemmatizer()
>>> wlem.lemmatize("ate")
Example no. 8

from nltk.tokenize import sent_tokenize

with open('sentence1.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')

sentences = sent_tokenize(data, language="german")

for s in sentences:
    print(s)

first_sentence = sentences[0]

print(first_sentence.split())

from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize

print(word_tokenize(first_sentence))

print(regexp_tokenize(first_sentence, pattern=r'\w+'))

print(wordpunct_tokenize(first_sentence))

print(blankline_tokenize(first_sentence))
Example no. 9
    fdist[word.lower()]+=1
fdist


# In[11]:


fdist_top10 = fdist.most_common(10) ## the 10 most frequent tokens
fdist_top10


# In[12]:


from nltk.tokenize import blankline_tokenize
AI_blank = blankline_tokenize(AI)
len(AI_blank)


# A result of 1 means the text contains a single paragraph (paragraphs are separated by blank lines).

# ## Tokenization Types:
# 1. Bigrams: tokens of two consecutive written words
# 2. Trigrams: tokens of three consecutive written words
# 3. N-grams: tokens of any number of consecutive written words
# (a short sketch follows the import below)

# In[13]:


from nltk.util import bigrams,trigrams,ngrams
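
# A minimal sketch of the three n-gram types listed above, assuming the text from
# the earlier cells is available as AI (as used for AI_blank above); the token-list
# variable names are illustrative, not from the original notebook.
from nltk.tokenize import word_tokenize

AI_tokens = word_tokenize(AI)              # word-level tokens
AI_bigrams = list(bigrams(AI_tokens))      # pairs of consecutive tokens
AI_trigrams = list(trigrams(AI_tokens))    # triples of consecutive tokens
AI_4grams = list(ngrams(AI_tokens, 4))     # n-grams of length 4
print(AI_bigrams[:5], AI_trigrams[:5], AI_4grams[:5])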
Example no. 10
sent = sent_tokenize(data)
# word tokenize
words = word_tokenize(data)

from nltk.probability import FreqDist
fdist = FreqDist()

for word in words:
    fdist[word.lower()] += 1
print(fdist["space"])

fdist_most = fdist.most_common(10)
print(fdist_most)

from nltk.tokenize import blankline_tokenize
AIblank = blankline_tokenize(data)
print(len(AIblank))
print(AIblank[2])

from nltk.util import bigrams, trigrams, ngrams
string = "i have to write any code on my own, cause of by this way it is not helpful."
quarterToken = nltk.word_tokenize(string)
print("<<<<<< word tokenize >>>>>\n ", quarterToken)

bigram = list(nltk.bigrams(quarterToken))
print("<<<<<< bigrams >>>>>>>\n", bigram)

trigram = list(nltk.trigrams(quarterToken))
print("<<<<< trigrams >>>>>>>\n", trigram)

ngram = list(nltk.ngrams(quarterToken, 4))
Example no. 11
# -*- coding: utf-8 -*-
"""
Created on 2018/6/17

@author: Samuel
@Desc: 
@dependence: Nothing
"""
input_str = "Hi everyone! Hola gr8 &*$"
print(input_str.split())
from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize
output_str = word_tokenize(input_str)
print('word_tokenize: ')
print(output_str)
output_str = regexp_tokenize(input_str, pattern=r'\w+')
print('regexp_tokenize: ')
print(output_str)
output_str = regexp_tokenize(input_str, pattern=r'\d+')
print('regexp_tokenize: ')
print(output_str)
output_str = wordpunct_tokenize(input_str)
print('wordpunct_tokenize: ')
print(output_str)
output_str = blankline_tokenize(input_str)
print('blankline_tokenize: ')
print(output_str)
Example no. 12
def main():

    start_time = time.time()

    #stopwords_txt = set(open(config.general['stopwords']).read().split())
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(set(STOPWORDS))

    pickle_book_file = config.general["pickle_path"] / "book_list.pickle"

    # try to load the pickle
    book_list = jn.pickle_load(pickle_book_file)

    # if there is no pickle, extract the corpus
    if book_list is None:
        if not jn.create_dir(config.general["pickle_path"]):
            return 1

        book_list = extract_books(config.general["corpus_path"])

        # cut title, add paragraphs and sentences
        for book in book_list:
            book["original"]["tokens"] = word_tokenize(book["original"]["text"])
            book["original"]["paragraphs"] = blankline_tokenize(book["original"]["text"])
            book["original"]["sentences"] = sent_tokenize(book["original"]["text"])

        # add tokens bigrams and trigrams
        for book in book_list:
            token_list, bigram_list, trigram_list = [], [], []
            for sntc in book["original"]["sentences"]:
                tokens = word_tokenize(sntc)
                bigrams = list(nltk.bigrams(tokens))
                trigrams = list(nltk.trigrams(tokens))
                token_list.append(tokens)
                bigram_list.append(bigrams)
                trigram_list.append(trigrams)

            book["original"]["token_list"] = token_list
            book["original"]["bigram_list"] = bigram_list
            book["original"]["trigram_list"] = trigram_list

        #for word in book.lower().split():
        #preprocessed text
        punctuation = re.compile(r'[.,?!:;()|0-9]') #-
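        # NOTE: this compiled pattern appears unused below; punctuation is
        # actually stripped with re.sub(r'[^\w\s]', '', ...) in the loop that follows.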
        for book in book_list:
            preprocessed_sentences = []
            preprocessed_token_list, preprocessed_bigram_list, preprocessed_trigram_list = [], [], []
            cleaned_token_list = []

            new_text = re.sub(r'[^\w\s]', '', book["original"]["text"])
            preprocessed_text = new_text.lower()

            for sntc_tokenized in book["original"]["sentences"]:
                new_sentence = re.sub(r'[^\w\s]', '', sntc_tokenized)
                new_sentence = new_sentence.lower()

                new_tokens = word_tokenize(new_sentence)
                new_bigrams = list(nltk.bigrams(new_tokens))
                new_trigrams = list(nltk.trigrams(new_tokens))

                cleaned_tokens = []
                for word in new_tokens:
                    if word not in stopwords:
                        cleaned_tokens.append(word)

                preprocessed_sentences.append(new_sentence)
                preprocessed_token_list.append(new_tokens)
                preprocessed_bigram_list.append(new_bigrams)
                preprocessed_trigram_list.append(new_trigrams)
                cleaned_token_list.append(cleaned_tokens)
            book["original"]["tokens"] = word_tokenize(book["original"]["text"])
            book["preprocess"] = {
                "text": preprocessed_text,
                "tokens": word_tokenize(preprocessed_text),
                "sentences": preprocessed_sentences,
                "token_list": preprocessed_token_list,
                "brigram_list": preprocessed_bigram_list,
                "trigram_list": preprocessed_trigram_list,
                "cleaned_tokens": cleaned_token_list
            }

        # add word freq

        for book in book_list:
            fdist_original = FreqDist(word for word in word_tokenize(book["original"]["text"]))
            book["original"]["token_frequency"] = dict(fdist_original.items())

            fdist_preprocess = FreqDist(word.lower() for word in word_tokenize(book["preprocess"]["text"]))
            book["preprocess"]["token_frequency"] = dict(fdist_preprocess.items())



        jn.pickle_save(book_list, pickle_book_file)

    # TF.IDF
    pickle_tfidf_file = config.general["pickle_path"] / "tf_idf_dictionary.pickle"
    # try to load the pickle
    tf_idf_dictionary = jn.pickle_load(pickle_tfidf_file)

    # if there is no pickle, compute TF-IDF
    if tf_idf_dictionary is None:

        token_set = [book["preprocess"]["tokens"] for book in book_list]
        tf_idf_dictionary = tf_idf.get_tf_idf(token_set)
        jn.pickle_save(tf_idf_dictionary, pickle_tfidf_file)

    # TF.IDF without stopwords
    pickle_tfidf_nsw_file = config.general["pickle_path"] / "tf_idf_dictionary_nsw.pickle"
    # try to load the pickle
    tf_idf_dictionary_nsw = jn.pickle_load(pickle_tfidf_nsw_file)

    # if there is no pickle, compute TF-IDF without stopwords
    if tf_idf_dictionary_nsw is None:

        token_set = [book["preprocess"]["tokens"] for book in book_list]

        token_set_nsw = []
        for tokens in token_set:
            tokens_nsw = []
            for word in tokens:
                if word not in stopwords:
                    tokens_nsw.append(word)
            token_set_nsw.append(tokens_nsw)

        tf_idf_dictionary_nsw = tf_idf.get_tf_idf(token_set_nsw)
        jn.pickle_save(tf_idf_dictionary_nsw, pickle_tfidf_nsw_file)

    print("--- Preprocessing lasts %s seconds ---" % (time.time() - start_time))



    atmom = book_list[56]
    dagon = book_list[1]

    asd = tf_idf_dictionary_nsw[1]

    asd = {k: v for k, v in asd.items() if v > 0.002}

    text = dagon["original"]["text"]
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    displacy.serve(doc, style="ent")  #
    return 1
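    # NOTE: the early return above makes the POS/NER, chunking and word-cloud
    # code below unreachable; remove it (and the later return 1) to run that code.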
    tkn = atmom["preprocess"]["token_list"][0]
    # POS
    hey = nltk.pos_tag(tkn)
    # NER
    hoy = ne_chunk(hey)

    tkn2 = atmom["original"]["token_list"][0]
    # POS
    hey2 = nltk.pos_tag(tkn2)
    # NER
    hoy2 = ne_chunk(hey2)

    print(hey)
    print(hoy)
    print(hey2)
    print(hoy2)
    asd = 1
    #pst = LancasterStemmer()
    #print(atmom["sentences"][0])
    #print(pst.stem(atmom["sentences"][0]))

    q1 = "The big cat ate the little mouse who was after fresh cheese"
    nw_tk = nltk.pos_tag(word_tokenize(q1))
    print(nw_tk)

    grammar_np = r"NP: {<DT>?<JJ>*<NN>}"
    chunk_parser = nltk.RegexpParser(grammar_np)
    chunk_result = chunk_parser.parse(nw_tk)
    print(chunk_result)
    return 1
    #data = [
    #    [(word.replace(",", "")
    #      .replace(".", "")
    #      .replace("(", "")
    #      .replace(")", ""))
    #     for word in row[2].lower().split()]
    #    for row in reader]

    ## Removes header
    #data = data[1:]


    all_sentences = ""
    all_preprocessed_sentences = ""
    for book in book_list:
        for sntc in book["original"]["sentences"]:
            all_sentences = all_sentences + "\n" + sntc

        for sntc in book["preprocess"]["sentences"]:
            all_preprocessed_sentences = all_preprocessed_sentences + "\n" + sntc


    print("There are {} words in the combination of all review.".format(len(all_sentences)))

    # Create and generate a word cloud image:
    #wordcloud = WordCloud().generate(text)
    #wordcloud = WordCloud(max_words=30, background_color="white", collocations=False).generate(text)

    #wordcloud.to_file("img/first_review.png")

    #plt.imshow(wordcloud, interpolation='bilinear')
    #plt.axis("off")
    #plt.show()

    wordcloud = WordCloud(stopwords=stopwords, max_words=50, background_color="white", collocations=False).generate(all_sentences)

    wordcloud.to_file("img/review.png")

    # Display the generated image:
    #plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


    wordcloud = WordCloud(stopwords=stopwords, max_words=50, background_color="white", collocations=False).generate(all_preprocessed_sentences)

    wordcloud.to_file("img/refined_review.png")

    # Display the generated image:
    #plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Example no. 13

from nltk.tokenize import blankline_tokenize

python_data = '''Python is an interpreted high-level programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, notably using significant whitespace. It provides constructs that enable clear programming on both small and large scales.[27] In July 2018, Van Rossum stepped down as the leader in the language community after 30 years.[28][29]

Python features a dynamic type system and automatic memory management. It supports multiple programming paradigms, including object-oriented, imperative, functional and procedural, and has a large and comprehensive standard library.[30]

Python interpreters are available for many operating systems. CPython, the reference implementation of Python, is open source software[31] and has a community-based development model, as do nearly all of Python's other implementations. Python and CPython are managed by the non-profit Python Software Foundation'''

python_tokens = blankline_tokenize(python_data)

# Number of paragraphs found by blankline_tokenize

print(len(python_tokens))

# Print each paragraph returned by blankline_tokenize

for item in python_tokens:
    print(item)
Example no. 14
)

#ngrams
from nltk import ngrams

bigrams = ngrams(vocab_wo_punctuation, 2)
print(list(bigrams))

print(
    "_____________________Regex tokenizer____________________________________")
# different tokenize form

from nltk import regexp_tokenize
s2 = ("Alas, it has not rained today. When, do you think, will it rain again?")
print(regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False))
print(regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True))
print(nltk.word_tokenize(s2))

s3 = (
    "<p>Although this is <b>not</b> the case here, we must not relax our vigilance!</p>"
)
print(regexp_tokenize(s3, r'</?(b|p)>', gaps=False))
print(regexp_tokenize(s3, r'</?(b|p)>', gaps=True))

s4 = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

print(regexp_tokenize(s4, pattern=r'\w+|\$[\d\.]+|\S+'))
print(wordpunct_tokenize(s4))
print(blankline_tokenize(s4))
al = """The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar . Welcome Horatio , welcome good Marcellus Mar . What , ha ' s this thing appear ' d againe to night Bar . I haue seene nothing Mar . Horatio saies , ' tis but our Fantasie , And will not let beleefe take hold of him Touching this dreaded sight , twice seene of vs , Therefore I haue intreated him along With vs , to watch the minutes of this Night , That if againe this Apparition come , He may approue our eyes , and speake to it Hor . Tush , tush , ' twill not appeare Bar . Sit downe a - while , And let vs once againe assaile your eares , That are so fortified against our Story , What we two Nights haue seene Hor . Well , sit we downe , And let vs heare Barnardo speake of this Barn . Last night of all , When yond same Starre that ' s Westward from the Pole Had made his course t ' illume that part of Heauen Where now it burnes , Marcellus and my selfe , The Bell then beating one Mar . Peace , breake thee of : Enter the Ghost . Looke where it comes againe Barn . In the same figure , like the King that ' s dead Mar . Thou art a Scholler ; speake to it Horatio Barn . Lookes it not like the King ? Marke it Horatio Hora . Most like : It harrowes me with fear & wonder Barn . It would be spoke too Mar . Question it Horatio Hor . What art """
type(al)
al_token = word_tokenize(al)
len(al_token)
from nltk.probability import FreqDist
freqdist = FreqDist()

for word in al_token:
    freqdist[word.lower()] += 1

freqdist

fdist_top10 = freqdist.most_common(10)
from nltk.tokenize import blankline_tokenize
al_blank = blankline_tokenize(al)
len(al_blank)
from nltk.util import bigrams, trigrams, ngrams
strings = "Marke it Horatio Hora . Most like : It harrowes me with fear & wonder Barn . It would be spoke too Mar . Question it Horatio Hor . What art "
quotes_token = nltk.word_tokenize(strings)
quotes_bigrams = list(nltk.bigrams(quotes_token))
quotes_trigrams = list(nltk.trigrams(quotes_token))
quotes_ngrams = list(nltk.ngrams(quotes_token, 5))
#stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
ps.stem("speaking")
words_to_stem = ["give", "given", "gave"]
for word in words_to_stem:
    print(word + ":" + ps.stem(word))
Example no. 16
len(Ram1)

from nltk.probability import FreqDist
fdist=FreqDist()
fdist

for word in Ram1:
    fdist[word.lower()]+=1
fdist


mostcommon=fdist.most_common(10)
mostcommon

from nltk.tokenize import blankline_tokenize
Ram2=blankline_tokenize(Ram1)
Ram2

from nltk.util import bigrams, trigrams, ngrams

kavya='Kavya is born on 1996, and now she is working in capgemini'
kavya1=nltk.word_tokenize(kavya)
kavya1

len(kavya1)

kavya2=list(nltk.bigrams(kavya1))
kavya2

kavya3=list(nltk.trigrams(kavya1))
kavya3
Example no. 17
def paragraphs_get(text):
    from nltk.tokenize import blankline_tokenize
    return blankline_tokenize(text)
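
A quick usage sketch for the helper above; the sample text is illustrative:

sample = "First paragraph.\n\nSecond paragraph."
print(paragraphs_get(sample))  # ['First paragraph.', 'Second paragraph.']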
Example no. 18
__author__ = 'Mohammed Shokr <*****@*****.**>'

sent = "Hi Everyone !  How do you do ?"

print("# Split() built-in string function")
print(sent.split())

#-----------------------------------------------------#

print("# word_tokenize")
from nltk.tokenize import word_tokenize
print(word_tokenize(sent))

#-----------------------------------------------------#

from nltk.tokenize import regexp_tokenize, wordpunct_tokenize,blankline_tokenize

print("# RegEx -> splite text by RegEx")
print(regexp_tokenize(sent, pattern='\w+'))

print("# wordpunct_tokenize : split text words")
print(wordpunct_tokenize(sent))

print("# blankline_tokenize : split text lines")
print(blankline_tokenize(sent))
Example no. 19
                                'http://localhost:9998/')
 # print(result)
 content, metadata = result['content'], result['metadata']
 # print(content)
 # print(file_path)
 # print(content)
 length = len(content) if content is not None else content
 fbar.set_description(f'{file_path}: {length}')
 # TYPE
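 # 'full' keeps the extracted content as a single text, 'parablank' splits it on
 # blank lines (paragraphs), and 'paraline' splits it on physical lines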
 if content is None:
     texts = ['']
 elif text_type == 'full':
     texts = [content]
 elif text_type == 'parablank':
     texts = []
     for p in blankline_tokenize(content):
         texts.append(p)
 elif text_type == 'paraline':
     texts = []
     for p in line_tokenize(content):
         texts.append(p)
 else:
     raise NotImplementedError(text_type)
 # NORM
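 # 'stem' keeps alphanumeric tokens that are not stopwords and replaces each with
 # its stem; snow (a stemmer) and stop (a stopword set) come from the surrounding scope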
 if norm_type == 'stem':
     texts = [
         ' '.join(
             snow.stem(x) for x in word_tokenize(y)
             if x.isalnum() and x.lower() not in stop)
         for y in texts
     ]
Example no. 20
print("LENGTH OF THE TOKENS(words):", len(txt_tokens)
      )  #total number of items in the string is counted by the LEN method
print("")

print("THE NO.OF OCCURANCES OF WORD 'and':",
      fdist['and'])  #no of occurances of a paarticular word EG:and
print("")

fdist_top5 = fdist.most_common(5)
print("TOP 5 MOST USED WORDS or SYMBOLS:",
      fdist_top5)  #the top most used words or symbols
print("")

from nltk.tokenize import blankline_tokenize
txt_blank = blankline_tokenize(txt)
print("NO. OF PARAGRAPHS:", len(txt_blank))
print("")
# paragraphs are separated by blank lines

from nltk.util import bigrams, trigrams, ngrams

txt_bigrams = list(nltk.bigrams(txt_tokens))
print("DOUBLE TOKENS:")
for i in txt_bigrams:
    print(i)
print("")
#tokens were separated singly in the 1st method...now they are separated in pairs (bigrams)

#FOR triple and multi separated TOKENS, see the sketch below
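
A minimal sketch of the trigram/n-gram variant referred to above, reusing txt_tokens; the variable names are illustrative, not from the original:

txt_trigrams = list(nltk.trigrams(txt_tokens))
print("TRIPLE TOKENS:")
for i in txt_trigrams:
    print(i)
print("")

txt_ngrams = list(nltk.ngrams(txt_tokens, 4))
print("MULTI TOKENS (4-grams):")
for i in txt_ngrams:
    print(i)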