Example #1
# Assumes a project-specific helper module `main_functions` for reading the JSON file.
from pprint import pprint

import streamlit as st
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import main_functions

my_articles = main_functions.read_from_file("JSON_Files/response.json")

# Concatenate all article abstracts into one string.
str1 = ""
for i in my_articles["results"]:
    str1 = str1 + i["abstract"]

words = word_tokenize(str1)

# Keep only alphabetic tokens, lower-cased.
words_no_punc = []
for w in words:
    if w.isalpha():
        words_no_punc.append(w.lower())

fdist2 = FreqDist(words_no_punc)

# English stopword list (kept under a different name so the module is not shadowed).
stop_words = stopwords.words("english")

clean_words = []
for x in words_no_punc:
    if x not in stop_words:
        clean_words.append(x)

pprint(len(clean_words))

fdist3 = FreqDist(clean_words)

st.title("II - Frequency Distribution")
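The snippet ends at the Streamlit section title; a minimal, hypothetical continuation that charts the cleaned-word distribution in the same app could look like this (the chart code is an assumption, not part of the original source):

import matplotlib.pyplot as plt

# Hypothetical continuation: bar-chart the ten most common cleaned words.
top_words = fdist3.most_common(10)
fig, ax = plt.subplots()
ax.bar([w for w, _ in top_words], [c for _, c in top_words])
ax.set_ylabel("count")
ax.tick_params(axis="x", labelrotation=45)
st.pyplot(fig)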
Example #2
    def classify(self, feats):
        # Tally the label predicted by each classifier in the ensemble
        # and return the label with the most votes.
        counts = FreqDist()
        for classifier in self._classifiers:
            counts[classifier.classify(feats)] += 1
        return counts.max()
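This classify method only makes sense inside an ensemble wrapper that holds several already-trained classifiers; a minimal sketch of such a wrapper (the class name and constructor are illustrative, not taken from the original source):

from nltk.classify import ClassifierI
from nltk.probability import FreqDist


class VoteClassifier(ClassifierI):
    # Illustrative ensemble: each wrapped, already-trained NLTK classifier casts one vote.
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, feats):
        counts = FreqDist()
        for classifier in self._classifiers:
            counts[classifier.classify(feats)] += 1
        return counts.max()  # the label with the most votes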
Example #3
def _ngram_freqdist(words, n):
    # Count every full n-gram in the token list.
    return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - n + 1))
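A quick, hypothetical check of the helper above (assuming the function and the FreqDist import are in scope):

tokens = "to be or not to be".split()
bigram_counts = _ngram_freqdist(tokens, 2)
print(bigram_counts.most_common(2))  # [(('to', 'be'), 2), ...] -- ('to', 'be') occurs twice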
Example #4
# Cleaning. Assumes `words` is a list of raw text lines and that `fix_contraction`
# and `remove_punctuation` are helper functions defined elsewhere in the project.
import copy
import re

import nltk
from nltk.probability import FreqDist

for idx in range(len(words)):
    words[idx] = words[idx].lower()
    words[idx] = fix_contraction(words[idx])
    words[idx] = re.sub(r'\s+', ' ', words[idx])        # collapse whitespace, including newlines
    words[idx] = re.sub(r"\'", "", words[idx])          # remove single quotes
    words[idx] = re.sub(r'\S*_\S*\s?', '', words[idx])  # drop tokens containing underscores
    words[idx] = re.sub(r'\S*-\S*\s?', '', words[idx])  # drop tokens containing dashes
    words[idx] = remove_punctuation(words[idx])         # strip the remaining punctuation


words1 = [word for line in words for word in line.split()]

# First get word counts using the NLTK package.
fdist = FreqDist(words1)
fdist
# Convert into a plain dictionary.
wordDict = dict(fdist)


# Map an integer index to each distinct word.
dict1 = {}
counter = 0
for key, value in wordDict.items():
    dict1[counter] = key
    counter += 1

# Re-join words1 and tokenize it for NLTK's bigram functions.
wordsNew = copy.deepcopy(words1)
w3 = ' '.join(wordsNew)
token = nltk.word_tokenize(w3)
Example #5
import nltk.corpus
nltk.download('all')  # downloads every NLTK resource; only the tokenizer models are needed below
AI = 'This video will provide you with a comprehensive and detailed knowledge of Natural Language Processing, popularly known as NLP. You will also learn about the different steps involved in processing the human language like Tokenization, Stemming, Lemmatization and more. Python, NLTK, & Jupyter Notebook are used to demonstrate the concepts'

type(AI)

from nltk.tokenize import word_tokenize  # To create tokens

AI_token = word_tokenize(AI)
AI_token

len(AI_token)  #To find the length of tokens

from nltk.probability import FreqDist  #To check how many times tokens are repeated

fdist = FreqDist()

for word in AI_token:
    fdist[word.lower()] += 1
fdist

fdist['the']

len(fdist)

fdist_top5 = fdist.most_common(5)  # five most frequent tokens
fdist_top5

from nltk.tokenize import blankline_tokenize
AI_blankline = blankline_tokenize(AI)
AI_blankline
Example #6
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords

import string

corpus_root = "abstracts"
wordlists = PlaintextCorpusReader(corpus_root, '.*')

all_words_list = []
for fid in wordlists.fileids():
    try:
        all_words_list += list(wordlists.words(fid))
    except Exception as e:
        print(e)

fd = FreqDist(Text([w.lower() for w in all_words_list]))

vocabulary = fd.keys()
# Build the stopword set once instead of calling stopwords.words() for every token.
english_stopwords = set(stopwords.words("english"))
clean_vocabulary = [
    v for v in vocabulary
    if v not in english_stopwords and v not in string.punctuation
]

print(clean_vocabulary[:50])

# TODOs:
# 1. Take care of non-meaningful words, like "1", ").", etc.
# 2. In the whole vocabulary, there are some words like "\x00", why?
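For the first TODO, one simple option (an assumption, not part of the original code) is to keep only purely alphabetic tokens:

alpha_vocabulary = [v for v in clean_vocabulary if v.isalpha()]
print(alpha_vocabulary[:50])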
Example #7
from nltk.tokenize import word_tokenize

# `texto` and `sentencas` are assumed to have been built earlier (e.g. via sent_tokenize).
print(sentencas)
palavras = word_tokenize(texto.lower())
print(palavras)

from nltk.corpus import stopwords
from string import punctuation

stopwords = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [
    palavra for palavra in palavras if palavra not in stopwords
]
print(palavras_sem_stopwords)

from nltk.probability import FreqDist

frequencia = FreqDist(palavras_sem_stopwords)
for fr in frequencia:
    print(fr)

print('------------------------------\n')
print(frequencia.most_common(50))

from collections import defaultdict

sentencas_importantes = defaultdict(int)

for i, sentenca in enumerate(sentencas):
    for palavra in word_tokenize(sentenca.lower()):
        if palavra in frequencia:
            sentencas_importantes[i] += frequencia[palavra]
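A common follow-up to this scoring step (not shown in the snippet) is to keep the highest-scoring sentences as a summary; a minimal sketch, assuming the variables above:

from heapq import nlargest

# Indices of the four highest-scoring sentences, printed in document order.
indices_importantes = nlargest(4, sentencas_importantes, key=sentencas_importantes.get)
for i in sorted(indices_importantes):
    print(sentencas[i])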
Example #8
# Assumes `essays1` is a pandas DataFrame with an `essay` column.
from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(essays1.essay.iloc[0])
# print(tokenized_word)

from nltk.probability import FreqDist
fdist = FreqDist(tokenized_word)
fdist.most_common(2)

# Frequency Distribution Plot
import matplotlib.pyplot as plt

fdist.plot(30, cumulative=False)
plt.show()

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

from nltk.tokenize import sent_tokenize

tokenized_sent = sent_tokenize(essays1.essay.iloc[0])

filtered_sent = []
for w in tokenized_sent:
    if w not in stop_words:
        filtered_sent.append(w)

# filtered_sent

# Assumes `df` is a separate reviews DataFrame with 'Department Name' and 'polarity' columns.
y0 = df.loc[df['Department Name'] == 'Tops']['polarity']
y1 = df.loc[df['Department Name'] == 'Dresses']['polarity']
y2 = df.loc[df['Department Name'] == 'Bottoms']['polarity']
y3 = df.loc[df['Department Name'] == 'Intimate']['polarity']
y4 = df.loc[df['Department Name'] == 'Jackets']['polarity']
Example #9
    def getTokens(self):
        """Collect the most frequent bigrams and trigrams from the title and URI
        tokens of the top-ranked documents, keep the best normalized score per
        n-gram, and drop bigrams that are already covered by a trigram."""

        if self.scoreLowerLimit:
            limits = [self.seoDocumentLimit]
        else:
            limits = np.arange(5, self.seoDocumentLimit + 1, 5)
        blocks = ['title', 'uri']

        resultBigrams = {}
        resultTrigrams = {}

        for block in blocks:

            bestMandatoryTrigrams = {}
            bestMandatoryBigrams = {}

            for limit in limits:

                lowerLimit = self.scoreLowerLimit if self.scoreLowerLimit else max(
                    settings.MANDATORY_TOKEN_QUANTITY_LOWER_LIMIT,
                    int(limit * settings.MANDATORY_LOWER_LIMIT))

                if limit <= self.seoDocumentLimit:
                    trigrams = []
                    bigrams = []

                    for seoDocument in self.seoLibrary.seoDocuments[0:limit]:
                        try:
                            parsed_uri = urlparse(seoDocument.link)
                            domain = u'{uri.netloc}'.format(uri=parsed_uri)
                            tokenList = _getMandatoryBlockTokens(seoDocument,
                                                                 u'%sTokens' %
                                                                 block,
                                                                 unique=False)
                            trigrams.extend(
                                self._getTrigrams(domain, tokenList))
                            bigrams.extend(self._getBigrams(domain, tokenList))
                        except Exception as ex:
                            print(u'ERROR - %s - %s' % (seoDocument.link, ex))

                    if self.uniqueDomains:
                        trigrams = self._filterByDomain(trigrams)
                        bigrams = self._filterByDomain(bigrams)
                    else:
                        trigrams = [token for token, _domain in trigrams]
                        bigrams = [token for token, _domain in bigrams]

                    fdist = FreqDist(trigrams)
                    for token, score in fdist.most_common(
                            settings.MANDATORY_MOST_COMMON_LIMIT):
                        if score >= lowerLimit:
                            scoreNormalized = int(score * 100.00 / limit)
                            if token not in bestMandatoryTrigrams:
                                bestMandatoryTrigrams[token] = scoreNormalized
                            elif scoreNormalized > bestMandatoryTrigrams[token]:
                                bestMandatoryTrigrams[token] = scoreNormalized

                    fdist = FreqDist(bigrams)
                    for token, score in fdist.most_common(
                            settings.MANDATORY_MOST_COMMON_LIMIT):
                        if score >= lowerLimit:
                            scoreNormalized = int(score * 100.00 / limit)
                            if token not in bestMandatoryBigrams:
                                bestMandatoryBigrams[token] = scoreNormalized
                            elif scoreNormalized > bestMandatoryBigrams[token]:
                                bestMandatoryBigrams[token] = scoreNormalized

            resultBigrams.update(bestMandatoryBigrams)
            resultTrigrams.update(bestMandatoryTrigrams)

        # Keep a bigram only if it is not already contained in one of the trigrams.
        result = {}
        for token, score in resultBigrams.items():
            found = False
            for trigramToken in resultTrigrams.keys():
                if token in trigramToken:
                    found = True
                    break
            if not found:
                result[token] = score

        result.update(resultTrigrams)

        return result
Example #10
# Assumes `goodRaw` holds the raw text to analyse.
import operator

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

wordtokens = word_tokenize(goodRaw)

stop_words = set(stopwords.words("english"))

filtered_wordtokens = []
for w in wordtokens:
    if w not in stop_words:
        filtered_wordtokens.append(w)

fdist = FreqDist(filtered_wordtokens)

# Sort the (word, count) pairs by count, most frequent first.
sorted_fdist = sorted(fdist.items(), key=operator.itemgetter(1), reverse=True)

fObj = open('frequenceDistribution.txt', 'w')
for key in sorted_fdist:
    fObj.write(key[0] + ' ' + str(key[1]) + '\n')
fObj.close()

import matplotlib.pyplot as plt
import numpy as np

# fdist.plot(30, cumulative=False)
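As a side note, FreqDist can return the same counts already sorted in descending order, so the manual sort above could also be written as:

for word, count in fdist.most_common():
    print(word, count)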
Example #11
# #### 2. How about unigram tokens?

# In[7]:


# Assumes `corpus_tokens` was built in an earlier notebook cell.
tokens_num = len(corpus_tokens)
print('There are {} unigram tokens present in this corpus.'.format(tokens_num))


# #### 3. Produce a rank-frequency plot (similar to those seen on the Wikipedia page for Zipf's Law) for this corpus.

# In[5]:


from nltk.probability import FreqDist

fdist = FreqDist(corpus_tokens)


# In[11]:


import math

ranked_dist = fdist.most_common()
freq_log = []
rank_log = []

# Get the logs of frequencies and ranks
for rank, freq in enumerate(ranked_dist):
    # Compute the logs of frequency and rank
    log_f = math.log10(freq[1])
    log_r = math.log10(rank + 1)