def _get_features(f_tweets):

    _feature_vector = []
    pos_features_dist = []
    neg_features_dist = []
    neutral_features_dist = []

    for token, label in f_tweets:
        if label == 'positive':
            pos_features_dist.extend(token)
        elif label == 'negative':
            neg_features_dist.extend(token)
        else:
            neutral_features_dist.extend(token)

    pos_features_dist = probability.FreqDist(pos_features_dist)
    for key, value in pos_features_dist.items():
        _feature_vector.append(({key: value}, 'positive'))

    neg_features_dist = probability.FreqDist(neg_features_dist)
    for key, value in neg_features_dist.items():
        _feature_vector.append(({key: value}, 'negative'))

    neutral_features_dist = probability.FreqDist(neutral_features_dist)
    for key, value in neutral_features_dist.items():
        _feature_vector.append(({key: value}, 'neutral'))

    # e.g. _feature_vector == [({'car': 1}, 'positive'), ...]
    return _feature_vector
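
A minimal usage sketch (an assumption, not part of the original snippet): f_tweets is taken to be a list of (token_list, label) pairs, the import "from nltk import probability" is assumed to be in scope for _get_features, and the returned labeled featuresets are fed to NLTK's NaiveBayesClassifier.

from nltk.classify import NaiveBayesClassifier

# Hypothetical input: each element pairs a token list with a sentiment label.
f_tweets = [
    (['good', 'car'], 'positive'),
    (['bad', 'service'], 'negative'),
    (['average', 'ride'], 'neutral'),
]

feature_vector = _get_features(f_tweets)   # e.g. [({'good': 1}, 'positive'), ...]
classifier = NaiveBayesClassifier.train(feature_vector)
print(classifier.classify({'good': 1}))    # most likely 'positive'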
Example n. 2
 def find_naive_v1(self, min_size):
     frequencies = prob.FreqDist()
     for index in range(len(self.klass_values) - 1):
         frequencies.inc(self.klass_values[index])
         if frequencies[frequencies.max()] >= min_size:
             self.append(index)
             frequencies = prob.FreqDist()
Example n. 3
 def adjust_for_min_freq(self, min_size):
     prev = -1
     self.sort()
     to_remove,frequencies = [], prob.FreqDist()
     for breakpoint in self.data:
         frequencies.inc(self.klass_values[breakpoint], breakpoint - prev)
         if frequencies[frequencies.max()] < min_size:
             to_remove.append(breakpoint)
         else:
             frequencies = prob.FreqDist()
         prev = breakpoint    
     for item in to_remove:
         self.remove(item)
Example n. 4
 def savelocalfd(self):
     self.localfd = dict()
     for doc in self.docs:
         localfd = probability.FreqDist()
         for tok in doc.tokens():
             localfd.inc(tok)
         self.localfd[doc.fid] = localfd
Example n. 5
    def transform(self, query):
        '''
            transform converts the query string
            into words and their frequencies.

            Attributes:
                query (str) : query to transform. Required.

            return:
                dict
        '''

        # step 1 : drop special char
        query = re.sub('[^A-Za-z.]+', ' ', query)

        # Step 2 : tokenize
        query = word_tokenize(query)

        # step 3 : drop tokens shorter than 2 characters
        query = [i for i in query if len(i) > 1]

        # step 4 : count token frequencies
        query = probability.FreqDist(query)

        # step 5 : convert to a plain dict for searching
        query = dict(query)

        return query
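
The same pipeline as a standalone sketch (an assumption for illustration; word_tokenize needs NLTK's Punkt tokenizer data to be downloaded):

import re
from nltk import probability
from nltk.tokenize import word_tokenize

query = "How does NLTK's FreqDist count word frequencies?"
query = re.sub('[^A-Za-z.]+', ' ', query)                 # step 1: drop special characters
tokens = [t for t in word_tokenize(query) if len(t) > 1]  # steps 2-3: tokenize, drop short tokens
print(dict(probability.FreqDist(tokens)))                 # steps 4-5: count and convert
# {'How': 1, 'does': 1, 'NLTK': 1, 'FreqDist': 1, 'count': 1, 'word': 1, 'frequencies': 1}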
Example n. 6
def doTheThing(fileContents):
    #    TOKENIZATION
    tokenizedWords = tokenize.word_tokenize(fileContents)

    #     STOPWORDS
    filteredWords = []
    stop_words = set(corpus.stopwords.words('english'))
    for w in tokenizedWords:
        if w not in stop_words:
            filteredWords.append(w)

    #     FREQUENCY DISTRIBUTION
    freqDist = probability.FreqDist(tokenizedWords)

    #     STEMMING
    ps = stem.PorterStemmer()
    stemmedWords = []
    for w in filteredWords:
        stemmedWords.append(ps.stem(w))

    #     LEMMATIZATION
    wnl = stem.WordNetLemmatizer()
    lemmatizedWords = []
    for w in filteredWords:
        lemmatizedWords.append(wnl.lemmatize(w, "v"))
    return [
        tokenizedWords, filteredWords, freqDist, stemmedWords, lemmatizedWords
    ]
Example n. 7
 def buildFreqMap(self, text):
     freq_dict = probability.FreqDist()
     tokens = self.tokenize(text)
     stop_words_removed = self.remove_stop_words(tokens)
     for word in self.stem_text(stop_words_removed):
         freq_dict.inc(word.lower())
     return freq_dict
Example n. 8
def train(cmd_args, corpus_files, model):
    """ Trains statistical model. """
    for lang in corpus_files:

        text = udhr2.raw(lang)
        #print("lang:", lang, "; length:", len(text))
        # Replace multiple whitespaces (including ' ', '\n', '\t') with just one ' '
        text = re.sub(r'\s+', ' ', text)

        # Skip empty or very short files, like nku.txt
        if len(text) < 1000:
            #print("skipping pathological file", lang)
            model.deleted_langs.append(lang)
            continue

        model.ngrams[lang] = {}
        model.smoothed[lang] = []

        if cmd_args.cross_valid:
            # Hold out the first test_len characters for the test set
            model.tests[lang] = text[:cmd_args.test_len]
            text = text[cmd_args.test_len:]

        # Build ngrams for each language in training
        model.ngrams[lang] = char_freqs(text, cmd_args.n_order)

        model.smoothed[lang] = probability.LaplaceProbDist(
            probability.FreqDist(model.ngrams[lang]))
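
char_freqs is not shown in this snippet; a plausible stand-in (an assumption, not the author's implementation) counts overlapping character n-grams and is compatible with the FreqDist and LaplaceProbDist calls above:

def char_freqs(text, n_order):
    # Count overlapping character n-grams of length n_order (hypothetical helper).
    counts = {}
    for i in range(len(text) - n_order + 1):
        gram = text[i:i + n_order]
        counts[gram] = counts.get(gram, 0) + 1
    return counts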
Example n. 9
 def distribution(self, tokens, laplace=True):
     fd = probability.FreqDist()
     for word in tokens:
        fd.inc(word)
     if laplace:
         return probability.LaplaceProbDist(fd)
     else:
         return probability.MLEProbDist(fd)
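
For context on the laplace flag, a small comparison (not from the original source): the MLE estimate gives unseen tokens zero probability, while Laplace smoothing reserves some mass for them.

from nltk import probability

fd = probability.FreqDist(['a', 'a', 'b'])
mle = probability.MLEProbDist(fd)
lap = probability.LaplaceProbDist(fd)

print(mle.prob('a'), mle.prob('c'))   # 0.666..., 0.0
print(lap.prob('a'), lap.prob('c'))   # 0.6, 0.2 -- unseen 'c' still gets some probability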
Example n. 10
def count_stems(corpus):
    fd = probability.FreqDist()

    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue
        fd.inc(stemmer.stem(w))

    return fd
Example n. 11
def count_hypernyms(corpus):
    fd = probability.FreqDist()

    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue

        for syn in wordnet.synsets(w):
            if syn.pos != 'n': continue

            for path in syn.hypernym_paths():
                for hyp in path:
                    fd.inc(hyp.name)

    return fd
Example n. 12
 def savelocaldist(self, laplace = True, savetokens = False):
     self.localdist = dict()
     
     for doc in self.docs:
         if savetokens:
             doc.terms = []
         localfd = probability.FreqDist()
         for tok in doc.tokens():
             if savetokens:
                 doc.terms.append(tok)
             localfd.inc(tok)
         if localfd.N() > 0:
             if laplace:
                 self.localdist[doc.fid] = probability.LaplaceProbDist(localfd)
             else:
                 self.localdist[doc.fid] = probability.MLEProbDist(localfd)
Example n. 13
    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        @param unk: instance of a POS tagger, conforms to TaggerI
        @type  unk:(TaggerI)
        @param Trained: Indication that the POS tagger is trained or not
        @type  Trained: boolean
        @param N: Beam search degree (see above)
        @type  N:(int)
        @param C: Capitalization flag 
        @type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability
      
        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = probability.FreqDist()
        self._bi   = probability.ConditionalFreqDist()
        self._tri  = probability.ConditionalFreqDist() 
        self._wd   = probability.ConditionalFreqDist()
        self._eos  = probability.ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained
      
        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0
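
A brief usage sketch for this tagger, assuming it is NLTK's TnT implementation (nltk.tag.tnt) and that the treebank sample corpus has been downloaded:

from nltk.corpus import treebank
from nltk.tag import tnt

tagger = tnt.TnT(N=1000, C=False)
tagger.train(treebank.tagged_sents()[:500])   # train on a list of tagged sentences

print(tagger.tag(['The', 'cat', 'sat', 'on', 'the', 'mat', '.']))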
Example n. 14
def doTheThing(fileContents, mode):
    result = []
    #    TOKENIZATION
    if mode >= 0:
        tokenizedWords = tokenize.word_tokenize(fileContents)
        print('Tokenization...')
        result.append(tokenizedWords)

    #     STOPWORDS
    if mode >= 1:
        print('Stopwords...')
        filteredWords=[]
        stop_words = set(get_stop_words('polish'))
        for w in tokenizedWords:
            if w not in stop_words:
                filteredWords.append(w)
        result.append(filteredWords)

    #     FREQUENCY DISTRIBUTION
    if mode >= 2:
        print('FrequencyDistribution...')
        freqDist = probability.FreqDist(filteredWords)
        result.append( freqDist )

    #     STEMMING
    if mode >= 3:
        print('Stemming...')
        ps = stem.PorterStemmer()
        stemmedWords = []
        for w in filteredWords:
            stemmedWords.append(ps.stem(w))
        result.append(stemmedWords)

    #     LEMMATIZATION
    if mode >= 4:
        print('Lemmatization...')
        wnl = stem.WordNetLemmatizer()
        lemmatizedWords = []
        for w in filteredWords:
            lemmatizedWords.append(wnl.lemmatize(w, "v"))
        result.append(lemmatizedWords)
    return result
Example n. 15
 def globaldist(self, laplace=True):
     '''
     Return a global probability distribution for a set of documents.
     May run into memory problems if the document set is too large.
     Uses Laplace smoothing by default.
     Stores the result in gdist, which holds the global distribution;
     clear this attribute after use to free the memory.
     '''
     fd = probability.FreqDist()
     for doc in self.docs:
         tokens = None
         if doc.terms is None:
             tokens = doc.tokens()
         else:
             tokens = doc.terms
         for tok in tokens:
             fd.inc(tok)
     if laplace:
         self.gdist = probability.LaplaceProbDist(fd)
     else:
         self.gdist = probability.MLEProbDist(fd)
     return self.gdist
Example n. 16
import nltk.probability as p
import nltk.tokenize as tk
from nltk.corpus import stopwords

file_path = '../../DataSets/Test/DE_EN_(tatoeba)_test.txt'

text = ''
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        text += line.split('\t')[0].lower().replace('.', '').replace('?',
                                                                     '') + ' '

blob = tk.casual_tokenize(text, strip_handles=True)

better_blob = []
stop_words = set(stopwords.words('english'))

for word in blob:
    if not (len(word) <= 3 or word == 'mary' or word == "tom's"
            or word == "mary's"):
        better_blob.append(word)

filtered_blob = [w for w in better_blob if w not in stop_words]

heu = p.FreqDist(filtered_blob).most_common(100)

for i in range(len(heu)):
    print(heu[i])
Example n. 17
def class_distribution(base_path):
    training = format.C45_FORMAT.get_training_instances(base_path)
    freq_dist = probability.FreqDist()
    for each in training:
        freq_dist.inc(each.klass_value)
    return freq_dist
Example n. 18
def word_counts(words):
    return dict(probability.FreqDist((w, 1) for w in words))
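
For comparison (not part of the original snippet), passing the raw word sequence to FreqDist already yields a word-to-count mapping:

from nltk import probability

words = ['to', 'be', 'or', 'not', 'to', 'be']
print(dict(probability.FreqDist(words)))           # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
print(probability.FreqDist(words).most_common(2))  # [('to', 2), ('be', 2)]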
Example n. 19
import nltk
import nltk.probability as pro

br = nltk.corpus.brown
freq = pro.FreqDist([ w for cat in br.categories() for w in br.words(categories=cat)])

word3 = [w for w in freq.keys() if freq[w]>3]
print('Number of words occurring more than 3 times:', len(word3))
Example n. 20
# -*- coding: utf-8 -*-
# tmeEvijv.py

import nltk, re, operator
from nltk.book import *
from nltk import probability
from nltk.corpus import udhr

######################## Exercice 1 #################################

text5Freq = probability.FreqDist(text5)
sortedList = sorted(text5Freq.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
mots4lettres = [w[0] for w in sortedList if len(w[0]) == 4]
# print(mots4lettres)

######################## Exercice 2 #################################

wordsHat = []
wordsZ = []
wordsPT = []
for i in set(text6):
    reHat = re.search('(?i).*hat$', i)
    if reHat is not None:
        wordsHat.append(reHat.group())

    reZ = re.search('(?i).*z.*', i)
    if reZ is not None:
        wordsZ.append(reZ.group())
Example n. 21
            comment = ' '.join(comment.split())
            texts.append(comment)

        texts = texts[1:]
    texts = texts + train_y_sentence

    texts_sentences = []
    pattern = re.compile('([a-z \,]+\.)')

    for x in texts:
        results = pattern.findall(x)
        for j in results:
            texts_sentences.append(j)

    starting = pb.FreqDist()
    transitional = pb.ConditionalFreqDist()
    emissional = pb.ConditionalFreqDist()
    pi = pb.FreqDist()

    for row in test_y_sentence:
        pi[row[0]] += 1

    for row in texts_sentences:
        lasts = None
        for ch in list(row):
            if lasts is not None:
                transitional[lasts][ch] += 1
            lasts = ch

    for row in train_data:
Example n. 22
def majority_klass_vote(instances):
    fd = prob.FreqDist()
    for each in instances:
        fd.inc(each.klass_value)
    return fd.max()
Example n. 23
 def empty_freq_dists(self):
     return dict([(value, prob.FreqDist()) for value in self.values])
Example n. 24
 def fd(self, tokens):
     fd = probability.FreqDist()
     for term in tokens:
         fd.inc(term)
     return fd
Example n. 25
def entropy_of_key_counts(dictionary):
    freq_dist = prob.FreqDist()
    klasses = dictionary.keys()
    for klass in klasses:
        freq_dist.inc(klass, dictionary[klass])
    return entropy_of_freq_dist(freq_dist)
Example n. 26
def entropy(values):
    freq_dist = prob.FreqDist()
    for value in values:
        freq_dist.inc(value)
    return entropy_of_freq_dist(freq_dist)
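
entropy_of_freq_dist is not shown in these snippets; a plausible sketch (an assumption, not the original helper) computes the Shannon entropy of the counts in bits:

import math

def entropy_of_freq_dist(freq_dist):
    # Hypothetical helper: Shannon entropy (in bits) of the empirical distribution.
    total = freq_dist.N()
    return -sum((c / total) * math.log2(c / total)
                for c in freq_dist.values() if c > 0)

# e.g. entropy_of_freq_dist(prob.FreqDist(['a', 'a', 'b', 'c'])) == 1.5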
Example n. 27
 def class_freq_dist(self):
     class_freq_dist = prob.FreqDist()
     for instance in self.data:
         class_freq_dist.inc(instance.klass_value)
     return class_freq_dist