Example 1
    def simhash(raw_text):
        """Compute the simhash value for a string."""
        fdist = FreqDist()
        for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
            fdist[word.lower()] += 1

        v = [0] * 128

        for word in fdist:
            # 128-bit projection of the word, taken from its MD5 digest
            projection = bitarray()
            projection.frombytes(hashlib.md5(word.encode('utf-8')).digest())
            #print("\tw:%s, %d" % (word, fdist[word]))
            #print("\t\t 128 bit hash: " + str(projection))

            for i in range(128):
                if projection[i]:
                    v[i] += fdist.get(word)
                else:
                    v[i] -= fdist.get(word)


        hash_val = bitarray(128)
        hash_val.setall(False)

        for i in range(128):
            if v[i] > 0:
                hash_val[i] = True
        return hash_val
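
A quick usage sketch for the function above (assuming `simhash` is available as a plain function along with the `bitarray`, `hashlib`, and NLTK imports it relies on): near-duplicate strings should produce simhash values that differ in only a few of the 128 bits, so the Hamming distance of the XOR is the usual similarity measure.

# Minimal sketch, not part of the original source: compare two simhashes
# by Hamming distance (bitarray supports XOR and bit counting).
a = simhash("the quick brown fox jumps over the lazy dog")
b = simhash("the quick brown fox jumped over the lazy dog")
print((a ^ b).count())  # number of differing bits out of 128; small = similar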
Example 2
    def add_documents(self, document_entries):
        """
        Add new documents to be indexed.
        :param document_entries: a set of objects from the class DocumentEntry
        """
        if document_entries:
            forward = {
                key: {(document_entries[key][0])}
                for key in document_entries.keys()
            }
            for key in document_entries.keys():
                freq_dist = FreqDist(document_entries[key][1])
                for token in document_entries[key][1]:
                    if len(self.inverted_index) == 0 \
                            or self.__normalize(token) not in self.inverted_index:
                        self.inverted_index[self.__normalize(token)] \
                            .add((freq_dist.get(token), key, freq_dist.freq(token)))
                    else:
                        if freq_dist.get(token) is not None:
                            self.inverted_index[self.__normalize(token)] \
                                .add((freq_dist.get(token), key, freq_dist.freq(token)))

            self.forward_index.update(forward)
            self.tf_idf()
            self.dal.save(self.forward_index, 'forward_index.csv')
            self.dal.save(self.inverted_index, 'inverted_index.csv')
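
Despite the docstring, the method above indexes `document_entries` as a mapping: element `[0]` of each value goes into the forward index and element `[1]` is the token list handed to `FreqDist` (and `self.inverted_index` is presumably a `defaultdict(set)`, since missing keys are written to with `.add()`). A purely hypothetical sketch of that input shape, for illustration only; the real `DocumentEntry` class is not shown in the excerpt.

# Hypothetical input shape implied by the indexing code above.
document_entries = {
    "doc-1": ("metadata for doc-1", ["the", "cat", "sat", "on", "the", "mat"]),
    "doc-2": ("metadata for doc-2", ["the", "dog", "barked"]),
}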
Example 3
 def calc_bigram_probability(self, string, bigrams):
     bigram_dist = FreqDist(
         bigrams)  #Create Bigram distribution using FreqDist() function
     unigrams, string = generate_ngram(string, 1)
     self.unigram_dist = FreqDist(
         unigrams)  #Create Unigram distribution using FreqDist() function
     self.vocab_size = len(set(
         unigrams))  #vocab size is the total number of distinct characters in the dataset
     for bigram in set(bigrams):
         bigram_count = bigram_dist.get(bigram)  #get count of bigram
         unigram_count = self.unigram_dist.get(
             bigram[:-1])  #get count of unigram
         probability = Fraction(
             bigram_count + 1, unigram_count +
             self.vocab_size)  #Calculate probability with add-one smoothing
         temp = {
             bigram[-1]: probability
         }  #Storing in temp dictionary - last char in bigram and probability
         if bigram[0] in self.bigram_probabilities:
             self.bigram_probabilities[bigram[0]].update(
                 temp
             )  #update main dictionary with temp- {bigram[0]: {bigram[1]:probability}}
         else:
             self.bigram_probabilities.setdefault(
                 bigram[0], {}
             )  #If encountered a character first time, add it to main dictionary as a dict object
             self.bigram_probabilities[bigram[0]].update(temp)
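
The estimate above is add-one (Laplace) smoothing: P(c2 | c1) = (count(c1 c2) + 1) / (count(c1) + V), where V is the number of distinct characters. A self-contained sketch of the same calculation, assuming character bigrams over a toy string (the `generate_ngram` helper used above is not shown):

from fractions import Fraction
from nltk.probability import FreqDist

chars = list("abracadabra")
bigrams = list(zip(chars, chars[1:]))
bigram_dist = FreqDist(bigrams)
unigram_dist = FreqDist(chars)
vocab_size = len(set(chars))  # V = 5 distinct characters

# P('b' | 'a') with add-one smoothing: (count(('a', 'b')) + 1) / (count('a') + V)
p = Fraction(bigram_dist[("a", "b")] + 1, unigram_dist["a"] + vocab_size)
print(p)  # 3/10, since count(('a', 'b')) = 2, count('a') = 5, V = 5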
Example 4
def replace_unknown(tokens, cut_off):
    """ Replace tokens which appear (> cutoff) times in the corpus with <UNK>.

        :param tokens: (list of str) the tokens comprising the corpus.
        :param cut_off: (int) tokens whose count is <= cut_off are replaced.
        :return: The same list of tokens with each low-frequency token replaced by <UNK>.

        [Hint]
        use nltk.FreqDist when you build a vocab.

        [Example]
        tokens = ['<s>', "don't", 'put', 'off', 'until', 'tomorrow', 'what', 'you', 'can', 'do', 'today']
        Suppose the counts for each word in tokens are
        "don't" - 1, 'put' - 1, 'off' - 0, 'until' - 0, 'tomorrow' - 1, 'what' - 1, 'you' - 1, 'can' - 1,
        'do' - 1, 'today' - 1
        and cutoff = 0.
        Then the output should be
        ['<s>', "don't", 'put', '<UNK>', '<UNK>', 'tomorrow', 'what', 'you', 'can', 'do', 'today']
    """
    from nltk.probability import FreqDist
    fdist = FreqDist(tokens)
    #for word in tokens:
    #    fdist[word.lower()] += 1
    tokens = [
        UNK
        if word != SOS and word != EOS and fdist.get(word) <= cut_off else word
        for word in tokens
    ]
    print(tokens)
    return tokens
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    
    # Lemmatize it
    word_freq = {}
    
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        
        # If it exists in word_freq, add to its value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
            
        # Else, assign value
        else:
            word_freq[lemmatize_term] = val
    
    
    return word_freq
Example 6
    def FREQ(self, threshold):
        tagged = []
        nouns = []
        noun_phrases = []
        sorted_fdist = []
        result = []
        for s in self.tokens:
            print(s)
            temp = nltk.pos_tag(s)
            print(temp)
            tagged.append(temp)
            nouns = nouns + list(
                filter(lambda x: "NN" in x[1], temp))
            noun_phrases = noun_phrases + self.get_noun_phrases(s)

            fdist = FreqDist(word.lower() for word in s)
            for x in fdist.keys():
                sorted_fdist.append((fdist.get(x), x))
        sorted_fdist.sort()

        nouns_r = set([x[0] for x in nouns])
        noun_phrases = set(noun_phrases)
        print("=================================")
        print("NOUNS:", nouns)
        print("NOUNSPHRA:", noun_phrases)
        print("FREQ:", sorted_fdist)
        t = list(
            filter(lambda x: x[0] >= threshold and x[1] in nouns_r,
                   sorted_fdist))
        print(t)
        t_r = [x[1] for x in t]
        print("T_R", t_r)
        result = t_r + list(noun_phrases)
        print("RESULT", set(result))
        return set(result)
Example 7
class UnigramModel:
    def __init__(self, text):
        self.unigrams, self.string = generate_ngram(
            text, 1
        )  #Initialize list of unigrams by calling generate ngram method with n=1
        self.unigram_probabilities = {
        }  #Dictionary to store unigram and corresponding probability
        self.vocab_size = 0
        self.unigram_dist = FreqDist()

    def calc_unigram_probability(self, unigrams):
        self.unigram_dist = FreqDist(
            unigrams)  #Create Unigram distribution using FreqDist() function
        total_count = len(unigrams)  #Total count of unigrams in training set
        self.vocab_size = len(
            set(unigrams))  #Count of distinct unigrams (characters)
        for unigram in set(unigrams):
            unigram_count = self.unigram_dist.get(
                unigram)  #get count of occurrence for particular unigram

            #Add one to numerator and V to denominator for Add-one smoothing
            self.unigram_probabilities.update({
                unigram:
                Fraction(unigram_count + 1, total_count + self.vocab_size)
            })  #Create dictionary of unigram and its corresponding probability
Example 8
def replace_unknown(tokens, cut_off):
    """ Replace tokens which appear (<= cutoff) times in the corpus with <UNK>.

        :param tokens: (list of str) the tokens comprising the corpus.
        :param cut_off: (int) tokens whose count is <= cut_off are replaced.
        :return: The same list of tokens with each low-frequency token replaced by <UNK>.

        [Hint]
        use nltk.FreqDist when you build a vocab.

        [Example]
        tokens = ['<s>', 'I', 'love', 'cake', 'I', 'like', 'cake', '</s>']
        Then, here are the counts for each word in tokens:
        'I': 2, 'love': 1, 'cake': 2, 'like': 1
        and cutoff = 1
        Then output should be
        ['<s>', 'I', UNK, 'cake', 'I', UNK, 'cake', '</s>']
    """
    from nltk.probability import FreqDist
    fdist = FreqDist(tokens)
    #for word in tokens:
    #    fdist[word.lower()] += 1
    tokens = [
        UNK
        if word != SOS and word != EOS and fdist.get(word) <= cut_off else word
        for word in tokens
    ]
    return tokens
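
A small usage sketch for the docstring example above. `UNK`, `SOS`, and `EOS` are module-level constants in the original source (not shown in the excerpt), so they are assumed here.

# Assumed sentinel values; the excerpt relies on module-level UNK/SOS/EOS.
UNK, SOS, EOS = "<UNK>", "<s>", "</s>"

tokens = ["<s>", "I", "love", "cake", "I", "like", "cake", "</s>"]
print(replace_unknown(tokens, cut_off=1))
# ['<s>', 'I', '<UNK>', 'cake', 'I', '<UNK>', 'cake', '</s>']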
Example 9
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()

    # Tokenize it
    tokens = word_tokenize(lower_data)

    # tokens = re.sub(r"[']+", r"",tokens)
    # tokens = [i.replace("[\']+","") for i in tokens]
    tokens2 = []
    for i in tokens:
        i = re.sub(r'[-]+', ' ', i)
        i = re.sub(r"[']+", '', i)
        i = re.sub(r'[com]$', '', i)
        # re.sub('')
        i = re.sub(r'[0-9]+', '', i)
        i = re.sub(r'\*+', '', i)
        i = re.sub(r'[\n]+', '', i)
        i = re.sub(r'[_]+', ' ', i)
        i = re.sub(r'[\d.*?]+', '', i)
        # if(i[0] == "\'"):
        # i= i[1:]
        a = word_tokenize(i)
        for x in a:
            tokens2.append(x)
    # print tokens
    print(tokens2)
    tokens = tokens2

    # for i in tokens:
    # i.replace(r"[']+",r"")
    # print tokens
    freq_dist = FreqDist(tokens)

    # Lemmatize it
    word_freq = {}

    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)

    # If it exists in word_freq, add to its value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val

        # Else, assign value
        else:
            word_freq[lemmatize_term] = val

    word_freq = remove_stop_words(word_freq)
    print(word_freq)
    # word_freq = remove_punctuation(word_freq)
    # print word_freq
    # word_freq = word_freq.map(lambda x: str_stemmer_wo_parser(x))

    return word_freq
Example 10
class TrigramModel:
    def __init__(self, text):
        self.trigrams, self.string = generate_ngram(
            text,
            3)  #Initialize list of trigrams by calling generate ngram method
        #self.string will be used to generate bigram and unigram distributions for later use
        self.trigram_probabilities = {
        }  #Dictionary to store trigram and corresponding probability
        self.vocab_size = 0
        self.bigram_dist = FreqDist()

    def calc_trigram_probability(self, string, trigrams):
        trigram_dist = FreqDist(
            trigrams)  #Create Trigram distribution using FreqDist() function
        bigrams, string = generate_ngram(string, 2)
        self.bigram_dist = FreqDist(
            bigrams)  #Create Bigram distribution using FreqDist() function
        unigrams, string = generate_ngram(string, 1)
        unigram_dist = FreqDist(
            unigrams
        )  #Create Unigram distribution using FreqDist() function to calculate vocab size of dataset.
        self.vocab_size = len(set(unigrams))

        for trigram in set(trigrams):
            trigram_count = trigram_dist.get(
                trigram)  #get count of occurrence of trigram
            bigram_count = self.bigram_dist.get(
                trigram[:-1])  #get count of occurrence of preceding bigram
            probability = Fraction(
                trigram_count + 1, bigram_count +
                self.vocab_size)  #Calculate probability with add-one smoothing
            temp = {trigram[2]: probability}  #Store in temp dictionary
            if trigram[0] in self.trigram_probabilities:
                if trigram[1] in self.trigram_probabilities.get(trigram[0]):
                    self.trigram_probabilities.get(
                        trigram[0])[trigram[1]].update(
                            temp)  #Storing in 3 level dictionary as -
                else:  #{trigram[0]:{trigram[1]:{trigram[2]:probability}}}
                    self.trigram_probabilities.get(trigram[0]).setdefault(
                        trigram[1], {})
                    self.trigram_probabilities.get(
                        trigram[0])[trigram[1]].update(temp)
            else:
                self.trigram_probabilities.setdefault(trigram[0], {})
                self.trigram_probabilities.get(trigram[0]).setdefault(
                    trigram[1], {})
                self.trigram_probabilities.get(
                    trigram[0])[trigram[1]].update(temp)
Example 11
def featureList(corpus):
    featList = []
    for trFile in corpus.fileids():
        listItem = [0]*noFeat
        fileFreqDist = FreqDist()
        fileFreqDist = nltk.FreqDist(corpus.words(trFile))
        
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
            
        featList.append(listItem)
        
    return featList
 def featureList(corpus):
     featList = []
     for post in corpus:
         listItem = [0]*noFeat
         fileFreqDist = FreqDist()
         fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
         
         i = 0
         for key in trainKeys:
             if key in fileFreqDist:
                 listItem[i] = fileFreqDist.get(key)
             i = i + 1
             
         featList.append(listItem)
         
     return featList
Example 13
    def featureList(corpus):
        featList = []
        for post in corpus:
            listItem = [0] * noFeat
            fileFreqDist = FreqDist()
            fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

            i = 0
            for key in trainKeys:
                if key in fileFreqDist:
                    listItem[i] = fileFreqDist.get(key)
                i = i + 1

            featList.append(listItem)

        return featList
def createFeatures(sentVect, ordList):
    
    noFeat = len(ordList)
    
    featList = []
    for post in sentVect:
        listItem = [0]*noFeat
        fileFreqDist = FreqDist()
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
            
        i = 0
        for key in ordList:
            if key in fileFreqDist:
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
                
        featList.append(listItem)
            
    return featList
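
The `featureList`/`createFeatures` variants above all build the same fixed-length bag-of-words count vector over a predefined key list. Since `FreqDist` indexing already returns 0 for unseen words, the membership test can be dropped entirely; a compact sketch of the same idea (assuming the NLTK `punkt` tokenizer data is available for `word_tokenize`):

import nltk
from nltk.probability import FreqDist

def count_vector(post, vocabulary):
    """One count per vocabulary word, in vocabulary order."""
    fdist = FreqDist(nltk.word_tokenize(post))
    return [fdist[word] for word in vocabulary]  # FreqDist[...] is 0 for unseen words

print(count_vector("the cat sat on the mat", ["the", "cat", "dog"]))  # [2, 1, 0]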
Example 15
    def get_frequency(self, flag):
        fdist = {}
        sentences = []
        if flag == "text":
            sentences = self.tokens_text
        else:
            sentences = self.tokens_corpus

        for s in sentences:
            for w in s:
                fdist[w.lower()] = 0

        for s in sentences:
            t = FreqDist(word.lower() for word in s)
            #             print("FRECUENCIAS")
            for x in t.keys():
                #                 print("{}->{}".format(x,t.get(x)))
                fdist[x] += t.get(x)
        return fdist
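
Since every word in `fdist` is pre-seeded with 0 and then incremented by the per-sentence counts, the loop above is equivalent to one FreqDist over all lowercased tokens; a short sketch of that alternative, assuming `sentences` is a list of token lists as above:

from nltk.probability import FreqDist

# Equivalent single-pass computation over all sentences.
fdist = FreqDist(word.lower() for s in sentences for word in s)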
Example 16
def createFeatures(sentVect, ordList):

    noFeat = len(ordList)

    featList = []
    for post in sentVect:
        listItem = [0] * noFeat
        fileFreqDist = FreqDist()
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

        i = 0
        for key in ordList:
            if key in fileFreqDist:
                listItem[i] = fileFreqDist.get(key)
            i = i + 1

        featList.append(listItem)

    return featList
def createProbDist(readerWordlist,writerUniqueWordlist):      
        
    ### list to store the relative frequency of each term
    prob_dist = []

    ### using nltk, calculate the frequency of each word
    unigramWordList = FreqDist(readerWordlist)
    datalen = len(readerWordlist) ### total number of words in that document
    #print len(unigramWordList)
    #print datalen
    
    for word in writerUniqueWordlist:
        if word in unigramWordList:
            #print word
            #print unigramWordList.get(word)
            #print unigramWordList.get(word)/float(datalen)
            prob_dist.append(unigramWordList.get(word)/float(datalen))
        else:
            prob_dist.append(0)
            
    #print prob_dist
    return prob_dist
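
A brief usage sketch for `createProbDist` with made-up word lists: the result is one relative frequency per word of `writerUniqueWordlist`, in order, with 0 for words the reader never used.

# Illustrative call with made-up inputs (not from the original source).
reader_words = ["to", "be", "or", "not", "to", "be"]
writer_vocab = ["be", "to", "question"]
print(createProbDist(reader_words, writer_vocab))
# [0.3333333333333333, 0.3333333333333333, 0]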
Example 18
class BigramModel:
    def __init__(self, text):
        self.bigrams, self.string = generate_ngram(
            text, 2
        )  #Initialize list of bigrams by calling generate ngram method with n = 2
        self.bigram_probabilities = {
        }  #Dictionary to store bigram and corresponding probability
        self.vocab_size = 0
        self.unigram_dist = FreqDist()

    def calc_bigram_probability(self, string, bigrams):
        bigram_dist = FreqDist(
            bigrams)  #Create Bigram distribution using FreqDist() function
        unigrams, string = generate_ngram(string, 1)
        self.unigram_dist = FreqDist(
            unigrams)  #Create Unigram distribution using FreqDist() function
        self.vocab_size = len(set(
            unigrams))  #vocab size is the total number of distinct characters in the dataset
        for bigram in set(bigrams):
            bigram_count = bigram_dist.get(bigram)  #get count of bigram
            unigram_count = self.unigram_dist.get(
                bigram[:-1])  #get count of unigram
            probability = Fraction(
                bigram_count + 1, unigram_count +
                self.vocab_size)  #Calculate probability with add-one smoothing
            temp = {
                bigram[-1]: probability
            }  #Storing in temp dictionary - last char in bigram and probability
            if bigram[0] in self.bigram_probabilities:
                self.bigram_probabilities[bigram[0]].update(
                    temp
                )  #update main dictionary with temp- {bigram[0]: {bigram[1]:probability}}
            else:
                self.bigram_probabilities.setdefault(
                    bigram[0], {}
                )  #If encountered a character first time, add it to main dictionary as a dict object
                self.bigram_probabilities[bigram[0]].update(temp)
Example 19
    def calc_trigram_probability(self, string, trigrams):
        trigram_dist = FreqDist(
            trigrams)  #Create Trigram distribution using FreqDist() function
        bigrams, string = generate_ngram(string, 2)
        self.bigram_dist = FreqDist(
            bigrams)  #Create Bigram distribution using FreqDist() function
        unigrams, string = generate_ngram(string, 1)
        unigram_dist = FreqDist(
            unigrams
        )  #Create Unigram distribution using FreqDist() function to calculate vocab size of dataset.
        self.vocab_size = len(set(unigrams))

        for trigram in set(trigrams):
            trigram_count = trigram_dist.get(
                trigram)  #get count of occurrence of trigram
            bigram_count = self.bigram_dist.get(
                trigram[:-1])  #get count of occurrence of preceding bigram
            probability = Fraction(
                trigram_count + 1, bigram_count +
                self.vocab_size)  #Calculate probability with add-one smoothing
            temp = {trigram[2]: probability}  #Store in temp dictionary
            if trigram[0] in self.trigram_probabilities:
                if trigram[1] in self.trigram_probabilities.get(trigram[0]):
                    self.trigram_probabilities.get(
                        trigram[0])[trigram[1]].update(
                            temp)  #Storing in 3 level dictionary as -
                else:  #{trigram[0]:{trigram[1]:{trigram[2]:probability}}}
                    self.trigram_probabilities.get(trigram[0]).setdefault(
                        trigram[1], {})
                    self.trigram_probabilities.get(
                        trigram[0])[trigram[1]].update(temp)
            else:
                self.trigram_probabilities.setdefault(trigram[0], {})
                self.trigram_probabilities.get(trigram[0]).setdefault(
                    trigram[1], {})
                self.trigram_probabilities.get(
                    trigram[0])[trigram[1]].update(temp)
Example 20
print("2.", len(cess_esp.words()))
# 3
print("3.", len(cess_esp.sents()))
# 4
from nltk.probability import FreqDist

first_file = cess_esp.fileids()[0]
cess_freq0 = FreqDist(cess_esp.words(first_file))
print("4.", cess_freq0.most_common(20))
# 5
print("5.", [w for w, k in cess_freq0.most_common()])
# 6
print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2])
# 7
print("7.", [k for w, k in cess_freq0.most_common()])
print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0))
# 8
print("8. No de palabras que aparecen una sola vez:",
      len([w for w, k in cess_freq0.items() if k == 1]))
# 9
print("9. La palabra más frecuente es", cess_freq0.max())
# 10
from nltk.corpus import PlaintextCorpusReader

mycorpus = PlaintextCorpusReader("../res/", ".*")
# 11
print("11.")
for doc in mycorpus.fileids():
    print(doc, len(mycorpus.words(doc)), len(set(mycorpus.words(doc))),
          len(mycorpus.sents(doc)))
Example 21
print("% 5.2f" % durata + " milisecunde ")
print("------------------------------------------------------------")

start = timeit.timeit()
medie = sum(len(cuv) for cuv in vocabT2) / len(vocabT2)
print("Lungimea medie a cuvintelor este:")
print(medie)
end = timeit.timeit()
durata = 1000 * abs(end - start)
print("Procesarea a durat ", end=" ")
print("% 5.2f" % durata + " milisecunde ")
print("------------------------------------------------------------")

start = timeit.timeit()
fdist = FreqDist(text2)
apare1 = [cv for cv in fdist.keys() if fdist.get(cv) == 1]
print("Cuvintele care apar doar o dată:")
print(apare1)
end = timeit.timeit()
durata = 1000 * abs(end - start)
print("Procesarea a durat ", end=" ")
print("% 5.2f" % durata + " milisecunde ")
print("------------------------------------------------------------")

start = timeit.timeit()

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(text2)
print("Primele 10 colocații (pentru bigrame) sînt:")
print(finder.nbest(bigram_measures.pmi, 10))
end = timeit.timeit()
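
Note that `timeit.timeit()` called with no arguments benchmarks an empty statement and returns its running time; it is not a timestamp, so the `end - start` differences printed above do not actually measure the surrounding code. A sketch of the usual wall-clock pattern with `time.perf_counter()`, applied to the hapax computation above:

import time

start = time.perf_counter()
fdist = FreqDist(text2)
apare1 = [w for w in fdist if fdist[w] == 1]  # words that occur exactly once
duration_ms = 1000 * (time.perf_counter() - start)
print("Processing took %5.2f milliseconds" % duration_ms)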
Example 22
    for (token, is_gene) in goldstandard_words:
        dist = gene_substrings if is_gene else notgene_substrings
        dist.update(substrings(token))

    difference = defaultdict(float)
    size1 = float(gene_substrings.N())
    size2 = float(notgene_substrings.N())

    # difference is a hash table whose keys are substrings and whose
    # values are the differences in their relative frequencies.
    # Positive means: common in gene names; negative means: rare in
    # gene names.

    for substr in (gene_substrings + notgene_substrings).keys():
        v1 = gene_substrings.get(substr) or 0
        v2 = notgene_substrings.get(substr) or 0

        difference[substr] = v1 / size1 - v2 / size2

    r = range(-10, 10 + 1)
    A = array([array(r), zeros_like(r)])
    A.dtype = dtype('float32')

    # for (i, e) in enumerate(xrange(-10, 10 + 1)):
    #     if e != 0:
    #         epsilon = 0.2 / e
    #         print "epsilon = ", epsilon
    #         klassi = OrClassifier([
    #             DictionaryClassifier(given_genes, stopwords),
    #             DifferenceClassifier(difference),
                token_collection.append(token)
                root = get_root(token)
                if root != '' and root is not None:
                    lemmatised_sentences[len(lemmatised_sentences) -
                                         1] = lemmatised_sentences[
                                             len(lemmatised_sentences) -
                                             1].replace(token, root)
                    lemmatised_tokens.append(root)
                else:
                    lemmatised_tokens.append(token)

if len(token_collection) != 0:
    fdist = FreqDist(token_collection)
    for key in list(fdist.keys()):
        print('word: ', key)
        print(fdist.get(key))
        out_file.write(key + ',' + str(fdist.get(key)) + '\n')

if len(lemmatised_tokens) != 0:
    fdist = FreqDist(lemmatised_tokens)
    for key in list(fdist.keys()):
        print('word: ', key)
        print(fdist.get(key))
        out_lemma_file.write(key + ',' + str(fdist.get(key)) + '\n')

#td-idf - lemmas?
#instantiate CountVectorizer()
cv_words = CountVectorizer()
cv_lemmas = CountVectorizer()
# this steps generates word counts for the words in your docs
word_count_vector = cv_words.fit_transform(sentences)

text = nltk.Text(tokens)
dist = FreqDist(tokens)
top_ten = dist.most_common(20)

top_ten = pd.DataFrame(dist.most_common(10), columns = ['Word', 'Count'])
top_ten.plot.bar(x='Word', y='Count')


a = nltk.pos_tag([a[0] for a in dist.most_common(1000)])

word = []
for k, pos in a:
    if pos not in  ['NN', 'IN', 'CD', 'JJ', 'JJS', 'VBN', 'VB', 'VBD']:
        word.append((k, dist.get(k)))
    
c = 0
for k,v in dist.most_common(20):
    
    print(k, "--->", v)
    c += 1
    if c == 5:
        break






all_chunks = []
Example 25
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # getting a train corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # getting number of sentences

tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())   # frequency of each tag

firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of the first tag of each sentence
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words)-1):
    TagPair.append((words[i][1], words[i+1][1]))

TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
BiwMLE = MLEProbDist(TagWordfdist)
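
The distributions above correspond to the usual HMM parameters estimated from the tagged training file: `A0j` is the initial-tag distribution, `Aij` the tag-transition distribution, and `Biw` the emission distribution, each also wrapped in Laplace, Good-Turing, and MLE variants. A small sketch of querying them (assuming the tags and the word below actually occur in `de-train.tt`):

# Illustrative lookups; every ProbDist exposes .prob(sample).
print(A0j.prob("NOUN"))           # estimated P(first tag of a sentence = NOUN)
print(Aij.prob(("DET", "NOUN")))  # estimated P(next tag = NOUN | current tag = DET)
print(Biw.prob(("der", "DET")))   # estimated P(word = "der" | tag = DET)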
def main():
    parser = argparse.ArgumentParser(description='Categorize (nouns, verbs, etc.) all words in a text')

    parser.add_argument('file', type=argparse.FileType('rU'),
                        help='text file to categorize')

    parser.add_argument('--include-stopwords', action='store_true', default=DEFAULT_STOPWORDS,
                        help='include very common words in results (default: {})'
                        .format(DEFAULT_STOPWORDS))

    parser.add_argument('--sort', type=str, default=DEFAULT_SORT,
                        choices=[SORT_ALPHA, SORT_OCCURRENCES, SORT_LENGTH, SORT_CLASS],
                        help='''how to sort output; by word alphabetically,
                        number of occurrences, word length, or word class (default: {})'''
                        .format(DEFAULT_SORT))

    args = parser.parse_args()

    if not args.file:
        sys.exit('You must specify a text file to categorize')

    # Download required nltk data if needed.
    nltk.download('maxent_treebank_pos_tagger')
    if not args.include_stopwords:
        nltk.download('stopwords')

    # Create tokenized lists of words from the text.
    text = args.file.read()
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Only keep words consisting entirely of letters (i.e. remove punctuation,
    # numbers, etc.)
    tokens = [word for word in tokens if word.isalpha()]

    # Save number of occurrences for each word for later reference, which could
    # as well be through a collections.Counter but FreqDist will cover more
    # cases if expanded on later.
    fdist = FreqDist(tokens)

    # Remove stopwords, i.e. very common ones.
    tokens = set(tokens)
    if not args.include_stopwords:
        tokens -= STOPWORDS

    print('Found {} unique words'.format(len(tokens)))

    # Make a list of (word, occurrences, length, word_class) tuples for final
    # output. `word` is the word itself, `occurrences` the number of times it
    # occurs in its source text, `length` the length of the word, and
    # `word_class` what type of word it is, e.g. noun, verb, etc.
    matches = [(word, fdist.get(word), len(word), word_class(word)) for word in tokens]
    headers = ['word', 'occurrences', 'length', 'word_class']

    # Sort output accordingly.
    if args.sort == SORT_ALPHA:
        matches = sorted(matches, key=itemgetter(0))
    elif args.sort == SORT_OCCURRENCES:
        matches = sorted(matches, key=itemgetter(1, 0), reverse=True)
    elif args.sort == SORT_LENGTH:
        matches = sorted(matches, key=itemgetter(2, 0), reverse=True)
    elif args.sort == SORT_CLASS:
        matches = sorted(matches, key=itemgetter(3, 0))

    # Finally, print results in a nice table.
    print(tabulate(matches, headers=headers))
Example 27
    for (token, is_gene) in goldstandard_words:
        dist = gene_substrings if is_gene else notgene_substrings
        dist.update(substrings(token))

    difference = defaultdict(float)
    size1 = float(gene_substrings.N())
    size2 = float(notgene_substrings.N())

    # difference is a hash table whose keys are substrings and whose
    # values are the differences in their relative frequencies.
    # Positive means: common in gene names; negative means: rare in
    # gene names.

    for substr in (gene_substrings + notgene_substrings).keys():
        v1 = gene_substrings.get(substr) or 0
        v2 = notgene_substrings.get(substr) or 0

        difference[substr] = v1 / size1 - v2 / size2

    r = range(-10, 10 + 1)
    A = array([array(r), zeros_like(r)])
    A.dtype = dtype('float32')

    # for (i, e) in enumerate(xrange(-10, 10 + 1)):
    #     if e != 0:
    #         epsilon = 0.2 / e
    #         print "epsilon = ", epsilon
    #         klassi = OrClassifier([
    #             DictionaryClassifier(given_genes, stopwords),
    #             DifferenceClassifier(difference),
Example 28
fdist_wla = FreqDist(would_like_abl_words)
fdist_wla.plot(30,cumulative=False)
plt.show()
plt.tight_layout()

#Plot would like to be able to gram
fdist_wlag = FreqDist(would_like_abl_gram)
fdist_wlag.plot(30,cumulative=False)
plt.show()
plt.tight_layout()
'''

# Keep only words with frequency greater than 4 that are not among the 5 most frequent words
escalation_comment_high_freq = []
high_freq_words = [
    key for key in fdist.keys() if fdist.get(key) > 4
    and key not in [item[0] for item in list(fdist.most_common(5))]
]
for s in escalation_comment_filtered:
    comment = []
    for w in s:
        if w in high_freq_words:
            comment.append(w)
    escalation_comment_high_freq.append(comment)
escalation_comment_filtered = escalation_comment_high_freq

############################################### Topic Modelling ##############################

# Create Dictionary
id2word = corpora.Dictionary(escalation_comment_filtered)
# Create Corpus
Example 29
 def frequency(self, word):
     text1 = self.txt
     freq = FreqDist(text1)
     return freq.get(word, 0)
def main():
    parser = argparse.ArgumentParser(
        description="Categorize (nouns, verbs, etc.) all words in a text")

    parser.add_argument("file",
                        type=argparse.FileType("rU"),
                        help="text file to categorize")

    parser.add_argument(
        "--include-stopwords",
        action="store_true",
        default=DEFAULT_STOPWORDS,
        help="include very common words in results (default: {})".format(
            DEFAULT_STOPWORDS),
    )

    parser.add_argument(
        "--sort",
        type=str,
        default=DEFAULT_SORT,
        choices=[SORT_ALPHA, SORT_OCCURRENCES, SORT_LENGTH, SORT_CLASS],
        help="""how to sort output; by word alphabetically,
                        number of occurrences, word length, or word class (default: {})"""
        .format(DEFAULT_SORT),
    )

    args = parser.parse_args()

    if not args.file:
        sys.exit("You must specify a text file to categorize")

    # Download required nltk data if needed.
    nltk.download("maxent_treebank_pos_tagger")
    if not args.include_stopwords:
        nltk.download("stopwords")

    # Create tokenized lists of words from the text.
    text = args.file.read()
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Only keep words consisting entirely of letters (i.e. remove punctuation,
    # numbers, etc.)
    tokens = [word for word in tokens if word.isalpha()]

    # Save number of occurrences for each word for later reference, which could
    # as well be through a collections.Counter but FreqDist will cover more
    # cases if expanded on later.
    fdist = FreqDist(tokens)

    # Remove stopwords, i.e. very common ones.
    tokens = set(tokens)
    if not args.include_stopwords:
        tokens -= STOPWORDS

    print("Found {} unique words".format(len(tokens)))

    # Make a list of (word, occurrences, length, word_class) tuples for final
    # output. `word` is the word itself, `occurrences` the number of times it
    # occurs in its source text, `length` the length of the word, and
    # `word_class` what type of word it is, e.g. noun, verb, etc.
    matches = [(word, fdist.get(word), len(word), word_class(word))
               for word in tokens]
    headers = ["word", "occurrences", "length", "word_class"]

    # Sort output accordingly.
    if args.sort == SORT_ALPHA:
        matches = sorted(matches, key=itemgetter(0))
    elif args.sort == SORT_OCCURRENCES:
        matches = sorted(matches, key=itemgetter(1, 0), reverse=True)
    elif args.sort == SORT_LENGTH:
        matches = sorted(matches, key=itemgetter(2, 0), reverse=True)
    elif args.sort == SORT_CLASS:
        matches = sorted(matches, key=itemgetter(3, 0))

    # Finally, print results in a nice table.
    print(tabulate(matches, headers=headers))
Example 31
word_pair = [' '.join([pair[0], pair[1]]) for pair in bigrams(corpus_tokens)]
pair_fdist = FreqDist(word_pair)


pmi = {}
tokens_num = len(corpus_tokens)

for pair in word_pair:
    # Get the freq of the pair
    w1w2_freq = pair_fdist.get(pair)
    
    # Only consider bigrams that occur with frequency above that threshold
    if w1w2_freq :
        pair_split = pair.split(' ')
        
        # Get the freq of each of the words pair
        w1_freq = fdist.get(pair_split[0])
        w2_freq = fdist.get(pair_split[1])
        
        # Compute the unigram probabilities in the corpus
        p_w1 = w1_freq / tokens_num
        p_w2 = w2_freq / tokens_num
        
        # Compute the bigram probability
        p_w1w2 = w1w2_freq / w1_freq
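
The excerpt ends right after computing `p_w1w2 = count(w1, w2) / count(w1)`, i.e. P(w2 | w1). The original continuation is not shown; a hedged sketch of how the PMI value is typically completed, inside the same loop, from the quantities already in scope:

import math

# PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) ), which with the
# conditional form above reduces to log2( P(w2 | w1) / P(w2) ).
pmi[pair] = math.log2(p_w1w2 / p_w2)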