Example #1
def unigramtrainingset(a):
    # create frequency distribution of word, tag pairs in the training set
    fd = FreqDist(a)
    # separate words from tags
    x = [y[0] for y in a]
    # create frequency distribution of words in the training set
    fd2 = FreqDist(x)
    # create list of unique words
    words = unique_list([x[0] for x in fd])
    # create list of unique tags (all possible tags)
    tags = [
        'NOUN', 'ADP', 'ADV', 'NUM', 'VERB', '.', 'PRON', 'DET', 'ADJ', 'PRT',
        'CONJ'
    ]
    # initialise output list
    out = []

    # loop through each unique word
    for word in words:
        # reinitialise tagso list
        tagso = []
        # store frequency of current word
        denom = fd2.freq(word)
        # loop through each tag
        for tag in tags:
            # compute probability of current tag being paired with current word
            prob = fd.freq((word, tag)) / denom
            # create list of tag, probability pairs
            tagso.append((tag, prob))
        # append word, tag-probabilities to out list
        out.append((word, tagso))
    return out
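A minimal usage sketch for the function above (not part of the original example), assuming the `unique_list` helper simply deduplicates a list and that the Brown corpus with the universal tagset stands in for the training data:

from nltk import FreqDist
from nltk.corpus import brown

def unique_list(items):
    # assumed helper: deduplicate while preserving order
    return list(dict.fromkeys(items))

# (word, tag) pairs with the universal tagset, as the function expects
training = brown.tagged_words(tagset='universal')[:5000]
model = unigramtrainingset(training)
word, tag_probs = model[0]
print(word, max(tag_probs, key=lambda p: p[1]))  # most probable tag for the first word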
Example #2
def load_book_features(file_name):
    with open(file_name, 'r') as file_handler:
        text = file_handler.read()

    morph = pymorphy2.MorphAnalyzer()

    sentence_list = sent_tokenize(text)

    usual_book_words = []
    sentences_length_dist = []
    words_length_dist = []
    pron_dist = []
    conj_dist = []

    for sentence in sentence_list:
        if sentence != ".":
            pron_count = 0
            conj_count = 0
            sentence_words = re.findall(r"[\w]+", sentence)
            sentences_length_dist.append(len(sentence_words))

            for word in sentence_words:
                words_length_dist.append(len(word))
                if word in NOMINATIVE_PRONOUNS:
                    pron_count += 1
                if morph.parse(word)[0].tag.POS == 'CONJ':
                    conj_count += 1
                if word not in STOPWORDS:
                    usual_book_words.append(word)

            conj_dist.append(conj_count)
            pron_dist.append(pron_count)

    sentence_length_freq_dist = FreqDist(sentences_length_dist)
    sentences_length_dist = [sentence_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    sentences_length_dist.append(1 - sum(sentences_length_dist))

    words_length_freq_dist = FreqDist(words_length_dist)
    words_length_dist = [words_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    words_length_dist.append(1 - sum(words_length_dist))

    pron_freq_dist = FreqDist(pron_dist)
    pron_dist = [pron_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    pron_dist.append(1 - sum(pron_dist))

    conj_freq_dist = FreqDist(conj_dist)
    conj_dist = [conj_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    conj_dist.append(1 - sum(conj_dist))

    words_freq_dist = FreqDist(usual_book_words)

    num_unique_words = len(words_freq_dist.keys())
    num_total_words = len(usual_book_words)

    hapax = len(words_freq_dist.hapaxes()) / num_unique_words
    dis = len([item for item in words_freq_dist if words_freq_dist[item] == 2]) / num_unique_words
    richness = num_unique_words / num_total_words

    return [hapax, dis, richness, *sentences_length_dist, *words_length_dist, *pron_dist, *conj_dist]
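The function above relies on several module-level names that are not shown; one illustrative set of definitions (placeholder values, not the original project's) might be:

import re
import pymorphy2
from nltk import FreqDist
from nltk.tokenize import sent_tokenize

RANGE = 40  # assumed number of buckets for the length distributions
NOMINATIVE_PRONOUNS = {'я', 'ты', 'он', 'она', 'оно', 'мы', 'вы', 'они'}
STOPWORDS = {'и', 'в', 'не', 'на', 'что'}  # placeholder; a real stopword list would be longer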
Example #3
def paper_title_NLP(title_corpus):

    # title_corpus is a list of tuples
    # keys like (19, 1) mean 2019/01
    # each value is a list of tokenized paper titles
    # reference: https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    title_dict = {}
    pattern = r'''(?x)            # set flag to allow verbose regexps
            (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)*        # words with optional internal hyphens
            | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
            | \.\.\.              # ellipsis
            | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
            '''
    tokenizer = RegexpTokenizer(pattern)
    for t in title_corpus:
        key = (t[3], t[4])
        if key in title_dict:
            filterdText = tokenizer.tokenize(t[1])
            title_dict[key].append(filterdText)
        else:
            title_dict[key] = []
            filterdText = tokenizer.tokenize(t[1])
            title_dict[key].append(filterdText)

    # extract keywords with year span
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0], )  # year index
        if key in title_years.keys():
            title_years[key].append(v)
        else:
            title_years[key] = []
            title_years[key].append(v)

    deep_freq = []
    for k, v in title_years.items():
        fd = FreqDist()
        vs = [item for sublist in v for item in sublist]
        for v_ in vs:
            for word in v_:
                fd[word] += 1

        print('The keywords for year:20{}'.format(str(k[0])))
        print("Total number of words:{}".format(str(
            fd.N())))  # total number of samples
        print("Total number of unique words:{}".format(str(
            fd.B())))  # number of bins or unique samples
        fd.pprint(50)  # The maximum number of items to display, default is 10
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))
        print(deep_freq)

    plt.plot([2012, 2013, 2014, 2015, 2016, 2017, 2018], deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()
Example #4
def vectorize_string_tfidf(doc, idf):
	words = word_tokenize(doc)
	words = [word.lower() for word in words]
	words = [word for word in words if word not in stops]
	fdist = FreqDist(words)
	
	freqs = []
	# to address sparsity issues: currently uses dictionaries 
	for word in set(words):
		try: freqs += [(word, fdist.freq(word) / idf[word])]
		except KeyError: freqs += [(word, fdist.freq(word))]
	return dict(freqs)
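A hedged sketch of how the `idf` mapping consumed above might be built; `stops` and `docs` are assumed names, and here `idf[word]` simply stores the number of documents containing the word, so dividing by it (as the function does) down-weights common terms:

from nltk import word_tokenize

def build_idf(docs):
    # illustrative helper: document-frequency counts keyed by lowercased token
    idf = {}
    for doc in docs:
        tokens = {w.lower() for w in word_tokenize(doc)} - stops
        for w in tokens:
            idf[w] = idf.get(w, 0) + 1
    return idf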
Example #5
    def run(self):
        swear_words = set(utils.stem_words(corpus.swear_words()))

        with self.input().open('rb') as in_file:
            songs = pickle.load(in_file)

        normalized_word_frequencies = {}

        for song in songs:
            dist = FreqDist(song['word_tokens'])

            for sw in swear_words:
                if not sw in normalized_word_frequencies:
                    normalized_word_frequencies[sw] = 0

                normalized_word_frequencies[sw] += dist.freq(sw)

        for w, v in normalized_word_frequencies.items():
            normalized_word_frequencies[w] = v / len(songs)

        df = pd.DataFrame.from_dict(normalized_word_frequencies,
                                    orient='index')
        title = 'Swear Word Frequency\n%s' % (self.artist)
        word_freq = df.nlargest(5, 0)[0:5].plot(kind='bar',
                                                title=title,
                                                legend=False)
        word_freq.set_xlabel("Swear Word")
        word_freq.set_ylabel("Distribution")

        with self.output().open('wb') as out_file:
            word_freq.get_figure().savefig(out_file, dpi='figure')

        clear_plots()
Example #6
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg) 

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w
    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]

        if topic in gettysburg_tokens:
            topics.append(topic)
    
        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics
Example #7
    def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
              pos_vocab, synset_vocab, stemmer):
        d = Document()
        assert language == self.lang

        if self._id:
            d.id = self._id
        else:
            d.id = num

        d.language = language
        d.title = self.title.strip()
        num_sentences = max(self._sentences) + 1

        tf_token = FreqDist()
        for ii in self.tokens():
            tf_token.inc(ii)

        for ii in xrange(num_sentences):
            s = d.sentences.add()
            for jj in self._sentences[ii]:
                w = s.words.add()
                w.token = token_vocab[jj.word]
                w.lemma = lemma_vocab[jj.lemma]
                w.pos = pos_vocab[jj.pos]
                w.relation = pos_vocab[jj.rel]
                w.parent = jj.parent
                w.offset = jj.offset
                w.tfidf = token_df.compute_tfidf(jj.word,
                                                 tf_token.freq(jj.word))
        return d
Example #8
def max_dist(emoList):
    x = {}
    for e in emoList:
        fd = FreqDist(emoList[e])
        m = fd.max()
        x[m] = fd.freq(m)
    return max(x, key=lambda k: x[k])
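A small illustration of the function above, assuming `emoList` maps emotion labels to lists of observed tokens (the data below is made up):

from nltk import FreqDist

emoList = {
    'joy': ['smile', 'smile', 'laugh'],
    'anger': ['frown', 'shout', 'shout', 'shout'],
}
print(max_dist(emoList))  # 'shout': its peak relative frequency (0.75) is the highest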
Example #9
def vectorize_string(doc):
	words = word_tokenize(doc)
	words = [word.lower() for word in words]
	words = [word for word in words if word not in stops]
	fdist = FreqDist(words)
		
	# to address sparsity issues: currently uses dictionaries 
	freqs = [(word, fdist.freq(word)) for word in set(words)]
	return dict(freqs)
class Models:

    #Constructor
    def __init__(self, corpura):

        corpus = udhr.raw(corpura)

        self.TrainingSet = corpus[0:1000]
        token = list(self.TrainingSet)

        self.Uni = token
        self.Bi = list(nltk.bigrams(token))
        self.Tri = list(nltk.trigrams(token))

        self.UniFreq = FreqDist(self.Uni)
        self.BiFreq = ConditionalFreqDist(self.Bi)
        self.TriFreq = ConditionalFreqDist(
            list(((w1, w2), w3) for w1, w2, w3 in self.Tri))

    #method to calculate Unigrams
    def CalUni(self, Words):
        Words = Words.strip().lower()
        Character = list(Words)

        i = 1
        for a in Character:
            i *= self.UniFreq.freq(a)

        return i

    #method to calculate Bigrams
    def CalBi(self, Words):
        Words = Words.strip().lower()
        Character = list(Words)

        i = 1
        for a, b in enumerate(Character):
            if a == 0:
                continue

            i *= self.BiFreq[Character[a - 1]].freq(b)

        return i

    #method to calculate Trigrams
    def CalTri(self, Words):
        Words = Words.strip().lower()
        Character = list(Words)

        i = 1
        for a, b in enumerate(Character):
            if a <= 1:
                continue
            i *= self.TriFreq[(Character[a - 2], Character[a - 1])].freq(b)

        return i
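A brief usage sketch for the character-level models above; 'English-Latin1' is one of the UDHR file ids shipped with NLTK, and the products below shrink quickly for longer strings:

import nltk
from nltk import FreqDist, ConditionalFreqDist
from nltk.corpus import udhr

m = Models('English-Latin1')
print(m.CalUni('the'))  # product of character unigram probabilities
print(m.CalBi('the'))   # product of character bigram conditional probabilities
print(m.CalTri('the'))  # product of character trigram conditional probabilities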
Example #11
    def doc_tfidf(self, doc: str) -> Dict[Tuple[str, int], float]:
        """Given a document, create a dictionary representation of its tfidf vector

        doc -- raw string of the document"""

        counts = FreqDist(self.tokenize(doc))
        d = {}
        for ii in self._tokenizer(doc):
            ww = self.vocab_lookup(ii)
            d[(ww, ii)] = counts.freq(ww) * self.inv_docfreq(ww)
        return d
Example #12
def extract_ngrams(text,
                   low=1,
                   high=2,
                   lowercase=False,
                   filter_punctuation=True,
                   binary=False,
                   least_common=None,
                   most_common=None,
                   normalize=False,
                   sample=False):
    #text = ' '.join(review.paragraphs)
    tokens = None

    # Make lowercase
    if lowercase:
        tokens = word_tokenize(text.lower())
    else:
        tokens = word_tokenize(text)

    # Remove Punctuation
    if filter_punctuation:
        words = [t for t in tokens if t not in PUNCTUATION]
    else:
        words = [t for t in tokens]

    # Do the N Gram Thing
    ngram_counts = {}
    assert not (
        sample and binary
    ), "Please don't make sample and binary True. One or the other or neither pls"
    for n in range(low, high + 1):
        ngram_freqdist = FreqDist(ngrams(words, n))
        grams_to_consider = ngram_freqdist
        if least_common:
            assert least_common > 0.0 and least_common <= 1.0, \
                    'Least common must be a proportion, not %.3f' % least_common
            num_least_common = int(least_common * ngram_freqdist.N())
            grams_to_consider = []
            for bleh in ngram_freqdist.most_common()[-1 * num_least_common:]:
                gram, count = bleh
                grams_to_consider.append(gram)
        for gram in grams_to_consider:
            if sample:
                ngram_counts[gram] = ngram_freqdist.freq(gram)
            elif binary:
                ngram_counts[gram] = True
            else:
                ngram_counts[gram] = ngram_freqdist[gram]
    if normalize:
        total_counts = sum(count for ngram, count in ngram_counts.items())
        for gram, count in ngram_counts.items():
            ngram_counts[gram] = count / total_counts
    return ngram_counts
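A quick sketch of calling the extractor above, assuming `PUNCTUATION` is a set of punctuation strings (illustrative value shown):

from nltk import word_tokenize, FreqDist
from nltk.util import ngrams

PUNCTUATION = set('.,;:!?()[]')
counts = extract_ngrams("The cat sat on the mat.", low=1, high=2,
                        lowercase=True, normalize=True)
print(counts[('the',)])        # normalized unigram weight for 'the'
print(counts[('the', 'cat')])  # normalized bigram weight for ('the', 'cat')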
Example #13
def model_select(sentences, seq, vocab_size):
    if len(sentences) > 50:
        fd = FreqDist(seq)
        frequencies = np.fromiter((fd.freq(i) for i in range(vocab_size)),
                                  dtype=np.float64)
        emission_prob = np.stack([frequencies] * 8)

        model = hmm.MultinomialHMM(n_components=8, init_params='st')
        model.emissionprob_ = emission_prob
    else:
        model = hmm.MultinomialHMM(n_components=8, init_params='ste')

    return model
Example #14
def calc_metrics_for_one_sample(id_):
    # Identify the file with the relevant counts
    rfh = open(COUNTS_DIR + "counts-" + id_, "r")

    # Adapted from the BitCounter's method get_freq_dists
    d_freq_dist = FreqDist()
    r_freq_dist = FreqDist()
    # Skip header
    rfh.readline()
    line = rfh.readline()
    while line.strip():
        party, phrase, count = line.strip().split("|")
        assert party in ("D", "R")
        count = int(count)
        if party == "D":
            d_freq_dist[phrase] += count
        else:
            r_freq_dist[phrase] += count
        line = rfh.readline()
    vocab = list(set(d_freq_dist.keys()).union(set(r_freq_dist.keys())))
    # L1 smoothing
    for phrase in vocab:
        d_freq_dist[phrase] += 1
        r_freq_dist[phrase] += 1

    # Adapted from the BitCounter's method get_signal
    # N.B. If denom == "q" is passed to get_signal, get_signal *should* be
    # redundant to get_log_odds with a change of base
    signal = lambda pi, qi: math.log(pi / qi, 2)
    signals = []
    for phrase in vocab:
        ds = signal(d_freq_dist.freq(phrase), r_freq_dist.freq(phrase))
        rs = signal(r_freq_dist.freq(phrase), d_freq_dist.freq(phrase))
        signals.append((phrase, ds, rs))
    df = pd.DataFrame(signals)
    df.columns = ["term", "dmetric", "rmetric"]
    df.set_index("term", inplace=True)
    df.to_pickle(METRICS_DIR + "signals-unigrams-" + id_)
Example #15
	def parse(self, response):
		"""
		The lines below are a spider contract. For more info see:
		http://doc.scrapy.org/en/latest/topics/contracts.html
		
		@url https://www.google.com/search?q=personal+nutrition
		@scrapes pages to depth<=3, using priority-score based BFS
		"""
		
		doc = clean_html(response.body_as_unicode())
		words = word_tokenize(doc)
		words = [word.lower() for word in words]
		words = [word for word in words if word not in self.stops]
		fdist = FreqDist(words)
		
		for word in set(words):
			if (fdist.freq(word) * fdist.N()) > 1:
				item = WordCount()
				item['word'] = word
				item['count'] = int(fdist.freq(word) * fdist.N())
				yield item 
		#for href in response.css("a::attr('href')"):
		#	url = response.urljoin(href.extract())
		#	yield scrapy.Request(url, callback=self.parse)
Example #16
    def parse(self, response):
        """
		The lines below are a spider contract. For more info see:
		http://doc.scrapy.org/en/latest/topics/contracts.html
		
		@url https://www.google.com/search?q=personal+nutrition
		@scrapes pages to depth<=3, using priority-score based BFS
		"""

        doc = clean_html(response.body_as_unicode())
        words = word_tokenize(doc)
        words = [word.lower() for word in words]
        words = [word for word in words if word not in self.stops]
        fdist = FreqDist(words)

        for word in set(words):
            if (fdist.freq(word) * fdist.N()) > 1:
                item = WordCount()
                item['word'] = word
                item['count'] = int(fdist.freq(word) * fdist.N())
                yield item
        #for href in response.css("a::attr('href')"):
        #	url = response.urljoin(href.extract())
        #	yield scrapy.Request(url, callback=self.parse)
Example #17
def bigramTags(a):
    #create list of all tags
    tags = [x[1] for x in a]
    #create list of tag bigrams
    btags = [(tags[i], tags[i + 1]) for i in range(len(tags) - 1)]
    #create frequency distribution of bigram tags
    btagsf = FreqDist(btags)
    #create list of unique bigram tags
    btagscombo = [(x, y) for x in unique_list(tags) for y in unique_list(tags)]
    out = []
    #loop through unique bigram tags
    for i in range(len(btagscombo)):
        #add bigram tag with its frequency probability to the list
        out.append((btagscombo[i], btagsf.freq(btagscombo[i])))
    return out
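A short usage sketch, assuming the same order-preserving `unique_list` helper as in Example #1 and a universal-tagset corpus:

from nltk import FreqDist
from nltk.corpus import brown

tagged = brown.tagged_words(tagset='universal')[:5000]
transitions = bigramTags(tagged)
# five most probable tag-to-tag transitions in the sample
print(sorted(transitions, key=lambda t: t[1], reverse=True)[:5])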
Example #18
class LangModel:
    def __init__(self, file):
        corpus = udhr.raw(file)
        self.training_set = corpus[0:1000]
        token = list(self.training_set)
        self.unigram = token
        self.bigram = list(nltk.bigrams(token))
        self.trigram = list(nltk.trigrams(token))
        self.unigram_frequency = FreqDist(self.unigram)
        self.bigram_frequency = ConditionalFreqDist(self.bigram)
        self.trigam_frequency = ConditionalFreqDist(
            list(((x, y), z) for x, y, z in self.trigram))

#Creating a function cal_unigram for calculating the probability of each character in the Unigram model

    def cal_unigram(self, words):
        words = words.strip().lower()
        character = list(words)
        p = 1
        for n in character:
            p = p * self.unigram_frequency.freq(n)
        return p

#Creating a function cal_bigram for calculating the probability of each character in Bigram model

    def cal_bigram(self, words):
        words = words.strip().lower()
        character = list(words)
        p = 1
        for m, n in enumerate(character):
            if m == 0:
                continue
            p = p * self.bigram_frequency[character[m - 1]].freq(n)
        return p

#Creating a function cal_trigram for calculating the probability of each character in Trigram model

    def cal_trigram(self, words):
        words = words.strip().lower()
        character = list(words)
        p = 1
        for m, n in enumerate(character):
            if m <= 1:
                continue
            p = p * self.trigam_frequency[(character[m - 2],
                                           character[m - 1])].freq(n)
        return p
Example #19
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)

    occurences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc): occurences[x] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()
    glob_features = [{}]*doc_num


    for i in range(0, doc_num):
        doc_features = [0]*len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])

        for (tok,num) in doc_freqs.items():
            max_doc_freq = doc_freqs.freq(doc_freqs.max())*float(doc_len)

            # augmented
            #tf = 0.5 + (0.5*float(num)) / float(max_doc_freq)
            tf = 1+math.log(num,10)
            idf = math.log( float(doc_num) / (float(occurences[tok])) ,10)
            tfidf = tf*idf

            indx = tokens.index(tok)
            doc_features[indx] = tfidf

        f_tmp = numpy.asarray(doc_features)
        f_tmp = f_tmp/(numpy.linalg.norm(f_tmp)+numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()

    glob_features = numpy.asarray(glob_features)*glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()

    return (glob_features,tokens)
Example #20
class NumTranslationsFeatureExtractor(FeatureExtractor):

    # .f2e file
    def __init__(self, lex_prob_file, corpus_file):
        self.lex_prob = defaultdict(list)
        for line in open(lex_prob_file):
            chunks = line[:-1].split()
            self.lex_prob[chunks[1]].append(float(chunks[2]))
        corpus = TextCorpus(input=corpus_file)
        self.corpus_freq = FreqDist(
            [word for line in corpus.get_texts() for word in line])
        self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]

    def get_features(self, context_obj):
        if 'source_token' not in context_obj or len(
                context_obj['source_token']) == 0:
            return [0.0 for i in range(len(self.thresholds) * 2)]

        translations, translations_weighted = [], []
        for thr in self.thresholds:
            all_words, all_words_weighted = [], []
            for word in context_obj['source_token']:
                trans = [fl for fl in self.lex_prob[word] if fl >= thr]
                all_words.append(len(trans))
                all_words_weighted.append(
                    len(trans) * self.corpus_freq.freq(word))
            translations.append(np.average(all_words))
            translations_weighted.append(np.average(all_words_weighted))
        return translations + translations_weighted

    def get_feature_names(self):
        return [
            'source_translations_001_freq', 'source_translations_005_freq',
            'source_translations_01_freq', 'source_translations_02_freq',
            'source_translations_05_freq',
            'source_translations_001_freq_weighted',
            'source_translations_005_freq_weighted',
            'source_translations_01_freq_weighted',
            'source_translations_02_freq_weighted',
            'source_translations_05_freq_weighted'
        ]
    def compute_features(self, s, count):

        # preprocess
        tok_sent = nltk.tokenize.word_tokenize(s)
        stop_tok_sent = [x for x in tok_sent if x not in cachedStopWords]

        # location features
        P = 1.0/count
        F5 = 1 if count <=5 else 0
        LEN = len(stop_tok_sent)/30.0

        # language modelling
        LM = LModel.score(s)

        # pos tagging features
        tag_fd = FreqDist(map_tag("en-ptb", "universal",tag) if map_tag("en-ptb", "universal",tag) not in cachedStopPOStags else "OTHER" for (word, tag) in pos_tagger(tok_sent))
        NN = tag_fd.freq("NOUN")
        VB = tag_fd.freq("VERB")

        # headline-sentence similarity
        VS1 = 1 - spatial.distance.cosine(self.hl_vsv_1.toarray(), self.father.cv.transform([s]).toarray())
        TFIDF = 1 - spatial.distance.cosine(self.hl_tfidf.toarray(), self.father.tv.transform([s]).toarray())

        # topic description-sentence similarity
        CT = 1 - spatial.distance.cosine(self.father.desc_vsv.toarray(), self.father.cv.transform([s]).toarray())
        Q = 1 - spatial.distance.cosine(self.father.title_vsv.toarray(), self.father.cv.transform([s]).toarray())

        # security checks
        if math.isnan(VS1):
            VS1 = 0
            print self.father.code, self.id
        if math.isnan(CT):
            CT = 0
            print self.father.code, self.id
        if math.isnan(Q):
            Q = 0
            print self.father.code, self.id

        # active features
        return np.asarray([P, F5, LEN, LM, VS1, TFIDF, VB, NN, CT, Q])
Example #22
    def getdict(self, content):
        wnl = nltk.WordNetLemmatizer()
        begin = clock()
        print('begin')
        tokens = nltk.word_tokenize(content)
        wordlist = nltk.corpus.words.words()
        stopwords = nltk.corpus.stopwords.words('english')
        fdist = FreqDist(wnl.lemmatize(wnl.lemmatize(wnl.lemmatize(word.lower(), 'a')), 'v') for word in tokens if word.isalpha() and word not in stopwords)
        print(clock() - begin)
        js = {'samples': fdist.B(), 'outcomes': fdist.N()}
        wdict = {}
        count = 1
        begin = clock()
        for w in fdist.most_common():
            d = {'index': count, 'word': w[0], 'count': w[1], 'freq': round(fdist.freq(w[0]), 4)}
            d['basic'] = self.getexp(w[0])
            wdict[w[0]] = d
            count = count + 1
        print(clock() - begin)
        wdict = sorted(wdict.items(), key=lambda t: t[1]['index'])
        js['words'] = wdict
        return js
class NumTranslationsFeatureExtractor(FeatureExtractor):

    # .f2e file
    def __init__(self, lex_prob_file, corpus_file):
        self.lex_prob = defaultdict(list)
        for line in open(lex_prob_file):
            chunks = line[:-1].split()
            self.lex_prob[chunks[1]].append(float(chunks[2]))
        corpus = TextCorpus(input=corpus_file)
        self.corpus_freq = FreqDist([word for line in corpus.get_texts() for word in line])
        self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]

    def get_features(self, context_obj):
        if 'source_token' not in context_obj or len(context_obj['source_token']) == 0:
            return [0.0 for i in range(len(self.thresholds)*2)]

        translations, translations_weighted = [], []
        for thr in self.thresholds:
            all_words, all_words_weighted = [], []
            for word in context_obj['source_token']:
                trans = [fl for fl in self.lex_prob[word] if fl >= thr]
                all_words.append(len(trans))
                all_words_weighted.append(len(trans)*self.corpus_freq.freq(word))
            translations.append(np.average(all_words))
            translations_weighted.append(np.average(all_words_weighted))
        return translations + translations_weighted

    def get_feature_names(self):
        return ['source_translations_001_freq',
                'source_translations_005_freq',
                'source_translations_01_freq',
                'source_translations_02_freq',
                'source_translations_05_freq',
                'source_translations_001_freq_weighted',
                'source_translations_005_freq_weighted',
                'source_translations_01_freq_weighted',
                'source_translations_02_freq_weighted',
                'source_translations_05_freq_weighted']
Example #24
class TextCollection:
    """
    A collection of words that supports various Python operations.
    This is constructed by passing in an iterable of words.

    >>> tc = TextCollection(['hello', 'world'])
    >>> 'hello' in tc
    True
    >>> tc.freq('world')
    0.5
    """
    def __init__(self, words):
        words = normalize(words)
        self.words = list(words)
        self.lexicon = set(self.words)
        self.fdist = FreqDist(self.words)

    def __contains__(self, word):
        return word in self.lexicon

    def __iter__(self):
        return iter(self.words)

    def __len__(self):
        return len(self.words)

    def words(self):
        return iter(self.words)

    def count(self, word):
        return self.fdist[word]

    def freq(self, word):
        return self.fdist.freq(word)

    def wordcounts(self):
        return self.fdist.items()
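The class above depends on a `normalize` helper that is not shown; a minimal stand-in, assuming it only lowercases the input, might look like this:

def normalize(words):
    # assumed stand-in; the real helper may also strip punctuation, stem, etc.
    return [w.lower() for w in words]

tc = TextCollection(['Hello', 'world'])
print(tc.freq('hello'))  # 0.5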
Example #25
class BitCounter(object):
    def __init__(self, mode):
        self.mode = mode
        self.regex = r'%s-[0-9]{4}.txt' % self.mode
        self.get_freq_dists()

    def get_freq_dists(self):
        print 'Counting word occurrences...'

        self.d_freq_dist = FreqDist()
        self.r_freq_dist = FreqDist()

        for entry in os.listdir(NGRAM_DIR):
            if isinstance(re.match(self.regex, entry), type(None)):
                continue
            print 'Processing {}...'.format(entry)
            with open(NGRAM_DIR + entry, 'r') as fh:
                # Skip header
                fh.readline()
                line = fh.readline()
                while line.strip():
                    party, phrase, count = line.strip().split('|')
                    if party not in ('D', 'R'):
                        line = fh.readline()
                        continue
                    count = int(count)
                    if party == 'D':
                        self.d_freq_dist[phrase] += count
                    elif party == 'R':
                        self.r_freq_dist[phrase] += count
                    line = fh.readline()

        self.vocab = list(
            set(self.d_freq_dist.keys()).union(set(self.r_freq_dist.keys())))

        # L1 smoothing
        for phrase in self.vocab:
            self.d_freq_dist[phrase] += 1
            self.r_freq_dist[phrase] += 1

    def get_frequencies(self, save=True):
        print 'Getting frequencies...'
        frequencies = []
        for phrase in self.vocab:
            frequencies.append((phrase, int(self.d_freq_dist[phrase]),
                                int(self.r_freq_dist[phrase])))
        df = pd.DataFrame(frequencies, columns=["term", "dmetric",
                                                "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "frequencies-" + self.mode)
        return df

    def get_partial_kl(self, denom="q", pfun=None, save=True):
        assert denom in ("mixture", "q")
        if save:
            assert denom == "q"
            save = "partial_kls" if save == True else save

        partial_kl_mixture = lambda p, pi, qi: p * math.log(
            2 * pi / (pi + qi), 2)
        partial_kl_q = lambda p, pi, qi: p * math.log(pi / qi, 2)
        partial_kl = partial_kl_q if denom == "q" else partial_kl_mixture

        print 'Computing partial KLs...'
        pkls = []
        for phrase in self.vocab:
            dp = self.d_freq_dist.freq(phrase)
            rp = self.r_freq_dist.freq(phrase)
            dscale = dp if not pfun else pfun(phrase)
            rscale = rp if not pfun else pfun(phrase)
            dpkl = partial_kl(dscale, dp, rp)
            rpkl = partial_kl(rscale, rp, dp)
            pkls.append((phrase, dpkl, rpkl))
        df = pd.DataFrame(pkls, columns=["term", "dmetric",
                                         "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + save + "-" + self.mode)
        return df

    def get_signal(self, denom="q", save=True):
        assert denom in ("mixture", "q")
        if save:
            assert denom == "q"

        signal_mixture = lambda pi, qi: math.log(2 * pi / (pi + qi), 2)
        signal_q = lambda pi, qi: math.log(pi / qi, 2)
        signal = signal_q if denom == "q" else signal_mixture

        print 'Computing signal reliability...'
        signals = []
        for phrase in self.vocab:
            dsr = signal(self.d_freq_dist.freq(phrase),
                         self.r_freq_dist.freq(phrase))
            rsr = signal(self.r_freq_dist.freq(phrase),
                         self.d_freq_dist.freq(phrase))
            signals.append((phrase, dsr, rsr))
        df = pd.DataFrame(signals, columns=["term", "dmetric",
                                            "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "signals-" + self.mode)
        return df

    def get_logps(self, save=True):
        logp = lambda pi: math.log(pi, 2)

        print 'Computing log p\'s...'
        logps = []
        for phrase in self.vocab:
            dlp = logp(self.d_freq_dist.freq(phrase))
            rlp = logp(self.r_freq_dist.freq(phrase))
            logps.append((phrase, dlp, rlp))
        df = pd.DataFrame(logps, columns=["term", "dmetric",
                                          "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "logps-" + self.mode)
        return df

    def get_mixtures(self, save=True):
        print 'Computing mixtures...'
        ms = []
        for phrase in self.vocab:
            m = (self.d_freq_dist.freq(phrase) +
                 self.r_freq_dist.freq(phrase)) / 2
            ms.append((phrase, m, m))
        df = pd.DataFrame(ms, columns=["term", "dmetric",
                                       "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "ms-" + self.mode)
        return df

    def get_probs(self, save=True):
        print 'Computing raw probabilities...'
        probs = []
        for phrase in self.vocab:
            probs.append((phrase, self.d_freq_dist.freq(phrase),
                          self.r_freq_dist.freq(phrase)))
        df = pd.DataFrame(probs, columns=["term", "dmetric",
                                          "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "probs-" + self.mode)
        return df

    # TODO: Get rid of this function; I think it's redundant with get_signal().
    def get_log_odds(self, save=True):
        print 'Computing log odds...'
        logodds = []
        for phrase in self.vocab:
            logodds.append((phrase,
                            math.log(
                                self.d_freq_dist.freq(phrase) /
                                self.r_freq_dist.freq(phrase)),
                            math.log(
                                self.r_freq_dist.freq(phrase) /
                                self.d_freq_dist.freq(phrase))))
        df = pd.DataFrame(logodds, columns=["term", "dmetric",
                                            "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "logodds-" + self.mode)
        return df

    def get_conditional_probs(self, save=True):
        print 'Computing conditional probabilities...'
        cond_probs = []
        n_d, n_r = sum(self.d_freq_dist.values()), \
                   sum(self.r_freq_dist.values())
        n = n_d + n_r
        for phrase in self.vocab:
            marg_prob = (self.d_freq_dist[phrase] +
                         self.r_freq_dist[phrase]) / n
            cp_d, cp_r = np.multiply(
                np.array([
                    self.d_freq_dist.freq(phrase),
                    self.r_freq_dist.freq(phrase)
                ]), np.array([n_d, n_r])) / (marg_prob * n)
            #cp_d, cp_r = np.multiply(np.array([ self.d_freq_dist.freq(phrase),
            #                                    self.r_freq_dist.freq(phrase) ]),
            #                         np.array([ 1, 1 ])) / ( marg_prob * 2 )
            cond_probs.append((phrase, cp_d, cp_r))
        df = pd.DataFrame(cond_probs, columns=["term", "dmetric",
                                               "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "cond_probs-" + self.mode)
        return df

    def get_likelihood_ratios(self, save=True):
        print 'Computing likelihood ratios...'
        lrs = []
        n_d, n_r = sum(self.d_freq_dist.values()), \
                   sum(self.r_freq_dist.values())
        n = n_d + n_r
        for phrase in self.vocab:
            dp, rp = self.d_freq_dist.freq(phrase), self.r_freq_dist.freq(
                phrase)
            lrs.append((phrase, dp / rp, rp / dp))
        df = pd.DataFrame(lrs, columns=["term", "dmetric",
                                        "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save:
            df.to_pickle(METRICS_DIR + "likelihood_ratios-" + self.mode)
        return df

    def _get_valence(self, phrase):
        v, a, d = get_valence(phrase)
        return (phrase, v, a, d)

    def get_valence(self, save=True):
        print "Getting valence..."
        vals = [self._get_valence(phrase) for phrase in self.vocab]
        df = pd.DataFrame(vals,
                          columns=["term", "valence", "arousal",
                                   "dominance"]).set_index("term")
        if save:
            df.to_pickle(METRICS_DIR + "valence-" + self.mode)
        return df

    def get_all(self, save=True):
        self.get_frequencies(save=save)
        self.get_partial_kl(save="partial_kls" if save else save)
        self.get_signal(save=save)
        self.get_probs(save=save)
        self.get_log_odds(save=save)
        self.get_conditional_probs(save=save)
Example #26
class DirichletWords(object):

  def initialize_index(self):
    self.word_to_int = {}
    self.int_to_word = {}


  def __init__(self, num_topics, alpha_topic = 1.0, alpha_word = 1.0, 
               max_tables = 50000, sanity_check=False, initialize=False,
               report_filename="topic_history.txt"):

    self.max_tables = max_tables
    self._alphabet = FreqDist()
    # store all words seen in a list so they are associated with a unique ID.

    self.initialize_index()

    self._words = FreqDist()

    self.alpha_topic = alpha_topic
    self.alpha_word = alpha_word

    self._num_updates = 0
    self._report = None
    if report_filename:
        self._report = open(report_filename, 'w')

    self.num_topics = num_topics
    self._topics = [FreqDist() for x in xrange(num_topics)]

    # the sanity_check flag is for testing only. 
    if initialize and sanity_check == True:
        self.deterministic_seed()
    elif initialize:
        self.initialize_topics()

  def deterministic_seed(self):
    ''' if sanity_check = True, this will seed the topics with enough variance
    to evolve but do so in the most basic and deterministic way possible, so a
    user can follow along each step of the algorithm'''

    chars = "abcdefghijklmnopqrstuvwxyz"
    for i in xrange(3):
      word = random.choice(chars)
      self.index(word)
      topic_weights = probability_vector(self.num_topics)
      for k in xrange(self.num_topics):
        self.update_count(word, k, topic_weights[k])

  def initialize_topics(self):
    ''' initializes the topics with some random seed words so that they have
        enough relative bias to evolve when new words are passed in.  '''
    # we are going to create some random string from /dev/urandom. to convert
    # them to a string, we need a translation table that is 256 characters. 
    translate_table = (string.letters*5)[:256]
    # /dev/urandom is technically not as random as /dev/random, but it doesn't
    # block. 
    r = open('/dev/urandom')
    # make random 'words' and add them to the topics. they'll never
    # realistically be seen again- which is good since we just want them to
    # seed the bias in the topics. 
    for i in xrange(self.num_topics):
        word_length = random.randint(9,20)
        word = r.read(word_length).translate(translate_table)
        self.index(word)
        topic_weights = probability_vector(self.num_topics)
        for k in xrange(self.num_topics):
            self.update_count(word, k, topic_weights[k])
    r.close()

  def __len__(self):
    return len(self._words)

  def num_words(self):
      return sum(1 for x in self._words if self._words[x] >= 1)

  def as_matrix(self):
    ''' Return a matrix of the probabilities of all words over all topics.
        note that because we are using topic_prob(), this is equivalent to he
        expectation of log beta, ie Elogbeta '''
    
    #  XXX TODO we should store this on the fly instead of recomputing it
    #  all the time!

    # create a numpy array here because that's what the e_step in streamLDA
    # expects

    num_words = self.num_words()
    print("%i words" % num_words)
    lambda_matrix = n.zeros((self.num_topics, num_words))

    for word_index, word in enumerate(x for x in self._words \
                                      if self._words[x] >= 1):
        topic_weights = [log(self.topic_prob(k, word)) \
                         for k in xrange(self.num_topics)]

        # topic weights for this word-- a column vector. 
        lambda_matrix[:,word_index] = topic_weights

    self._num_updates += 1
    if self._report:
        self._report.write("%i %i %i %i\n" % (self._num_updates,
                                              len(self._alphabet), \
                                              len(self._words),
                                              sum(x.B() for x in self._topics)))
        
    return lambda_matrix



  def forget(self, proportion):

    num_tables = len(self._words)      
    number_to_forget = proportion * num_tables
    if num_tables > self.max_tables:
      number_to_forget += (num_tables - self.max_tables)
    
    # change this to weight lower probability
    # random.sample needs an integer sample size
    tables_to_forget = random.sample(xrange(num_tables), int(number_to_forget))
    words = self._words.keys()

    self.initialize_index()

    word_id = -1
    for ii in words:
      word_id += 1

      if not word_id in tables_to_forget:
        self.index(ii)
        continue

      count = self._words[ii]
      # iterate over topic indices; self._topics is a list of FreqDists
      for jj in xrange(self.num_topics):
        self._topics[jj][ii] = 0
        del self._topics[jj][ii]

      # decrement the per-character counts kept in self._alphabet
      for jj in ii:
        self._alphabet[jj] -= count
      self._words[ii] = 0
      del self._words[ii]

  def seq_prob(self, word):
    val = 1.0

    # Weighted monkeys at typewriter
    for ii in word:
      # Add in a threshold to make sure we don't have zero probability sequences
      val *= max(self._alphabet.freq(ii), CHAR_SMOOTHING) 

    # Normalize
    val /= 2**(len(word))
    return val

  def merge(self, otherlambda, rhot):
    ''' fold the word counts in another DirichletWords object into this
        one, weighted by rhot. assumes self.num_topics is the same for both
        objects. '''
    
    all_words = self._words.keys() + otherlambda._words.keys()
    distinct_words = list(set(all_words))

    # combines the probabilities, with otherlambda weighted by rho, and
    # generates a new count by combining the number of words in the old
    # (current) lambda with the number in the new. here we essentially take
    # the same steps as update_count but do so explicitly so we can weight the
    # terms appropriately. 
    total_words = float(self._words.N() + otherlambda._words.N())

    self_scale = (1.0-rhot)*total_words/float(self._words.N())
    other_scale = rhot*total_words/float(otherlambda._words.N())

    for word in distinct_words:
      self.index(word)
        
      # update word counts
      new_val = (self_scale*self._words[word] 
                 + other_scale*otherlambda._words[word])
      if new_val >= 1.0:
          self._words[word] = new_val
      else:
          self._words[word] = 0
          del self._words[word]
      
      # update topic counts
      for topic in xrange(self.num_topics):
        new_val = (self_scale*self._topics[topic][word] 
                   + other_scale*otherlambda._topics[topic][word])
        if new_val >= 1.0:
            self._topics[topic][word] = new_val
        else:
            self._topics[topic][word] = 0
            del self._topics[topic][word]
         
    # update sequence counts
    all_chars = self._alphabet.keys() + otherlambda._alphabet.keys()
    distinct_chars = list(set(all_chars))
 
    for ii in distinct_chars:
      self._alphabet[ii] = (self_scale*self._alphabet[ii] 
                            + other_scale*otherlambda._alphabet[ii])

  def word_prob(self, word):
    return (self._words[word] + self.alpha_word * self.seq_prob(word)) / \
           (self._words.N() + self.alpha_word)

  def topic_prob(self, topic, word):
    return (self._topics[topic][word] + \
            self.alpha_topic * self.word_prob(word)) / \
            (self._topics[topic].N() + self.alpha_topic)

  def update_count(self, word, topic, count):
    # create an index for the word
    self.index(word)
      
    # increment the frequency of the word in the specified topic
    self._topics[topic][word] += count
    # also keep a separate frequency count of the number of times this word has
    # appeared, across all documents. 
    self._words[word] += count
    # finally, keep track of the appearance of each character.
    # note that this does not assume any particular character set nor limit
    # recognized characters. if words contain punctuation, etc. then they will
    # be counted here. 
    for ii in word:
      self._alphabet[ii] += count

  def index(self, word):
      assert not isinstance(word, int)

      if not word in self.word_to_int:
          self.word_to_int[word] = len(self.word_to_int)
          self.int_to_word[self.word_to_int[word]] = word

      return self.word_to_int[word]

  def dictionary(self, word_id):
      assert isinstance(word_id, int)

      return self.int_to_word[word_id]

  def print_probs(self, word):
    print "----------------"
    print word
    for ii in xrange(self.num_topics):
      print ii, self.topic_prob(ii, word)
    print "WORD", self.word_prob(word)
    print "SEQ", self.seq_prob(word)
# coding: utf-8
import nltk
from nltk.corpus import gutenberg  # import the gutenberg corpus
##################################################################
## FreqDist tracks sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution and count the tokens in the text
print(fd)  # <FreqDist with 51156 samples and 2621613 outcomes>; 51156 distinct samples, 2621613 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; look up a word's count; a FreqDist behaves like a dict
print(fd.N())  # 98171; total words (not characters), counted with repetition
print(fd.B())  # 6132; number of bins or unique samples; identical words fall into the same bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the full vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] hapaxes: words that occur only once
# the most frequent words are mostly function words, while the rarest (hapaxes) can only be understood from context; the most and least frequent words in a text usually do not characterize it well
for idx, word in enumerate(fd):  # you can iterate with enumerate; items come in order of first appearance
    if idx == 5: break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## Counting English letters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in [] to make a list
print(fdist.most_common(5))  # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
Example #28
    def test_freq_freqdist(self):
        """Probabilities are identical to using FreqDist."""
        freqdist = FreqDist(TEST_TOKENS)
        for word_type in set(TEST_TOKENS):
            self.assertEqual(self.model.prob(word_type, None),
                             freqdist.freq(word_type))
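A self-contained restatement of the same equivalence, using nltk.probability.MLEProbDist in place of the test's model object (the TEST_TOKENS data here is made up):

from nltk import FreqDist
from nltk.probability import MLEProbDist

TEST_TOKENS = ['a', 'b', 'a', 'c']
freqdist = FreqDist(TEST_TOKENS)
mle = MLEProbDist(freqdist)
assert all(mle.prob(w) == freqdist.freq(w) for w in set(TEST_TOKENS))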
def load_book_features(filename, smartStopWords={}, pronSet={}, conjSet={}):
    '''
    Load features for each book in the corpus. There are 4 + RANGE*4 features
    for each instance. These features are:
       ---------------------------------------------------------------------------------------------------------
       No. Feature Name                                                                         No. of features.
       ---------------------------------------------------------------------------------------------------------
       1.  number of hapax legomena divided by number of unique words                           1
       2.  number of dis legomena divided by number of unique words                             1
       3.  number of unique words divided by number of total words                              1
       4.  flesch readability score divided by 100                                              1

       5.  no. of sentences of length in the range [1, RANGE] divided by the                    RANGE
           number of total sentences
       6.  no. of words of length in the range [1, RANGE] divided by the                        RANGE
           number of total words
       7.  no. of nominative pronouns per sentence in the range [1, RANGE] divided by the       RANGE
           number of total sentences
       8.  no. of (coordinating + subordinating) conjunctions per sentence in the range         RANGE
           [1, RANGE] divided by the number of total sentences
    '''

    text = extract_book_contents(open(filename, 'r').read()).lower()

    contents = re.sub('\'s|(\r\n)|-+|["_]', ' ', text) # remove \r\n, apostrophes, and dashes
    sentenceList = sent_tokenize(contents.strip())

    cleanWords = []
    sentenceLenDist = []
    pronDist = []
    conjDist = []
    sentences = []
    totalWords = 0
    wordLenDist = []
    totalSyllables = 0
    for sentence in sentenceList:
        if sentence != ".":
            pronCount = 0
            conjCount = 0
            sentences.append(sentence)
            sentenceWords = re.findall(r"[\w']+", sentence)
            totalWords += len(sentenceWords) # record all words in sentence
            sentenceLenDist.append(len(sentenceWords)) # record length of sentence in words
            for word in sentenceWords:
                totalSyllables += count(word)
                wordLenDist.append(len(word)) # record length of word in chars
                if word in pronSet:
                    pronCount+=1 # record no. of pronouns in sentence
                if word in conjSet:
                    conjCount+=1 # record no. of conjunctions in sentence
                if word not in smartStopWords:
                    cleanWords.append(word)
            pronDist.append(pronCount)
            conjDist.append(conjCount)

    sentenceLengthFreqDist = FreqDist(sentenceLenDist)
    sentenceLengthDist = map(lambda x: sentenceLengthFreqDist.freq(x), range(1, RANGE))
    sentenceLengthDist.append(1-sum(sentenceLengthDist))

    pronounFreqDist = FreqDist(pronDist)
    pronounDist = map(lambda x: pronounFreqDist.freq(x), range(1, RANGE))
    pronounDist.append(1-sum(pronounDist))

    conjunctionFreqDist = FreqDist(conjDist)
    conjunctionDist = map(lambda x: conjunctionFreqDist.freq(x), range(1, RANGE))
    conjunctionDist.append(1-sum(conjunctionDist))

    wordLengthFreqDist= FreqDist(wordLenDist)
    wordLengthDist = map(lambda x: wordLengthFreqDist.freq(x), range(1, RANGE))
    wordLengthDist.append(1-sum(wordLengthDist))

    # calculate readability
    avgSentenceLength = np.mean(sentenceLenDist)
    avgSyllablesPerWord = float(totalSyllables)/totalWords
    readability = float(206.835 - (1.015 * avgSentenceLength) - (84.6 * avgSyllablesPerWord))/100

    wordsFreqDist = MyFreqDist(FreqDist(cleanWords))
    #sentenceDist = FreqDist(sentences)
    #print sentenceDist.keys()[:15] # most common sentences
    #print wordsFreqDist.keys()[:15] # most common words
    #print wordsFreqDist.keys()[-15:] # most UNcommon words

    numUniqueWords = len(wordsFreqDist.keys())
    numTotalWords = len(cleanWords)

    hapax = float(len(wordsFreqDist.hapaxes()))/numUniqueWords # no. words occurring once / total num. UNIQUE words
    dis = float(len(wordsFreqDist.dises()))/numUniqueWords # no. words occurring twice / total num. UNIQUE words
    richness = float(numUniqueWords)/numTotalWords # no. unique words / total num. words

    result = []
    result.append(hapax)
    result.append(dis)
    result.append(richness)
    result.append(readability)
    result.extend(sentenceLengthDist)
    result.extend(wordLengthDist)
    result.extend(pronounDist)
    result.extend(conjunctionDist)

    return result, numTotalWords
Example #30
File: dsc.py Project: dmml/NLTK
    return stem


def lexical_diversity(text):
    return len(text) / len(set(text))


# process for author######
f = open('author.txt', encoding="latin-1")
raw_author = f.read()
author_list = [a for a in (re.split(r'[\t\n]+', raw_author)) if 2 < len(a) < 29]
author_list = nltk.Text(author_list)

fdist_author = FreqDist(author_list)
fdist_author.max()
fdist_author.freq('Vincent Granville')
fdist_author.tabulate(10)
fdist_author.plot(50, cumulative=True)
fdist_author.most_common(10)
popular_author = ['Vincent Granville', 'Michael Walker', 'Mirko Krivanek', 'Don Philip Faithful', 'William Vorhies',
                  'Bernard Marr']
total_author = len(set(author_list))
print("The total number of author in dsc is: " + str(total_author))
avg_post = 1700 / len(set(author_list))
print("Each author post " + str(int(avg_post)) + " blogs in dsc")

# process for text #############
f = open('text.txt', encoding="latin-1")
raw_text = f.read()
# type
type(raw_text)
Example #31
    if args.stop_punctuation:
        stoplist += [x.decode('UTF8') for x in set(list(punctuation))]
        stoplist += [u'\u201d', u'\u201c', u'\u2019', u'\u2014']
        stoplist.append('--')

    words = [word for word in word_tokenize(text) if word not in stoplist]
    if args.stem:
        st = LancasterStemmer()
        words = [st.stem(word) for word in words]

    freq_dist = FreqDist(words)

    print('Total words: ' + str(orig_freq_dist.N()))
    print('Total after filter: ' + str(freq_dist.N()))
    # B() gives list of unique words
    print('Unique words: ' + str(freq_dist.B()))
    print('Unique words ratio: ' +
          str(float(freq_dist.B()) / float(freq_dist.N())))
    print('\n')

    if args.words:
        for word in args.words:
            print(word + ': ' + str(freq_dist[word]))
            print(word + ' freq: ' + str(freq_dist.freq(word)))
            print('\n')

    # Show top 30
    print('Top ' + str(args.num_words) + ' words:')
    freq_dist.tabulate(args.num_words)
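# Note: the snippet above is a fragment; `args`, `stoplist`, `text` and `orig_freq_dist`
# are defined earlier in the original script and are not shown. A hypothetical setup that
# would make the fragment runnable (all flag names and defaults beyond those used in the
# fragment are assumptions, not the project's actual CLI):
import argparse
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

parser = argparse.ArgumentParser()
parser.add_argument('file')
parser.add_argument('--stem', action='store_true')
parser.add_argument('--stop-punctuation', action='store_true')
parser.add_argument('--words', nargs='*')
parser.add_argument('--num-words', type=int, default=30)
args = parser.parse_args()

text = open(args.file).read()
stoplist = stopwords.words('english')
orig_freq_dist = FreqDist(word_tokenize(text))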
Ejemplo n.º 32
0
from nltk import FreqDist
from common.books import text1

fdist = FreqDist(len(w) for w in text1())
print(fdist)
# print(fdist.keys())
# print(fdist.items())

print(fdist.most_common())
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))
Ejemplo n.º 33
0
class Solution1:
    def __init__(self, dictionary_file, training_file):
        self.dictionary = self.read_json_file(dictionary_file)
        training_data = self.read_text_file(training_file)
        self.uni_words = None
        self.bi_words = None
        self.tri_words = None
        self.uni_words_pos = None
        self.bi_words_pos = None
        self.uni_pos = None
        self.bi_pos = None
        self.train(training_data)

    @staticmethod
    def read_text_file(filename):
        try:
            file = open(filename, 'r')
        except:
            print('Cannot read file ' + filename + '. Check the path',
                  file=sys.stderr)
            sys.exit(1)
        output = []

        for line in file:
            line = line.strip().lower()
            output.append(line)
        return output

    @staticmethod
    def read_json_file(filename):
        try:
            file = open(filename, 'r')
        except:
            print('Cannot read file ' + filename + '. Please check the path',
                  file=sys.stderr)
            sys.exit(1)
        return json.load(file)

    @staticmethod
    def words_sentence(words):
        return ''.join([
            word if word in string.punctuation else ' ' + word
            for word in words
        ]).strip()

    @staticmethod
    def print_translation(title, source, translation):
        print('%s' % title)
        print('%s' % source)
        print('%s' % translation)
        print('\n')

    def train(self, lines):

        uni_words = []
        bi_words = []
        tri_words = []
        uni_words_pos = []
        bi_words_pos = []
        uni_pos = []
        bi_pos = []

        for line in lines:
            words = word_tokenize(line)
            words_pos = pos_tag(words)
            pos = [word[1] for word in words_pos]
            uni_words = uni_words + ['<s>'] + words + ['</s>']
            uni_words_pos = uni_words_pos + words_pos
            uni_pos = uni_pos + ['<s>'] + pos + ['</s>']
            bi_words = bi_words + list(
                ngrams(words,
                       2,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))
            bi_words_pos = bi_words_pos + list(
                ngrams(words_pos,
                       2,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))
            # bigram_pos_probability() works on plain POS tag sequences,
            # so build bi_pos from pos rather than from (word, tag) pairs
            bi_pos = bi_pos + list(
                ngrams(pos,
                       2,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))
            # tri_words is declared above and turned into self.tri_words below,
            # but was never filled; populate it so trigram_words_probability()
            # has a non-empty model
            tri_words = tri_words + list(
                ngrams(words,
                       3,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))

        self.uni_words = FreqDist(uni_words)
        self.bi_words = FreqDist(bi_words)
        self.tri_words = FreqDist(tri_words)
        self.uni_words_pos = FreqDist(uni_words_pos)
        self.bi_words_pos = FreqDist(bi_words_pos)
        self.uni_pos = FreqDist(uni_pos)
        self.bi_pos = FreqDist(bi_pos)

    def bigram_words_probability(self, words):
        probability = 0
        vocabulary_size = len(self.uni_words)
        bigrams = list(
            ngrams(words,
                   2,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>'))
        for bigram in bigrams:
            probability += math.log(self.bi_words.freq(bigram) + 1) - math.log(
                self.uni_words.freq(bigram[1]) + vocabulary_size)

        return probability

    def trigram_words_probability(self, words):
        probability = 0
        vocabulary_size = len(self.uni_words)
        trigrams = list(
            ngrams(words,
                   3,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>'))
        for trigram in trigrams:
            # condition on the first two words of the trigram (a bigram key),
            # not on a single word, which could never match a bigram entry
            probability += math.log(self.tri_words.freq(trigram) +
                                    1) - math.log(
                                        self.bi_words.freq(trigram[:2]) +
                                        vocabulary_size)

        return probability

    def bigram_pos_words_probability(self, words):
        words = pos_tag(words)
        probability = 0
        vocabulary_size = len(self.uni_words_pos)
        bigrams = list(
            ngrams(words,
                   2,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>'))
        for bigram in bigrams:
            probability += math.log(self.bi_words_pos.freq(bigram) +
                                    1) - math.log(
                                        self.uni_words_pos.freq(bigram[1]) +
                                        vocabulary_size)

        return probability

    def bigram_pos_probability(self, words):
        probability = 0
        vocabulary_size = len(self.uni_pos)
        bigrams = list(ngrams(words, 2))
        for bigram in bigrams:
            probability += math.log(self.bi_pos.freq(bigram) + 1) - math.log(
                self.uni_pos.freq(bigram[1]) + vocabulary_size)

        return probability

    def probability_permutation(self, words, method):
        max_probability = -math.inf
        selected = None
        permutation_count = math.factorial(
            len(words)) if len(words) < 5 else 100
        for _ in range(permutation_count):
            permutation = numpy.random.permutation(words)
            probability = getattr(self, method)(permutation)
            if probability > max_probability:
                max_probability = probability
                selected = permutation

        return selected

    def pos_model(self, words):
        words_pos = [('', '<s>')] + pos_tag(words) + [('', '</s>')]
        length = len(words_pos)

        for index, word in enumerate(words_pos):
            words_window = words_pos[index:index + 4]

            max_probability = -math.inf
            selected = None
            permutations = itertools.permutations(words_window)
            for permutation in permutations:
                pos = [word[1] for word in permutation]
                probability = self.bigram_pos_probability(pos)
                if probability > max_probability:
                    max_probability = probability
                    selected = permutation

            words_pos[index] = selected[0]
            words_pos[index + 1] = selected[1]
            words_pos[index + 2] = selected[2]
            words_pos[index + 3] = selected[3]

            if index == length - 4:
                break
        return [word[0] for word in words_pos]

    def swap_pos(self, words):
        words_pos = pos_tag(words)
        length = len(words_pos)
        for index, word in enumerate(words_pos):
            if (word[1] == 'PRP' or word[1] == 'PRP$' or word[1] == 'JJ') \
                and (words_pos[index + 1][1] == 'VB' or words_pos[index + 1][1] == 'VBD' \
                     or words_pos[index + 1][1] == 'VBG' or words_pos[index + 1][1] == 'VBN' \
                     or words_pos[index + 1][1] == 'VBP' or words_pos[index + 1][1] == 'WP'):
                temp_word = words_pos[index + 1]
                words_pos[index + 1] = words_pos[index]
                words_pos[index] = temp_word
            # stop before the last token so words_pos[index + 1] stays in range
            if index == length - 2:
                break
        return [word[0] for word in words_pos]

    def swap_verb_after_noun(self, words):
        words_pos = pos_tag(words)
        length = len(words_pos)
        for index, word in enumerate(words_pos):
            if (word[1] == 'NN' or word[1] == 'NNS' or word[1] == 'NNP' or word[1] == 'NNPS') \
                and (words_pos[index + 1][1] == 'VB' or words_pos[index + 1][1] == 'VBD' \
                     or words_pos[index + 1][1] == 'VBG' or words_pos[index + 1][1] == 'VBN' \
                     or words_pos[index + 1][1] == 'VBP' or words_pos[index + 1][1] == 'VBZ'):
                temp_word = words_pos[index + 1]
                words_pos[index + 1] = words_pos[index]
                words_pos[index] = temp_word
            # stop before the last token so words_pos[index + 1] stays in range
            if index == length - 2:
                break
        return [word[0] for word in words_pos]

    def translate(self, line):
        words = word_tokenize(line)
        translated_words = []
        for i, word in enumerate(words):
            if word not in string.punctuation:
                translated_words.append(self.dictionary[word])
            else:
                translated_words.append(word)
        translated_sentence = self.words_sentence(translated_words)
        self.print_translation('Translation with 0 strategy', line,
                               translated_sentence)

        #Swap the nearest adjective with the word after noun
        translated_words = self.swap_pos(translated_words)
        translated_sentence = self.words_sentence(translated_words)
        self.print_translation('Translation after swapping parts of speech',
                               line, translated_sentence)

        #Swap the nearest verb with the word after noun
        translated_words = self.swap_verb_after_noun(translated_words)
        translated_sentence = self.words_sentence(translated_words)
        self.print_translation('Translation after swapping verb with noun',
                               line, translated_sentence)

        #Bigram Language Model
        selected_translation = self.probability_permutation(
            translated_words, 'bigram_words_probability')
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation('Translation after applying Bigram Model', line,
                               translated_sentence)

        #Trigram Language Model
        selected_translation = self.probability_permutation(
            translated_words, 'trigram_words_probability')
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation('Translation after applying Trigram Model',
                               line, translated_sentence)

        #Bigram POS Language Model
        selected_translation = self.probability_permutation(
            translated_words, 'bigram_pos_words_probability')
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation(
            'Translation after applying Bigram and POS Tagging', line,
            translated_sentence)

        #Rearrangement of POS
        selected_translation = self.pos_model(translated_words)
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation('Translation after POS rearrangement', line,
                               translated_sentence)

    def execute(self, input_file):
        lines = self.read_text_file(input_file)
        for line in lines:
            self.translate(line)
Ejemplo n.º 34
0
# Counting the number of characters in each word in a text
[len(w) for w in text1]

# Collocations are frequent bigrams from words that are not so common as unigrams. 
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Computing the frequency distribution of word lengths. Returns a dictionary.
fdistWordLength = FreqDist([len(w) for w in text1])

fdistWordLength.keys() # The different word lengths
fdistWordLength.values() # The frequency of each word length
fdistWordLength.items() # Shows both keys and values at the same time

fdist1['the']
fdist1.freq('the') # Frequency of the word ‘the’
fdist1.max()



#### MOVIE REVIEWS ####
import nltk
from nltk.corpus import movie_reviews

movie_reviews.categories()
movie_reviews.fileids('pos')
movie_reviews.fileids('neg')
movie_reviews.words('neg/cv729_10475.txt')
len(movie_reviews.words('neg/cv729_10475.txt'))

documents = [(list(movie_reviews.words(fileid)), category)
Ejemplo n.º 35
0
class DirichletWords(object):
    def initialize_index(self):
        self.word_to_int = {}
        self.int_to_word = {}

    def __init__(self,
                 num_topics,
                 alpha_topic=1.0,
                 alpha_word=1.0,
                 max_tables=50000,
                 sanity_check=False,
                 initialize=False,
                 report_filename="topic_history.txt"):

        self.max_tables = max_tables
        self._alphabet = FreqDist()
        # store all words seen in a list so they are associated with a unique ID.

        self.initialize_index()

        self._words = FreqDist()

        self.alpha_topic = alpha_topic
        self.alpha_word = alpha_word

        self._num_updates = 0
        self._report = None
        if report_filename:
            self._report = open(report_filename, 'w')

        self.num_topics = num_topics
        self._topics = [FreqDist() for x in xrange(num_topics)]

        # the sanity_check flag is for testing only.
        if initialize and sanity_check == True:
            self.deterministic_seed()
        elif initialize:
            self.initialize_topics()

    def deterministic_seed(self):
        ''' if sanity_check = True, this will seed the topics with enough variance
    to evolve but do so in the most basic and deterministic way possible, so a
    user can follow along each step of the algorithm'''

        chars = "abcdefghijklmnopqrstuvwxyz"
        for i in xrange(3):
            word = random.choice(chars)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])

    def initialize_topics(self):
        ''' initializes the topics with some random seed words so that they have
        enough relative bias to evolve when new words are passed in.  '''
        # we are going to create some random string from /dev/urandom. to convert
        # them to a string, we need a translation table that is 256 characters.
        translate_table = (string.letters * 5)[:256]
        # /dev/urandom is technically not as random as /dev/random, but it doesn't
        # block.
        r = open('/dev/urandom')
        # make random 'words' and add them to the topics. they'll never
        # realistically be seen again- which is good since we just want them to
        # seed the bias in the topics.
        for i in xrange(self.num_topics):
            word_length = random.randint(9, 20)
            word = r.read(word_length).translate(translate_table)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])
        r.close()

    def __len__(self):
        return len(self._words)

    def num_words(self):
        return sum(1 for x in self._words if self._words[x] >= 1)

    def as_matrix(self):
        ''' Return a matrix of the probabilities of all words over all topics.
        note that because we are using topic_prob(), this is equivalent to the
        expectation of log beta, i.e. Elogbeta '''

        #  XXX TODO we should store this on the fly instead of recomputing it
        #  all the time!

        # create a numpy array here because that's what the e_step in streamLDA
        # expects

        num_words = self.num_words()
        print("%i words" % num_words)
        lambda_matrix = n.zeros((self.num_topics, num_words))

        for word_index, word in enumerate(x for x in self._words \
                                          if self._words[x] >= 1):
            topic_weights = [log(self.topic_prob(k, word)) \
                             for k in xrange(self.num_topics)]

            # topic weights for this word-- a column vector.
            lambda_matrix[:, word_index] = topic_weights

        self._num_updates += 1
        if self._report:
            self._report.write("%i %i %i %i\n" % (self._num_updates,
                                                  len(self._alphabet), \
                                                  len(self._words),
                                                  sum(x.B() for x in self._topics)))

        return lambda_matrix

    def forget(self, proportion):

        num_tables = len(self._words)
        # random.sample requires an integer sample size
        number_to_forget = int(proportion * num_tables)
        if num_tables > self.max_tables:
            number_to_forget += (num_tables - self.max_tables)

        # change this to weight lower probability
        tables_to_forget = random.sample(xrange(num_tables), number_to_forget)
        words = self._words.keys()

        self.initialize_index()

        word_id = -1
        for ii in words:
            word_id += 1

            if not word_id in tables_to_forget:
                self.index(ii)
                continue

            count = self._words[ii]
            # remove the word from every topic distribution
            for topic in self._topics:
                topic[ii] = 0
                del topic[ii]

            # decrement the per-character counts kept in self._alphabet
            for jj in ii:
                self._alphabet[jj] -= count
            self._words[ii] = 0
            del self._words[ii]

    def seq_prob(self, word):
        val = 1.0

        # Weighted monkeys at typewriter
        for ii in word:
            # Add in a threshold to make sure we don't have zero probability sequences
            val *= max(self._alphabet.freq(ii), CHAR_SMOOTHING)

        # Normalize
        val /= 2**(len(word))
        return val

    def merge(self, otherlambda, rhot):
        ''' fold the word counts in another DirichletWords object into this
        one, weighted by rhot. assumes self.num_topics is the same for both
        objects. '''

        all_words = self._words.keys() + otherlambda._words.keys()
        distinct_words = list(set(all_words))

        # combines the probabilities, with otherlambda weighted by rho, and
        # generates a new count by combining the number of words in the old
        # (current) lambda with the number in the new. here we essentially take
        # the same steps as update_count but do so explicitly so we can weight the
        # terms appropriately.
        total_words = float(self._words.N() + otherlambda._words.N())

        self_scale = (1.0 - rhot) * total_words / float(self._words.N())
        other_scale = rhot * total_words / float(otherlambda._words.N())

        for word in distinct_words:
            self.index(word)

            # update word counts
            new_val = (self_scale * self._words[word] +
                       other_scale * otherlambda._words[word])
            if new_val >= 1.0:
                self._words[word] = new_val
            else:
                self._words[word] = 0
                del self._words[word]

            # update topic counts
            for topic in xrange(self.num_topics):
                new_val = (self_scale * self._topics[topic][word] +
                           other_scale * otherlambda._topics[topic][word])
                if new_val >= 1.0:
                    self._topics[topic][word] = new_val
                else:
                    self._topics[topic][word] = 0
                    del self._topics[topic][word]

        # update sequence counts
        all_chars = self._alphabet.keys() + otherlambda._alphabet.keys()
        distinct_chars = list(set(all_chars))

        for ii in distinct_chars:
            self._alphabet[ii] = (self_scale * self._alphabet[ii] +
                                  other_scale * otherlambda._alphabet[ii])

    def word_prob(self, word):
        return (self._words[word] + self.alpha_word * self.seq_prob(word)) / \
               (self._words.N() + self.alpha_word)

    def topic_prob(self, topic, word):
        return (self._topics[topic][word] + \
                self.alpha_topic * self.word_prob(word)) / \
                (self._topics[topic].N() + self.alpha_topic)
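    # For reference: word_prob() above is a Dirichlet-smoothed unigram estimate,
    #     P(w) = (count(w) + alpha_word * P_seq(w)) / (N + alpha_word),
    # where P_seq(w) is the character-level "monkeys at typewriter" prior from
    # seq_prob(). topic_prob() applies the same smoothing one level up:
    #     P(w | k) = (count(w, k) + alpha_topic * P(w)) / (N_k + alpha_topic).
    # This is a reading of the code above, not documentation from the original project.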

    def update_count(self, word, topic, count):
        # create an index for the word
        self.index(word)

        # increment the frequency of the word in the specified topic
        self._topics[topic][word] += count
        # also keep a separate frequency count of the number of times this word has
        # appeared, across all documents.
        self._words[word] += count
        # finally, keep track of the appearance of each character.
        # note that this does not assume any particular character set nor limit
        # recognized characters. if words contain punctuation, etc. then they will
        # be counted here.
        for ii in word:
            self._alphabet[ii] += count

    def index(self, word):
        assert not isinstance(word, int)

        if not word in self.word_to_int:
            self.word_to_int[word] = len(self.word_to_int)
            self.int_to_word[self.word_to_int[word]] = word

        return self.word_to_int[word]

    def dictionary(self, word_id):
        assert isinstance(word_id, int)

        return self.int_to_word[word_id]

    def print_probs(self, word):
        print "----------------"
        print word
        for ii in xrange(self.num_topics):
            print ii, self.topic_prob(ii, word)
        print "WORD", self.word_prob(word)
        print "SEQ", self.seq_prob(word)
Ejemplo n.º 36
0
# dest = '/Users/asif/Sites/pmidx/journals.csv'
# f = open(dest, 'w+')
# f.write(journalsCSV)
# f.close()

# Tokenized titles
tokenized_titles = []
tokenized_titles = [word_tokenize(titles[x]) for x in xrange(0,len(titles))]
tkTitlesList = []
for n in xrange(0,len(tokenized_titles)):
	tkTitlesList = tkTitlesList + tokenized_titles[n]
stops=['a','the','had','.','(',')','and','of',':',',','in','[',']','for','by','--','?','an','\'','\'s','to','on','is','as','from','-','at','can','does','or','but','use','its','with','using','during']
tokenizedTitles = [token.lower() for token in tkTitlesList if token.lower() not in stops]
fdist = FreqDist(tokenizedTitles)
sortedTitleWords = fdist.keys()
sortedTitleProb = [fdist.freq(token) for token in sortedTitleWords]
sortedTitleN = fdist.N()
sortedTitleCounts = [int(prob*sortedTitleN) for prob in sortedTitleProb]
titlesCounter = {}
for x in xrange(0,60):
	titlesCounter[sortedTitleWords[x]] = sortedTitleCounts[x]

# Returns collaborators as a dictionary matrix
def collaborators_matrix(authors):
	coll = {}
	for x in xrange(0,len(authors)):
		if authors[x]:
			for y in xrange(0,len(authors[x])):
				for z in xrange(0,len(authors[x])):
					if authors[x][y] != authors[x][z]:
						if authors[x][y] in coll.keys(): # first author
Ejemplo n.º 37
0
		return ' '.join( tl[tl.index('[') + 1 : tl.index(']')] )
	else:
		return ' '.join( tl[ 0 : 5 ] )
	return

# 3. Unigrams
from nltk import FreqDist
# a. Lowercase the tokens in emma and create a frequency distribution from them. 
# (Do not throw away punctuation.) Store the result in fd1.
fd1 = FreqDist( list( t.lower() for t in emma) )

# b. Set A3b to the count of the word 'town' in fd1.
A3b = fd1['town']

# c. Set A3c to the relative frequency (probability) of the word 'town' in fd1.
A3c = fd1.freq('town')

# d. Set A3d to the number of hapaxes in the distribution fd1.
A3d = len( list( x for x in fd1 if fd1[x] == 1 ) )


# 4. When one formats floating-point numbers, one can specify the number of
# digits after the decimal point as follows:
# >>> '{:.4}'.format(1/7)
# >>> '0.1429'

# Write a function print_uni that takes a FreqDist as input and prints a table with 
# three columns: a word, its count, and its relative frequency. It should print the 
# words in alphabetic order. The first column should be 10 characters wide. If a word 
# is more than 10 characters long, truncate it to 10 characters. The second column 
# should be five characters wide, and the relative frequency should be printed with 
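# The sentence above is cut off; a minimal sketch of print_uni, assuming the missing
# ending asks for four digits of precision as in the '{:.4}' example further up:
def print_uni(fd):
    # word (10 chars wide, truncated), count (5 chars wide), relative frequency
    for word in sorted(fd):
        print('{:10.10} {:5} {:.4}'.format(word, fd[word], fd.freq(word)))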
Ejemplo n.º 38
0
# print allwords
for i in range(len(allwords)):
    try:
        y[i] = int((complexity[allwords[i]]))

    except:
        y[i] = 0
# print y
fdist = FreqDist(brown.words())
freqComplex = []
freqSimple = []
x = []
for i in range(len(allwords)):
    x.append([])
for i in range(len(allwords)):
    x[i].append(fdist.freq(allwords[i]))
    if (y[i] == 1):
        freqComplex.append(fdist.freq(allwords[i]))
    else:
        freqSimple.append(fdist.freq(allwords[i]))
    x[i].append(len(allwords[i]))
    x[i].append(synobj.synCount(allwords[i]))
    x[i].append(synobj.len_of_synonyms(allwords[i]))
"""NO OF VOWELS"""
complex_vowels = []
word_weights_complex = []
simple_vowels = []
word_weights_simple = []
"""all vowels 1,consonants 1.5,x&z 4,q 5"""
alpha_weights = {
    'a': 1,
Ejemplo n.º 39
0
stopwords.remove('i')  # robin talks about himself a lot, let's include that for the sake of comedy


robin = FreqDist(
    word.lower() for word in read_words('robin.txt') if word.lower() not in stopwords
)

# would be more interesting to use lots of other prosam texts as a base, but I've already procrastinated enough...
base = FreqDist(
    word.lower() for word in nltk.corpus.brown.words() if word.lower() not in stopwords
)


# I run it like `python freq_analyse.py | sort -n | tail -n 15`
for k,v in robin.items():
    print('%.10f' % abs(robin.freq(k) - base.freq(k)),k)  # difference in word frequency


# sample result:

# 0.0074393564 well
# 0.0078216064 know
# 0.0088326042 learned
# 0.0103504380 like
# 0.0104640871 feel
# 0.0104676596 education
# 0.0107427367 learning
# 0.0112185385 good
# 0.0121670187 really
# 0.0126582278 i’ve
# 0.0143654005 also
Ejemplo n.º 40
0
#
# First
#
# Here we will determine the relative frequencies of English characters in the text
# Then we will calculate the entropy of the distribution

# here we use the expression list(var_name) to turn our string into a list
# this basically separates each character for us to make it so that it works
# directly in the freqdist function
english_unigram_fdist = FreqDist(list(english_model_content))

english_unigram_entropy = 0.0

# now loop and get the entropy for english unigrams
for unigram in english_unigram_fdist.samples():
    english_unigram_entropy += english_unigram_fdist.freq(unigram) * math.log(english_unigram_fdist.freq(unigram), 2)

english_unigram_entropy = -english_unigram_entropy

print "The English Unigram Entropy is: " + str(english_unigram_entropy)


#
# Second
#
# Here we will determine the relative frequencies of English bigrams in the text
# Then we will calculate the entropy of the bigram distribution

# create a list to store bigrams in
english_model_bigrams = []
Ejemplo n.º 41
0
def detect(request):
    # Data input
    if request.method == 'POST':
        identificacion=request.POST.get('dni')
        a=request.FILES['document']
        documento=str(a)
        datos_doc=documento.split('.')
        nombre_doc=datos_doc[0]
        tipo_doc=datos_doc[1]
        if tipo_doc=='txt':
            name=request.FILES['document'].read().lower()
            print(datos_doc)
            #mul=set(stopwords.words("spanish"))
            mul=codecs.open('mul.txt', "r", encoding='UTF-8').read()
            remove('muletillas.txt')
            discurso=(name.decode('UTF-8'))
            # Separate filler words from common words
            text_completo = wordpunct_tokenize(discurso)
            m = []
            m = [w for w in text_completo if w in mul]
            
            muletillas= codecs.open('muletillas.txt', "a")
            for i in m:
                muletillas.write(i)
                muletillas.write(" ")
                
            muletillas.close()
            # Count filler words
            tokenizador=RegexpTokenizer('\w+|[^\w\s]+')

            corpus = PlaintextCorpusReader(".", 'muletillas.txt',word_tokenizer=tokenizador, encoding='Latin-1')
            
            frecuencia=FreqDist(corpus.words())
            salida=codecs.open("muletillasR.txt","w",encoding="utf-8")
            palabras=[]
            repeticiones=[]
            # Write the extracted data to a txt file for later presentation
            for mc in frecuencia.most_common(): 
                palabra=mc[0]
                frecuencia_absoluta=mc[1]
                frecuencia_relativa=frecuencia.freq(palabra)
                cadena=str(frecuencia_absoluta)+"\t"+str(frecuencia_relativa)+"\t"+palabra  
                
                palabras.append(palabra.upper()) 
                repeticiones.append(frecuencia_absoluta)  
                salida.write(cadena+"\n")
            try:
                collection.insert_one({
                    'identificacion':identificacion,
                    'documento': documento,
                    'discurso':discurso,
                    'muletillas':palabras
                })
            except Exception as e:
                print("Error : ", type(e), e)
            # Send the data to the front end
            context={
                'documento': nombre_doc,
                'muletillas':palabras[0:10],
                'repeticiones': repeticiones[0:10]
            }
            return render(request, 'responde.html', context)
        else :
            messages.warning(request, "Verifique el tipo de archivo", extra_tags='file')
            return render(request, 'home.html')
    return render(request, 'home.html')





# class LineChartJSONView(BaseLineChartView):
#     def get_labels():
#         """Return 7 labels for the x-axis."""
#         return ["January", "February", "March", "April", "May", "June","July", "August", "September", "October"]

#     def get_providers(self):
#         """Return names of datasets."""
#         return ["Repeticiones"]

#     def get_data(self):
#         """Return 3 datasets to plot."""

#         return [[75, 44, 92, 11, 44, 95, 35, 11, 44, 95, 35]]


# line_chart = TemplateView.as_view(template_name='responde.html')
# line_chart_json = LineChartJSONView.as_view()
class Solution1:
    """
    Class that implements Direct Machine Translation, as required by the Problem 1
    of the assignment
    """
    def __init__(self, dictionary_file, training_file):
        """
        Initialize the class instance
        :param dictionary_file: The JSON file of closed dictionary
        :param training_file: Training target language file (English file)
        """
        # Read dictionary file
        self.dictionary = self.read_json_file(dictionary_file)
        # Read training file
        training_data = self.read_text_file(training_file)

        # Declare language model attributes
        self.unigram_words = None
        self.bigram_words = None
        self.unigram_pos_words = None
        self.bigram_pos_words = None
        self.unigram_pos = None
        self.bigram_pos = None

        # Prepare the language model
        self.train(training_data)

    @staticmethod
    def read_text_file(filename):
        """
        Read the text file
        :param filename: Filename of the text file
        :return: list of lines of the text file
        """
        try:
            file = open(filename, 'r')
        except IOError as e:
            print('Cannot read file ' + filename + '. Please check the path',
                  file=sys.stderr)
            print('I/O error({0}): {1}'.format(e.errno, e.strerror),
                  file=sys.stderr)
            sys.exit(1)
        output = []

        for line in file:
            line = line.strip().lower()
            output.append(line)
        return output

    @staticmethod
    def read_json_file(filename):
        """
        Read a json file
        :param filename: filename of the json file
        :return: dictionary object of json
        """
        try:
            file = open(filename, 'r')
        except IOError as e:
            print('Cannot read file ' + filename + '. Please check the path',
                  file=sys.stderr)
            print('I/O error({0}): {1}'.format(e.errno, e.strerror),
                  file=sys.stderr)
            sys.exit(1)
        return json.load(file)

    """
    # Google translation, not used
    @staticmethod
    def prepare_dictionary(lines, srclang, targetlang):
        words = []
        for line in lines:
            line = line.strip().lower()
            words = words + word_tokenize(line)
        words = map(lambda word: word.lower(), words)
        words = set(words)
        
        output = dict()
        translate_client = translate.Client()
        for word in words:
            output[word] = translate_client.translate(word, targetlang, source_language=srclang)
            output[word] = output[word]['translatedText']
        return output
    """

    @staticmethod
    def words_to_sentence(words):
        return ''.join([
            word if word in string.punctuation else ' ' + word
            for word in words
        ]).strip()

    @staticmethod
    def fix_determiners(words):
        """
        Fix "A", "An", "The" determiners
        :param words: input words
        :return: fixed words
        """

        words_pos = pos_tag(words)
        # Indexes of words to remove
        indices_to_remove = []
        length = len(words_pos)
        for index, word in enumerate(words_pos):
            if word[1] == 'DT':
                # Determiner before pronouns
                if words_pos[index +
                             1][1] == 'PRP' or words_pos[index +
                                                         1][1] == 'PRP$':
                    indices_to_remove.append(index)
                # Replace "A" with "An"
                elif word[0] == 'a' and words_pos[index + 1][0].startswith(
                    ('a', 'e', 'i', 'o', 'u')):
                    words_pos[index] = ('an', words_pos[index][1])
            if index == length - 2:
                break

        # Remove words
        if len(indices_to_remove) > 0:
            for index in indices_to_remove:
                words_pos.pop(index)

        return [word[0] for word in words_pos]

    @staticmethod
    def remove_consecutive_prp(words):
        """
        Remove consecutive pronouns
        :param words: input words
        :return: fixed words
        """
        words_pos = pos_tag(words)
        indices_to_remove = []
        length = len(words_pos)
        for index, word in enumerate(words_pos):
            # Identify consecutive pronouns
            if word[1] in ('PRP',
                           'PRP$') and words_pos[index + 1][1] in ('PRP',
                                                                   'PRP$'):
                indices_to_remove.append(index)
            if index == length - 2:
                break

        # Remove words
        if len(indices_to_remove) > 0:
            for index in indices_to_remove:
                words_pos.pop(index)

        return [word[0] for word in words_pos]

    @staticmethod
    def swap_verb_prp(words):
        """
        Swap reverse ordered verb and noun/pronoun
        :param words: input words
        :return: fixed words
        """
        words_pos = pos_tag(words)
        length = len(words_pos)
        for index, word in enumerate(words_pos):
            # Identify consecutive pronouns
            if word[1] in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ') \
                    and words_pos[index + 1][1] in ('PRP', 'PRP$', 'NN', 'NNS', 'NNP', 'NNPS'):
                words_pos[index] = words_pos[index + 1]
                words_pos[index + 1] = word
            if index == length - 2:
                break

        return [word[0] for word in words_pos]

    @staticmethod
    def print_translation(translations, output_file):
        """
        Print and write the translation to the output file
        :param translations: Translations output list
        :param output_file: Output file instance
        """
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print(
            '------------------------------------------------------------------------------------------------------',
            file=output_file)
        for translation in translations:
            print('\033[1m%s:\033[0m\n%s\n' % translation)
            print('%s:\n%s\n' % translation, file=output_file)
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print(
            '------------------------------------------------------------------------------------------------------',
            file=output_file)

    def train(self, lines):
        """
        Training unigram, bigram, unigram with pos and bigram with pos models
        :param lines: Training lines
        """
        unigram_words = []
        bigram_words = []
        unigram_pos_words = []
        bigram_pos_words = []
        unigram_pos = []
        bigram_pos = []

        for line in lines:
            # Prepare word tokens
            words = word_tokenize(line)
            # Tag the tokens with POS
            words_pos = pos_tag(words)
            # Generate POS sequences
            pos = [word[1] for word in words_pos]

            # Prepare unigram lists with beginning and end of sentences
            unigram_words = unigram_words + ['<s>'] + words + ['</s>']
            unigram_pos_words = unigram_pos_words + words_pos
            unigram_pos = unigram_pos + ['<s>'] + pos + ['</s>']

            # Prepare bigram lists for words, words_pos and pos
            bigram_words = bigram_words + list(
                ngrams(words,
                       2,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))
            bigram_pos_words = bigram_pos_words + list(
                ngrams(words_pos,
                       2,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))
            # get_bigram_pos_probability() works on plain POS tag sequences,
            # so build bigram_pos from pos rather than from (word, tag) pairs
            bigram_pos = bigram_pos + list(
                ngrams(pos,
                       2,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))

        # Generate frequency distribution of all lists
        self.unigram_words = FreqDist(unigram_words)
        self.bigram_words = FreqDist(bigram_words)
        self.unigram_pos_words = FreqDist(unigram_pos_words)
        self.bigram_pos_words = FreqDist(bigram_pos_words)
        self.unigram_pos = FreqDist(unigram_pos)
        self.bigram_pos = FreqDist(bigram_pos)

    def get_bigram_words_probability(self, words):
        """
        Calculates and returns the bigram probability of the given arrangement of words
        :param words: Words list
        :return: Probability
        """

        probability = 0
        # Get vocabulary size
        vocabulary_size = len(self.unigram_words)
        # Generate bigrams
        bigrams = list(
            ngrams(words,
                   2,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>'))
        # Calculate log probability with add-one smoothing
        for bigram in bigrams:
            probability += math.log(self.bigram_words.freq(bigram) +
                                    1) - math.log(
                                        self.unigram_words.freq(bigram[1]) +
                                        vocabulary_size)

        return probability

    def get_bigram_pos_words_probability(self, words):
        """
        Calculates and returns bigram probability of the given arrangement of words with POS
        :param words: List of words (POS tagged inside the method)
        :return: Probability
        """
        # POS tag input words
        words = pos_tag(words)
        probability = 0
        # Get vocabulary size
        vocabulary_size = len(self.unigram_pos_words)
        # Generate bigrams
        bigrams = list(
            ngrams(words,
                   2,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol='<s>',
                   right_pad_symbol='</s>'))
        # Calculate log probability with add-one smoothing
        for bigram in bigrams:
            probability += math.log(
                self.bigram_pos_words.freq(bigram) + 1) - math.log(
                    self.unigram_pos_words.freq(bigram[1]) + vocabulary_size)

        return probability

    def get_bigram_pos_probability(self, tags):
        """
        Calculates and returns the bigram probability of a given arrangement of POS tags
        :param tags: Arrangement of POS tags
        :return: Probability
        """
        probability = 0
        # Get vocabulary size
        vocabulary_size = len(self.unigram_pos)
        # Generate bigrams
        bigrams = list(ngrams(tags, 2))
        # Calculate log probability with add-one smoothing
        for bigram in bigrams:
            probability += math.log(self.bigram_pos.freq(bigram) +
                                    1) - math.log(
                                        self.unigram_pos.freq(bigram[1]) +
                                        vocabulary_size)

        return probability

    def get_highest_probability_permutation(self, words, method):
        """
        Implementation of argmax. Returns highest probability entry from the list
        words.
        :param words: List of list of words
        :param method: Method to calculate probability
        :return: Highest probability words arrangement
        """

        max_probability = -math.inf
        selected = None
        # Get permutation counts. If the sentence is big, limit to 100
        permutation_count = math.factorial(
            len(words)) if len(words) < 5 else 100
        for _ in range(permutation_count):
            # Generate random permutation
            permutation = numpy.random.permutation(words)
            # Get probability of the permutation
            probability = getattr(self, method)(permutation)
            # Select the permutation with higher probability
            if probability > max_probability:
                max_probability = probability
                selected = permutation

        return selected

    def get_arrangement_with_pos_model(self, words):
        """
        Returns arrangement of words with highest probability using POS ordering
        :param words: Input list of words
        :return: Arrangement of words with highest probability ordering
        """

        # Tag words with POS
        words_pos = [('', '<s>')] + pos_tag(words) + [('', '</s>')]
        length = len(words_pos)

        for index, word in enumerate(words_pos):
            # Pick 4 words window
            words_window = words_pos[index:index + 4]

            max_probability = -math.inf
            selected = None
            # Generate all permutations of the window
            permutations = itertools.permutations(words_window)
            for permutation in permutations:
                # Get all POS tags
                pos = [word[1] for word in permutation]
                # Get the probability
                probability = self.get_bigram_pos_probability(pos)
                # Pick the arrangement with the highest probability
                if probability > max_probability:
                    max_probability = probability
                    selected = permutation

            # Apply the arrangement to the original list of words
            words_pos[index] = selected[0]
            words_pos[index + 1] = selected[1]
            words_pos[index + 2] = selected[2]
            words_pos[index + 3] = selected[3]

            if index == length - 4:
                break

        # Return the list of rearranged words
        return [word[0] for word in words_pos]

    def translate(self, source_sentence, original_translation, output_file):
        """
        Perform translation of the given line
        :param source_sentence: Line of input file
        :param original_translation: Original translation from the test data
        :param output_file: File to write output to
        """

        # Get word tokens
        words = word_tokenize(source_sentence)
        translated_words = []
        # Perform direct machine translation using dictionary
        for i, word in enumerate(words):
            # Skip translating punctuations
            if word not in string.punctuation:
                translated_words.append(self.dictionary[word])
            else:
                translated_words.append(word)

        output = list()
        output.append(('Source Sentence', source_sentence))
        output.append(('Original Translation', original_translation))

        # Normal translation output
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Direct Machine Translation', translated_sentence))

        # Improvement 1: Fixing determiners
        translated_words = self.fix_determiners(translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Fixing determiners', translated_sentence))

        # Improvement 2: Removing consecutive pronouns
        translated_words = self.remove_consecutive_prp(translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Removing consecutive pronouns', translated_sentence))

        # Improvement 3: Swapping reverse orders verbs and noun/pronouns
        translated_words = self.swap_verb_prp(translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Swapping reverse orders verbs and noun/pronouns',
                       translated_sentence))

        # Improvement 4: Bigram Language Model
        translated_words = self.get_highest_probability_permutation(
            translated_words, 'get_bigram_words_probability')
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Bigram Language Model', translated_sentence))

        # Improvement 5: Bigram POS Language Model
        translated_words = self.get_highest_probability_permutation(
            translated_words, 'get_bigram_pos_words_probability')
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Bigram POS Language Model', translated_sentence))

        # Improvement 6: Rearrangement of POS
        translated_words = self.get_arrangement_with_pos_model(
            translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Rearrangement of POS', translated_sentence))

        self.print_translation(output, output_file)

    def execute(self, input_file, translation_file, output_file):
        """
        Execute the tests on given input file
        :param input_file: Input file
        :param translation_file: File containing original translations
        :param output_file: File to write output to
        """
        input_lines = self.read_text_file(input_file)
        # Open output file for writing
        try:
            output_file = open(output_file, 'w')
        except IOError as e:
            print('Cannot open file' + output_file + ' for writing',
                  file=sys.stderr)
            print('I/O error({0}): {1}'.format(e.errno, e.strerror),
                  file=sys.stderr)
            sys.exit(1)

        original_translation_lines = self.read_text_file(translation_file)
        for index, line in enumerate(input_lines):
            # Translate each line
            self.translate(line, original_translation_lines[index],
                           output_file)
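# A hypothetical driver for the class above (the file names are placeholders, not from
# the original assignment; the real entry point is not shown in this snippet):
if __name__ == '__main__':
    solution = Solution1('dictionary.json', 'train_english.txt')
    solution.execute('test_source.txt', 'test_reference_english.txt', 'output.txt')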
Ejemplo n.º 43
0
# # The following code is deprecated
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd[word] += 1
        # fd.inc(word)  # deprecated. superseded by the line above

# Initialize two empty lists which will hold our ranks and frequencies
ranks = []
freqs = []

# Generate a (rank, frequency) point for each counted token
# and append it to the respective lists. Note that the iteration
# over fd is automatically sorted.
for rank, word in enumerate(fd):
    ranks.append(rank+1)
    freqs.append(fd.freq(word))
word

# Plot rank vs frequency on a log-log plot
plt.loglog(ranks, freqs)
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
plt.close()
###############################################################################

###############################################################################
############################## PREDICTING WORDS ###############################
###############################################################################
### PREDICTING WORDS
Ejemplo n.º 44
0
# For each column - except 'tokenized_text'
for col in (train_bigram_columns + features):
    # Convert all values to boolean
    train_X[col] = train_X[col] > 0
    # Add columns and labels in one dataframe
    temp_df = pd.DataFrame()
    temp_df[col] = train_X[col]
    temp_df["label"] = train_y

    # For attribute 'col' equal to 0
    # Get the respective lines
    df0 = temp_df[temp_df[col] == 0]
    # Count occurrences of each class (positive/negative/neutral)
    freqdist0 = FreqDist(df0["label"])
    # Get probability of each class
    probabilities0 = [freqdist0.freq(label) for label in freqdist0]
    # Calculate cross entropy of X=0
    Hc0 = -sum(prob * math.log(prob, 2) for prob in probabilities0)

    # For attribute 'col' equal to 1
    # Get the respective lines
    df1 = temp_df[temp_df[col] == 1]
    # Count occurrences of each class (positive/negative/neutral)
    freqdist1 = FreqDist(df1["label"])
    # Get probability of each class
    probabilities1 = [freqdist1.freq(label) for label in freqdist1]
    # Calculate cross entropy of X=1
    Hc1 = -sum(prob * math.log(prob, 2) for prob in probabilities1)

    # Calculate probabilities for each value of 'col' (0/1)
    freqdist = FreqDist(temp_df[col])
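    # The snippet is truncated here. A likely continuation, sketched only as an
    # assumption based on the comments above (not the project's actual code): weight
    # the two conditional entropies by P(col == 0) and P(col == 1) from `freqdist`
    # to get H(label | col), then subtract from the label entropy for information gain:
    #
    #     p0, p1 = freqdist.freq(False), freqdist.freq(True)
    #     conditional_entropy = p0 * Hc0 + p1 * Hc1
    #     label_freqdist = FreqDist(train_y)
    #     label_entropy = -sum(label_freqdist.freq(l) * math.log(label_freqdist.freq(l), 2)
    #                          for l in label_freqdist)
    #     information_gain = label_entropy - conditional_entropy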
Ejemplo n.º 45
0
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import FreqDist

fdist = FreqDist(samples) # build a frequency distribution from the data in samples
fdist.inc(sample) # increment the count for sample by 1 (deprecated; use fdist[sample] += 1)
fdist['データ'] # number of occurrences of the given sample
fdist.freq('データ') # relative frequency of the given sample
fdist.N() # total number of samples
fdist.keys() # samples sorted by frequency
for sample in fdist: # iterate over samples in order of frequency
    pass
fdist.max() # sample with the highest count
fdist.tabulate() # display the frequency distribution as a table
fdist.plot() # plot the frequency distribution
fdist.plot(cumulative=True) # plot cumulative frequencies
fdist1 < fdist2 # test whether samples in fdist1 occur less frequently than in fdist2


Ejemplo n.º 46
0
bigramsText1 = bigrams(
    text1)  # bigramsText1[0] is the tuple containing the first bigram

# Collocations are frequent bigrams from words that are not so common as unigrams.
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Computing the frequency distribution of word lengths. Returns a dictionary.
fdistWordLength = FreqDist([len(w) for w in text1])

fdistWordLength.keys()  # The different word lengths
fdistWordLength.values()  # The frequency of each word length
fdistWordLength.items()  # Shows both keys and values at the same time

fdist1['the']
fdist1.freq('the')  # Frequency of the word ‘the’
fdist1.max()

# String methods

s = "MatTias"

s.lower()

s.upper()

s.startswith("ma")

"T" in s

# Find all the words in Moby Dick that ends with -ableness. Sort then alphabetically.
Ejemplo n.º 47
0
y = np.zeros(len(allwords))
# print allwords
for i in range(len(allwords)):
    try:
        y[i] = int((complexity[allwords[i]]))

    except:
        y[i] = 0
# print y
fdist = FreqDist(brown.words())

x = []
for i in range(len(allwords)):
    x.append([])
for i in range(len(allwords)):
    x[i].append(fdist.freq(allwords[i]))
    x[i].append(len(allwords[i]))
    x[i].append(synobj.synCount(allwords[i]))
    x[i].append(ww.wdweight(allwords[i]))
    x[i].append(vc.vCount(allwords[i]))
    x[i].append(synobj.len_of_synonyms(allwords[i]))

classifier = RandomForestClassifier()
classify = classifier.fit((x[0:int(len(x) * 0.8)]), y[0:int(len(y) * .8)])
ypred = classifier.predict(XTest)
# print y[0:int (len(y)*.5)]
a = []
b = []

for i in range(len(ypred)):
    if ypred[i] == 1: