# Imports assumed by this snippet: FreqDist from nltk, ceil from math,
# numpy for the arrays/sampling, and the binomial distribution from scipy.
from math import ceil
from nltk import FreqDist
from numpy import arange, array
from numpy.random import choice, random_integers
from scipy.stats import binom

def BootstrapFD(samp):
    fd = FreqDist(samp)
    f1 = float(fd.Nr(1))
    f2 = float(fd.Nr(2))
    N = float(fd.N())
    B = fd.B()
    # Undetected species & Coverage
    if f2 > 0.0:
        f0 = ceil(((N - 1.0) / N) * (f1 ** 2.0) / (2.0 * f2))
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0 * f2)
    else:
        f0 = ceil(((N - 1.0) / N) * f1 * (f1 - 1.0) / 2.0)
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0)
    # Correct abundances
    probs = array(list(fd.values())) / N
    lambdah = (1 - C) / sum(probs * (1 - probs) ** N)
    probs = probs * (1 - lambdah * (1 - probs) ** N)
    # P for unseen
    # paux = (1-C)/f0
    yield fd.values()
    popO = arange(B)
    dist = binom(n=N, p=1 - C)
    probsA = probs / sum(probs)
    while True:
        ns2 = dist.rvs()
        ns1 = int(N) - ns2
        if ns1 > 0:
            samp1 = list(choice(popO, size=ns1, replace=True, p=probsA))
        else:
            samp1 = []
        if ns2 > 0:
            samp2 = list(random_integers(B, B + int(f0) - 1, ns2))
        else:
            samp2 = []
        yield FreqDist(samp1 + samp2).values()
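
# A minimal usage sketch (toy tokens, purely illustrative; assumes the imports
# above). The first value yielded is the observed abundance list; subsequent
# values are bootstrap resamples that also allocate mass to unseen species.
toy_tokens = "a b a c a b d d e".split()
boot = BootstrapFD(toy_tokens)
observed = next(boot)
replicates = [next(boot) for _ in range(100)]
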
def lesk_ratio(t, s, G=nx.Graph()):
    # Lesk computed over G for the simplest possible application; testing whether the simplest approach works.
    t_def = FreqDist(G.nodes(data='ext_gloss')[t].split())
    s_def = FreqDist(G.nodes(data='ext_gloss')[s].split())
    intersection = (t_def) & (s_def)
    value = 0
    for i in intersection:
        value += t_def[i] * s_def[i]
    total = sum(t_def.values()) + sum(s_def.values())
    return value/total
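
# A minimal usage sketch on a hypothetical two-node graph; the node attribute
# name 'ext_gloss' comes from the function above (assumes networkx as nx and
# nltk's FreqDist are imported, as in the snippet).
G_demo = nx.Graph()
G_demo.add_node('dog.n.01', ext_gloss='a domesticated carnivorous mammal')
G_demo.add_node('cat.n.01', ext_gloss='a small domesticated carnivorous mammal')
print(lesk_ratio('dog.n.01', 'cat.n.01', G_demo))  # ~0.44: shared gloss word count over total gloss length
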
Example #3
def stat_freq(text):
    words = common.tokenizing(text)
    freq_dist = FreqDist(words)
    # build (word, count) pairs and convert them to a numpy array
    freq_list = [[word, count] for word, count in freq_dist.items()]
    freq_arr = np.array(freq_list)
    return freq_arr
Example #4
 def extract_most_common_words(self, words, sentiment):
     word_freq = FreqDist(words)
     print("for the sentiment", sentiment)
     print("there are", len(word_freq.keys()), "different words")
     print("that were used", sum(word_freq.values()), "times")
     df = pd.DataFrame({
         f'{sentiment}_words': list(word_freq.keys()),
         f'{sentiment}_counts': list(word_freq.values())
     })
     df = df.nlargest(self.n_words, columns=f'{sentiment}_counts')
     df.reset_index(drop=True, inplace=True)
     return df, len(word_freq.keys()), sum(word_freq.values())
Example #5
def append_terms(doc, terms, data, minterm_doc,vector):

	tf = FreqDist(terms)
	max_tf = max(tf.values() if len(tf) > 0 else [0])

	for term in tf.keys():
		normalize_tf = tf[term] / max_tf
		new_doc = {'tf': normalize_tf,
				   'weight': 0,
				   'minterm':minterm_doc}
		in_data = False
		for term_data in data:
			if term == term_data['key']:

				#update
				term_data['value']['documents'][doc] = new_doc
				in_data = True
				break

		if not in_data:
			# add
			data.append({'key': term,
						 'value': {'idf':0,
									'documents': {doc:new_doc},
									'index_in_vector': vector[term]}})
Example #6
def freq_words(x, terms = 30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()

    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})
    return words_df
    def getLongTermsRanked(self,
                           minLen=7.0,
                           numberMostCommons=30,
                           display=False):
        result = []
        resultDocuments = {}
        for seoDocument in self.seoLibrary.seoDocuments:
            tokenList = list(
                set(
                    seoDocument.getTextTokens(removeSplitter=True,
                                              lemmatize=True)))
            for token in tokenList:
                if len(token) > minLen:
                    result.append(token)
                    if token not in resultDocuments:
                        resultDocuments[token] = [seoDocument.order]
                    else:
                        resultDocuments[token].append(seoDocument.order)

        fdist = FreqDist(result)

        for token in fdist.keys():
            fdist[token] = fdist[token] * self.getRankingModifier(
                numpy.mean(resultDocuments[token])) * self.getLengthModifier(
                    len(token), minLen)

        maxValue = max(fdist.values())

        return [(word, int(metric * 100.00 / maxValue))
                for word, metric in fdist.most_common(numberMostCommons)]
class BrownDataset(object):
    def __init__(self, include_start=True):
        self.words = [w.lower() for w in brown.words()]
        self.total_word_cnt = len(
            self.words) + 2 * len(brown.sents())  # include START and END
        if include_start:
            self.words.append(u'START')
        self.words.append(u'END')
        self.vocab = set(self.words)

        self.vocab_len = len(self.vocab)
        self.word_to_idx = dict(zip(list(self.vocab), range(self.vocab_len)))

        self.sentences = []
        self.bigrams = []
        self.unigrams = []
        for sent in brown.sents():
            sentence = [w.lower() for w in sent]
            if include_start:
                sentence.insert(0, u'START')
            sentence.append(u'END')
            self.sentences.append(sentence)
            self.bigrams.extend(list(ngrams(sentence, 2)))
            self.unigrams.extend(sentence)

        self.unigram_freq = dict(Counter(self.unigrams))

        self.num_sentences = len(self.sentences)
        self.bigram_cnt = FreqDist(self.bigrams)
        self.bigram_len = len(self.bigram_cnt)
        self.bigram_idx = dict(
            zip(self.bigram_cnt.keys(), range(self.bigram_len)))
        self.bigram_freq = np.asarray(list(self.bigram_cnt.values()))
        self.num_bigrams = len(self.bigram_cnt)
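
# Hypothetical usage (assumes the imports the class relies on: brown, ngrams,
# Counter and numpy as np, plus a downloaded nltk 'brown' corpus).
dataset = BrownDataset(include_start=True)
print(dataset.vocab_len, dataset.num_sentences, dataset.num_bigrams)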
Example #9
def frequent_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    freq_dist = FreqDist(all_words)
    x = transformer.transform(
        word.replace("_", " ") for word in freq_dist.keys())
    words_df = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values()),
        'vector': list(x)
    })
    good = []
    bad = []
    for i in range(1, len(words_df)):
        if (nb.predict(words_df.at[i, 'vector']) == 5):
            good.append([
                words_df.at[i, 'count'], words_df.at[i,
                                                     'word'].replace(" ", "_")
            ])
        else:
            bad.append([
                words_df.at[i, 'count'], words_df.at[i,
                                                     'word'].replace(" ", "_")
            ])
    good = sorted(good, key=lambda x: x[0], reverse=True)
    bad = sorted(bad, key=lambda x: x[0], reverse=True)
    return format_result(good, bad, terms)
Example #10
def texts_features(paras_list,
                   sents_list,
                   words_list,
                   len_w_big=7,
                   print_results=False):

    num_paras = len(paras_list)
    num_sents = len(sents_list)
    sperp = num_sents / num_paras
    tokens = FreqDist(words_list)
    words_count = sum(tokens.values())
    vocab = len(tokens)
    lexdiv = words_count / vocab
    words_big = [word for word in words_list if len(word) > len_w_big]
    w_big_num = len(words_big)

    # flash_k_ind = 0.4*(0.78*words_count/num_sents + 100*w_big_num/words_count)
    flash_k_ind = 0.4 * (words_count / num_sents +
                         100 * w_big_num / words_count)

    if print_results:
        print(
            ("The text consists of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "{} words in total, with a vocabulary of {} unique words\n"
             "Lexical diversity - {:0.3f}\n"
             "Fog (readability) index - {:0.3f}\n").format(num_paras, num_sents,
                                                           sperp, words_count, vocab,
                                                           lexdiv, flash_k_ind))

    statsdict = {'num_paras': num_paras, 'num_sents': num_sents,
                 'sperp': sperp, 'words_count': words_count, 'vocab': vocab,
                 'words{}+'.format(len_w_big): w_big_num,
                 'lexdiv': lexdiv, 'flash_k_ind': flash_k_ind}

    return statsdict
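
# Hypothetical usage with an nltk plaintext corpus reader, which exposes the
# paras/sents/words views the function expects (requires the downloaded
# 'gutenberg' corpus).
from nltk.corpus import gutenberg
stats = texts_features(gutenberg.paras('austen-emma.txt'),
                       gutenberg.sents('austen-emma.txt'),
                       gutenberg.words('austen-emma.txt'),
                       print_results=True)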
def get_common_values(cloud, feature, position=None, possible_values=None):
    """Return the most common values of the feature in the cloud."""
    if feature in all_features:
        # Get all the values of the specified feature at the specified position
        # in the cloud provided with the restrictions provided.
        value_list = get_all_values(cloud, feature, position, possible_values)
        f_type = feature_type(feature)
        # Get common values for categorical features.
        if f_type == 'categorical':
            # Find all values that are tied for the most frequent and return
            # them.
            value_counts = FreqDist(value_list)
            top_values = {v for v in value_counts
                          if value_counts[v] == max(value_counts.values())}
            return top_values
        # Get common values for continuous features.
        elif f_type == 'continuous':
            # Return the average of the feature values.
            if len(value_list) > 0:
                return {mean(value_list)}
            else:
                return set()
    else:
        raise FeatureNotFoundError(feature)
Example #12
def plot_postives(s=0, e=50):
    all_words = ' '.join([text for text in df['review']])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })

    # rank words by frequency (n is set to the number of reviews)
    d = words_df.nlargest(columns="count", n=len(df['review']))
    d.reset_index(inplace=True)

    d['pos_perc'] = np.nan
    for tag in d['word'].values:
        ret = df[df['review'].str.contains(tag)]
        pos_perc = ret[ret['prediction'] ==
                       'pos'].shape[0] / ret.shape[0] * 100
        neg_perc = 100 - pos_perc
        d.loc[(d['word'] == tag), 'pos_perc'] = pos_perc
    d = d.sort_values('pos_perc', ascending=False)
    plt.figure(figsize=(20, 5))
    sns.barplot(data=d[s:e], x='word', y='pos_perc')
    if (e - s > 60):
        plt.xticks(rotation=90)
    else:
        plt.xticks(rotation=45)
    plt.xticks()
    plt.title('Percentage of Positive Reviews per tag.')
    plt.show()
Example #13
def analysis(dataset, topic_list):
    '''
        start with some data analysis on Review Text and Review Title
        applying the bag of words approach first
    '''
    # remove stopwords, punctuation and symbols
    dataset['Review Text'] = dataset['Review Text'].str.replace(
        "[^a-zA-Z#]", " ")
    # remove short words (length < 3)
    dataset['Review Text'] = dataset['Review Text'].apply(
        lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 2]))
    all_reviews = [
        remove_stop_words(words.split(" ")) for words in dataset['Review Text']
    ]
    lemmatizer = WordNetLemmatizer()
    all_words = ' '.join([lemmatizer.lemmatize(word)
                          for word in all_reviews]).split()
    '''
        Plotting the top 30 words of highest frequency 
    '''
    freq_dist = FreqDist(all_words)
    words_distribution = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values())
    })
    top_words_distribution = words_distribution.nlargest(
        columns='count', n=30)  # want to view top 30 words

    #plot the output
    plt.figure(figsize=(50, 10))
    ax = sns.barplot(data=top_words_distribution, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()

    return top_words_distribution, dataset
def tf_measure(word_tokens, query_tokens, N):
    tfscore = 0.0

    freq = FreqDist(word_tokens)
    # highest raw term frequency in the document (only used by the
    # commented-out normalised variant below)
    wf = max(freq.values()) if len(freq) > 0 else 0.0

    for token in query_tokens:
        tf = freq[token]  # FreqDist returns 0 for unseen tokens
        if tf > 0:
            tf = 1.0 + log(tf)
            #tfscore = 0.5 + (0.5 * (0.0 + tf))/(0.0 + wf)
        else:
            tf = 0.0

        tfscore += tf

    # IDF measures

    #print tfscore
    return tfscore
Example #15
 def __extract_most_common_words_by_class(self, list_of_words, class_value):
     word_freq = FreqDist(list_of_words)
     df = pd.DataFrame({
         f'{class_value}_words': list(word_freq.keys()),
         f'{class_value}_counts': list(word_freq.values())
     })
     df = df.nlargest(self.n_words, columns=f'{class_value}_counts')
     df.reset_index(drop=True, inplace=True)
     return df
Example #16
def get_probs(filename):
    """read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)
    count_sum = sum(v for v in probs.values())
    for k,v in probs.items():
        probs[k] = v * 1.0 / count_sum
    return probs
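
# Hypothetical usage: 'sample.txt' is a placeholder path. The returned FreqDist
# now holds per-character probabilities, which sum to (approximately) 1.
char_probs = get_probs('sample.txt')
print(sum(char_probs.values()))  # ~1.0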
Example #18
def word_frequency_vs_rank(text, as_probability=False):
    frequency_distribution = FreqDist(text)
    frequency = np.asarray(
        sorted(frequency_distribution.values(), reverse=True))
    if as_probability:
        frequency = frequency / len(text)

    rank = np.arange(1, len(frequency) + 1)

    return rank, frequency
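
# Hypothetical usage: plot the rank-frequency curve on log-log axes to eyeball
# Zipf's law (assumes matplotlib and nltk's 'brown' corpus are available).
import matplotlib.pyplot as plt
from nltk.corpus import brown
rank, frequency = word_frequency_vs_rank([w.lower() for w in brown.words()])
plt.loglog(rank, frequency)
plt.xlabel('rank')
plt.ylabel('frequency')
plt.show()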
Example #19
def freq_words(x,terms=200):
    all_words=x.split()
    fdist=FreqDist(all_words)
    df=pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

    d=df.nlargest(columns='count', n=terms)
    plt.figure(figsize=(15,5))
    ax=sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel="count")
    plt.show()
    return d
Example #20
def word_frequence(al, rank):
    lst = LancasterStemmer()
    left = [
        lst.stem(word.lower()) for word in word_tokenize(al)
        if word.lower() not in stopwords.words('english') and len(word) > 2
    ]
    final = FreqDist(left)
    sort = sorted(list(set(final.values())))
    sort = [i for i in sort[::-1]]
    for i in sort[:rank]:  # list the top-ranked counts and their words
        print([v for v, k in final.items() if k == i], i)
Example #21
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})

    # selecting the top `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()
Example #22
def zipf(tokens):
	from nltk import FreqDist            # for frequency distribution
	import numpy as np                   # used below for the array and median operations
	p = FreqDist(tokens)                 # frequency distribution of the tokens passed in
	freq = list(p.values())

	freq.sort(reverse=True)              # Sort freq reverse to get ranked values

	f = np.array(freq)
	r = np.arange(1, len(f)+1)

	k = np.median(f*r)
	return k
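
# Hypothetical usage: k approximates the roughly constant frequency * rank
# product predicted by Zipf's law (requires nltk's 'brown' corpus).
from nltk.corpus import brown
print(zipf([w.lower() for w in brown.words()]))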
Example #23
 def generar_grafico2(self, lista_datos):
     import nltk
     from nltk import FreqDist
     lista_unica = ""
     for respuesta_encuesta in lista_datos:
         for respuesta_pregunta in respuesta_encuesta:
             for palabra in respuesta_pregunta:
                 lista_unica += palabra + " "
     tokens = nltk.word_tokenize(lista_unica)
     fdist = FreqDist(tokens)
     print(fdist.keys())
     print(fdist.values())
     fdist.plot(30, cumulative=False)
Example #24
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()

    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })

    # selecting the top `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    return (d)
Example #25
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })

    d = words_df.nlargest(columns="count", n=terms)
    plt.figure()
    ax = sns.barplot(data=d, x="count", y="word")
    ax.set(ylabel='word')
Example #26
def freq(review, nlarge=30):
    all_sentence = [sent for sent in review]
    all_sentence = " ".join(all_sentence)
    all_words = all_sentence.split()
    word_freq = FreqDist(all_words)
    df_freqdist = pd.DataFrame({
        "word": list(word_freq.keys()),
        "count": list(word_freq.values())
    })
    d = df_freqdist.nlargest(columns="count", n=nlarge)
    plt.figure(figsize=(20, 10))
    # plt.bar(d["word"],d["count"])
    sns.barplot(x=d["word"], y=d["count"])
    plt.show()
Example #27
    def run(self):

        website = [
            "new+york+times", "bbc+news", "cnn", "daily+mail", "al+jazeera"
        ]
        check = [
            'new york times', "bbc news", "cnn", "daily mail", "aljazeera"
        ]
        my_stopwords = [
            'first', 'could', 'says', 'year', 'years', 'may', 'us', 'set',
            'time', 'new', 'trumps', 'one', 'say', 'times', 'city', 'day',
            'top', 'making', 'make', 'bbc', 'cnn', "two", "news", 'like',
            'wont', 'get', 'run', 'still', 'good', 'dont', 'take', 'days',
            'im', 'gets', 'want', 'go', 'finds', 'goes', 'gets'
        ]
        portion = [7, 11, 11, 4, 4]
        self.month_titles = []
        for site_num in range(5):
            temp_thread = []
            for page in range(portion[site_num]):
                sleep(0.5)
                temp_thread.append(
                    threading.Thread(target=self.extract,
                                     args=(page, website[site_num],
                                           check[site_num])))
                temp_thread[-1].start()
        for i in temp_thread:
            i.join()
        print(len(self.month_titles))
        text = ' '.join([i[0] for i in self.month_titles])
        month_keyword = word_token(text, my_stopwords)
        with open(f"raw_text/{self.year}_{self.month}.csv",
                  'w',
                  encoding='utf-8') as w:
            csv.writer(w).writerow(month_keyword)
        month_keyword, times = word_frequence(month_keyword, 20)
        recommand0 = [
            j[0] + '@' + j[1] for i in month_keyword for j in self.month_titles
            if i.lower() in j[0].lower()
        ]
        recommand0 = FreqDist(recommand0)
        sort_recommand = sorted(list(set(recommand0.values())), reverse=True)
        recommand = [
            k.split('@') for k, v in recommand0.items()
            if v in sort_recommand[:2]
        ]
        if len(recommand) > 15:
            recommand = random.sample(recommand, 15)  # otherwise the alphabetical ordering would always drop the NYTimes entries
        final_data.append(
            (self.year, self.month, month_keyword, times, recommand))
Example #28
def normalized_top_50(page):
    """Returns the 50 most common words from the page with normalized scores."""
    words_stopped = preprocess(page)

    freqdist = FreqDist(words_stopped)
    top_50 = freqdist.most_common(50)

    total = sum(freqdist.values())

    normalized = []
    for word in top_50:
        normalized_frequency = word[1] / total
        normalized.append((word[0], "{:.4}".format(normalized_frequency)))
    return normalized
Example #29
def generate_word_counts_fig(x, terms=30):
    all_words = " ".join([text for text in x])
    all_words = all_words.split()

    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        "word": list(fdist.keys()),
        "count": list(fdist.values())
    })

    # selecting the top `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel="Count")
Example #30
def getFrequencyDistribution(words, num=20):

    fdist = FreqDist(words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })

    # selecting top 'num' most frequent words
    d = words_df.nlargest(columns="count", n=num)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()

    return words_df
    def freq_words(self, x):
        all_words = " ".join([text for text in x])
        all_words = all_words.split()
        print("\n all_words in corpus --- ", len(all_words), all_words[:12])
        fdist = FreqDist(all_words)
        # print("\n fdist --- ", fdist)

        words_df = pd.DataFrame({
            'word': list(fdist.keys()),
            'count': list(fdist.values())
        })
        print("\n words counts in corpus --- \n ", words_df.head())

        # selecting top 20 most frequent words
        d = words_df.nlargest(columns='count', n=20)
        print("\n d --- ", d)
Example #32
def word_frequence(left, rank):
    final = FreqDist(left)
    sort = sorted(list(set(final.values())), reverse=True)  # deduplicate the counts (e.g. many words occur just once) and sort descending
    count = 0
    keyword, times = [], []
    for i in sort[:rank]:  # for the top counts, look up the words with that count
        key = [k for k, v in final.items() if v == i]
        count += len(key)  # cap the total number of keywords
        if count > rank:
            break
        if i > 3:  # only keep the most popular keywords
            print(key, i)
            keyword += key
            for y in range(len(key)):
                times.append(i)
    return keyword, times  # lists of the top-`rank` keywords and their counts
Example #33
    def _pre_treate(self, records=None):
        MAX_SEQUENCE_LENGTH = self.MAX_SEQUENCE_LENGTH
        MAX_WORDS = self.MAX_WORDS
        MODEL_TYPE = self.MODEL_TYPE
        N_GRAM = self.N_GRAM
        CONFIG_PATH = self.CONFIG_DIR

        if os.path.exists(os.path.join(CONFIG_PATH,'classifier_config.json')):
            word2id, id2label, label2id, class_weight, parameter = json.load(
                open(os.path.join(CONFIG_PATH, 'classifier_config.json'), 'r', encoding='utf-8')
            )
            if parameter['model'] != MODEL_TYPE or parameter['max_length'] != MAX_SEQUENCE_LENGTH:
                raise Exception("classifier_config error: inconsistent model type or sequence length, "
                                "please delete config.json and rerun the program")
            class_weight = {int(index): weight for index, weight in class_weight.items()}
        else:
            try:
                x = list(map(lambda x: x['content'], records))
                y = list(map(lambda x: x['label'], records))

                # create word2id dict
                if MODEL_TYPE != 'FastText':
                    N_GRAM = 1
                vectorizer = CountVectorizer(token_pattern=r'[^\s]+', ngram_range=(1, N_GRAM), max_df=0.95, min_df=3, max_features=MAX_WORDS)
                vectorizer.fit(x)
                word2id = {word: index + 2 for word, index in vectorizer.vocabulary_.items()}  # offset by 2 (ids 0/1 typically reserved for padding/unknown)

                # create id2label and label2id dict
                Binarizer = LabelEncoder()
                Binarizer.fit(y)  # fit the label encoder on the labels, not the document contents
                id2label = {index: label for index, label in enumerate(Binarizer.classes_)}
                label2id = {label: index for index, label in enumerate(Binarizer.classes_)}

                # create class_weight dict
                label_freq = FreqDist(y)
                class_weight = {int(index): max(label_freq.values())/label_freq[label]
                                for index, label in enumerate(Binarizer.classes_)}

                # store model parameter
                parameter = {'model': MODEL_TYPE, 'max_length': MAX_SEQUENCE_LENGTH, 'ngram': N_GRAM}

                # dump config file
                json.dump([word2id, id2label, label2id, class_weight, parameter],
                          open(os.path.join(CONFIG_PATH, 'classifier_config.json'), 'w', encoding='utf-8'))
            except Exception as e:
                sys.exit('Error building classifier config: {}'.format(e))
        return word2id, id2label, label2id, class_weight, parameter
print "building Text format"
text = nltk.Text(tokens)
print "building freqdist."
fdist = FreqDist(text)
print "freqdist done."

# output result to csv file
print "opening csv file."

csvfile = file("/Users/Zhao/Documents/gone_with_the_wind.csv", "aw")
writer = csv.writer(csvfile)
print "writing csv file"

# no repeated item
arr = []
arr.append((1, fdist.keys()[1], fdist.values()[1], 1))
pre_value = fdist.values()[1]
cur_num = 2
for i in xrange(0, 24903):
    # print i+1, fdist.keys()[i], fdist.values()[i]
    if fdist.values()[i] != pre_value:
        if pattern.match(fdist.keys()[i]) != None:
            item = (i + 1, fdist.keys()[i], fdist.values()[i], cur_num)
            pre_value = fdist.values()[i]
            cur_num += 1
            arr.append(item)
    print i + 1, "done."
arr.append((24903, fdist.keys()[-1], fdist.values()[-1], cur_num))

print arr
            newtuple = (new_words[i], t[1]) #Each new tuple uses same POS tag (t[1])
            text.insert(position+i, newtuple)
    position+=1
    
#==============================================================================  

text = [(w,p) for w,p in text if re.match(r"[\'a-z]",w[0])]

nonlemwords = [w for w,p in text]
#==============================================================================
# Create non-lemmatized version to use if the lemmatized version doesn't have matches (because of differences in POS tagging)
#==============================================================================
bigrams = FreqDist(zip(nonlemwords[:-1],nonlemwords[1:]))
unigram = FreqDist(nonlemwords)

sbig = float(sum(bigrams.values()))
suni = float(sum(unigram.values()))

nonlemassoc = {}
for b0,b1 in bigrams:
    p1 = unigram[b0]/suni
    p2 = unigram[b1]/suni
    p12 = bigrams[b0,b1]/sbig
    nonlemassoc[b0,b1] = log(p12)-log(p1)-log(p2) 
#==============================================================================
# #Write SBC Bigram association scores to file
#==============================================================================
f=open("/Users/heathersimpson/Documents/Dissertation/Articles/Chp3_IUvsClauseBoundaries/BigramStrength/SBC-nonlembigrams.txt","w")
#Give it headers first
f.write("Word1\tWord2\tpwMI\n")
f.close()
Example #36
# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5 # Chat conversations
100*text5.count("call")/len(text5)
100*text5.count("whatever")/len(text5)

# Frequency distribution
from nltk import FreqDist

fdist1 = FreqDist(text1)

vocabulary = fdist1.keys()

frequencies = fdist1.values()

fdist1['whale'] 

# Define a function that computes lexical diversity
def lexical_diversity(text):
        return len(text)/len(set(text))

#Note that our new function can be used on any text, even your own:
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = ["This", "is", "my","text","and"]

myText2 = ["there","is","nothing","you","can","do","about","it","!"]
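
# For example (continuing the snippet above), the concatenated lists form a
# text that lexical_diversity can score:
myText = myText1 + myText2
lexical_diversity(myText)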
def buildcorpus(corpus, rootpath, filelimit = 0):
    
    #rootpath = corpus.rootpath
    fileids = os.listdir(rootpath)
    
    hugewordlist = []   
    hugewordlist.extend(corpus.words)   # will contain distinct Word instances

    numoffiles = 0
    
    corpus.set_corpusname(str(max(filelimit, len(fileids)))+"texts")
    
    for fileid in fileids:
    
        
        allwords = FreqDist()    # will contain all words in this text
        
        doc_id = fileid.split(".")[0]
        # corpus.inserttext(doc_id)    ##### ! should pass the Text object itself
        newtext = Text(doc_id)
        
        path = rootpath + os.sep + fileid
        #lines = readtextlines(path)
    
        #rawtext = texter.readtxtfile(path)
        rawtext = texter.readnewstext(path)
        lines = texter.splitToSentences(rawtext)
        
        sntindex = 0
        # each line is a sentence
        for line in lines:
            words = []   # words in this sentence
            words = line.split()
            words = texter.eliminatepunctuation(words)
            words = [word for word in words if not word.isspace()]
            
            
            
            for word in words:
                allwords.inc(word)
                
                
                newword = Word(word)
                newword.insertsentenceid(doc_id+"_"+str(sntindex))
                
                if allwords[word] <= 1:    # if this was not added to the hugelist before, add it
                    hugewordlist.append(newword)
                
                    
            sentence = Sentence(sntindex)
            sntindex = sntindex + 1
            
            # should we store Word objects or word indexes in the sentence?
            for word in words:
                index = hugewordlist.index(Word(word))
                hugewordlist[index].insertsentenceid(doc_id+"_"+str(sntindex-1))
                sentence.insertword(index)
                
            newtext.insertsentence(sentence)
            
        if (not rawtext.isspace()) or (len(allwords) != 0):   
            corpus.inserttext(newtext)    
            
            print str(numoffiles)," : finished handling the words-snts-txts ",doc_id 
    
                
            numofwords = reduce(lambda x,y : x+y, allwords.values())
            
            for word in hugewordlist:
                cnt =  allwords[word.literal]
                #freq = cnt / float(numofwords)
                word.assigntermfreq(cnt, numofwords, doc_id)
                #hugewordlist[index].toscreen()
        
        numoffiles = numoffiles + 1
        if filelimit == numoffiles:
            break       

        
    # end for - docs
    

    numofdocs = len(fileids)
    print "computing tf*idf"
    for word in hugewordlist:
        word.computeinvdocfreq(numofdocs)
        word.computeTFIDF()
        #word.toscreen()
        
    corpus.assignwords(hugewordlist)
    print "corpus length ",str(len(corpus.words))," words"
    print "huges length ",str(len(hugewordlist))," words"
    print "exiting buildcorpus()"
    
    print "pickle-dumping words"
    corpus.pickledumpwords()
Example #38
import matplotlib
import string

exclude = set(string.punctuation)

with open("YT_Comment_Output.txt", "rb") as f:
	lines = [line.rstrip() for line in f]
	splits = [line.split() for line in lines]
	some_upper = [item for sublist in splits for item in sublist]
	#replace BOM w known stopword
	BOM_gone = [word.replace('\xef\xbb\xbf', 'i') for word in some_upper]
	punct_gone = []
	for word in BOM_gone: 		
		punct_gone.append(''.join(ch for ch in word if ch not in exclude))
	YT_comment_words = [word.lower() for word in punct_gone]

with open('stopwords.txt', 'rb') as f:
    stopwords = [line.rstrip() for line in f]

print YT_comment_words[:10]
print stopwords[:10]

filtered_words = [w for w in YT_comment_words if not w in stopwords]

print filtered_words[:10]

fd = FreqDist(filtered_words)
print fd.values()[:10]
print fd
fd.plot(30)
Example #39
def get_frequency_distribution(words):
    fd = FreqDist(i.lower() for i in words)
    print(fd)
    sorted_fd = sorted(fd.values(), reverse=True)
    print(sorted_fd[0:10])
    return sorted_fd
Example #40
File: demo.py Project: t2y/learnnlp
# -*- coding: utf-8 -*-
from nltk import FreqDist
from nltk.corpus import reuters


yen = reuters.words(categories='yen')
fd1 = FreqDist(i.lower() for i in yen)
sfd1 = sorted(fd1.values(), reverse=True)

# ---

for i, v in enumerate(sfd1[0:100], 1): print('%d, %d, %d' % (i, v, i*v))

# ---

import pylab
pylab.plot(sfd1, color='red')

pylab.xscale('log')
pylab.yscale('log')
pylab.show()

# ---

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

yen_exclude_stops = [i for i in yen if i.lower() not in english_stopwords]
fd2 = FreqDist(i.lower() for i in yen_exclude_stops)
sfd2 = sorted(fd2.values(), reverse=True)
            words = (word for word in words if word not in nltk.corpus.stopwords.words('english'))

            #a = nltk.word_tokenize(word_list)
            b = nltk.pos_tag(word_list)
            c = nltk.ne_chunk(b,binary=True)
            tokencount = tokencount + len(word_list)
            fdist = FreqDist()
            for x in c.subtrees():
                if x.node == "NE":
                    words = [w[0] for w in x.leaves()]
                    name = " ".join(words)
                    #print name

                    fdist.inc(name)
                    bigfdist.inc(name)
                    nercount = nercount + 1
            a = [f, tokencount, nercount, fdist.keys(), fdist.values()]
            print a

            #mycsv = csv.writer(ofile)
            mycsv.writerow(a)

mycsv2 = csv.writer(namefile)
for word in bigfdist:
    thepair = word+ ',' + str(bigfdist[word])
    mycsv2.writerow(thepair)

mycsv.close()
mycsv2.close()
   
    in_str = sys.stdin.read(BUF_SIZE)
    rest = ''

    read_count = 0

    while (rest + in_str).strip() != '':
        read_count += 1

        if read_count % 100 == 0:
            sys.stderr.write('.')
            sys.stderr.flush()

        tokens = (rest + in_str).split()
        rest = tokens.pop()

        if not tokens:
            vocab.update(rest)
            break
        else:
            vocab.update(tokens)

        in_str = sys.stdin.read(BUF_SIZE)

    print

    for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
        if i > len(vocab.values()):
            break

        print "vocab size %7d - cutoff = %d" % (i, vocab.values()[i])
Example #43
class Model():
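    # NOTE: Python 2 code written against the NLTK 2.x API
    # (FreqDist.inc, nltk.util.ingrams, u''/ur'' string literals).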
    
    def __init__(self ):
        self.__letters = {'q', 'w', 'e', 'r', 't', 'y', 
                          'u', 'i', 'o', 'p', 'a', 's', 
                          'd', 'f', 'g', 'h', 'j', 'k', 
                          'l', 'z', 'x', 'c', 'v', 'b', 
                          'n', 'm', 
                          u'\xe0', u'\xe1', u'\xe8', u'\xe9', u'\xec', u'\xed', 
                          u'\xf2', u'\xf3', u'\xf9', u'\xfa'
                          }
            
        self.__encodings = ['utf8', 'iso-8859-1']
        self.__defaultTag = ''
        self.__defTagger = nltk.DefaultTagger(self.__defaultTag)
        self.__tags = {'NOUN', 'ADV', 'ADJ', 'PRON', 'DPREP', 'VERB', 'NUM', 'PREP', 'ART', 'CONJ', 'PRONVERB', 'PUNCT', 'SPECIAL'}
        self.__manualTags = {tag: set() for tag in self.__tags}
        
        self.__tmpPath =  os.path.dirname(os.path.abspath(__file__))+ '\\'
        self.parseSyntaxRules()
        
    def initFields(self):
        
        self.__allowedForeign = set(codecs.open(self.__tmpPath + 'allowedForeign.txt', encoding='utf-8').read().split())
        self.__ignoredCommon = set(codecs.open(self.__tmpPath + 'commonIgnoredWords.txt', encoding='utf-8').read().split())
        self.__ignoredColl = set(codecs.open(self.__tmpPath + 'wordsIgnoredInCollocations.txt', encoding='utf-8').read().split())
        self.__concordanceIndex = None
        
        
    ################################
    #                              #
    #           GETTERS            #
    #                              #
    ################################
        
    ######### GENERAL DATA #########
    def getSentences(self):
        return self.__sentences
        
    def getTokens(self):
        return self.__tokens
        
    def getRawText(self):
        return self.__rawText
        
    def getWordCount(self):
        return self.__wordCount
        
    def getWordTypesCount(self):
        return len(self.__freqDist.items())
        
    def getAvgWordLength(self):
        return self.__avgWordLength
    
    def getAvgSentLength(self):
        return self.__avgSentLength
    
    def getLexicalDiversity(self):
        return self.__lexicalDiversity
        
    ######### FREQUENCY TAB #########
    def getIgnoredCommon(self):
        return self.__ignoredCommon

    def setIgnoredCommon(self, value):
        self.__ignoredCommon = value

    def getMostCommon(self, count):
        out = []
        i=0
        while len(out)<count:
            if self.__freqDist.items()[i][0] not in self.__ignoredCommon:
                out.append(self.__freqDist.items()[i])
            i+=1
        return out
    
    def getHapaxes(self):
        return self.__freqDist.hapaxes()
    
    def getHapaxPercentage(self):
        return round(len(self.__freqDist.hapaxes())*100/float(len(self.__freqDist.items())), 2)
    
        ### ZIPF'S PLOT
    def getRelZipfError(self):
        return self.__relZipfError

    def getLogfreqDist(self):
        return self.__logfreqDist

    def getLogX(self):
        return self.__logx

    def getPoly(self, x):
        return np.poly1d(self.__polyFit)(x)

    def getPolyFit(self):
        return self.__polyFit
    
    
    ########## PATTERNS TAB #########
    def setAllowedForeignWordSet(self, newSet):
        self.__allowedForeign = newSet
            
    def getAllowedForeignWordSet(self):
        return self.__allowedForeign
        
    def getForeignWords(self):
        return self.__foreignWords
    
    def getForeignWordsCount(self):
        return self.__foreignWordsCount
        
    def getForeignPercentage(self):
        return round(self.__foreignWordsCount*100/float(self.__wordCount), 2)
    
    def getPatternWords(self):
        return self.__patternWords
    
    def getPatternWordsCount(self):
        return self.__patternWordsCount
        
    def getPatternPercentage(self):
        return round(self.__patternWordsCount*100/float(self.__wordCount), 2)
    
    
    
    ### PARTS OF SPEECH TAGGING TAB
    def getTokensFromPOSCorpus(self):
        return self.__POStokens
        
    def getTokensCount(self):
        return len(self.__tokens)
        
    def getTaggedTokensCount(self):
        return len(self.__POStokens)
        
    def getTagCount(self):
        return self.__tagCount
    
    def getTagErrorCount(self):
        return self.__tagErrorCount
    
    def getTaggedCorpus(self):
        return self.__taggedCorpus
        
    def getWrongTags(self):
        return self.__wrongTags
        
    ####### COLLOCATIONS TAB ######
    
    def getIgnoredColl(self):
        return self.__ignoredColl

    def setIgnoredColl(self, value):
        self.__ignoredColl = value

    def getCollocations(self):
        return self.__collocations
        
    ################################
    #                              #
    #           METHODS            #
    #                              #
    ################################
        
    def loadCorpus(self, path):
        
        for encoding in self.__encodings:

            try:
                self.__path = path
                fileName = codecs.open( self.__path,'r', encoding=encoding )
                self.__rawText = fileName.read()
                break
            
            except UnicodeDecodeError:
                encoding = ''
                continue
                 
        if encoding!='':
            self.initFields()
            
            #SENTENCES
            # more abbreviations with dots
            punkt_param = PunktParameters()
            punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
            
            sentence_splitter = PunktSentenceTokenizer(punkt_param)
            text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
            #text = re.sub('(\d+)', r' \1 ', text)
            sentences = sentence_splitter.tokenize(text)
            
            #TOKENS
            self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))]
            wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
            #wordTokenizer = RegexpTokenizer('[\w]+')
            
            sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
            words =  list(itertools.chain(*sentences))
            self.__words = words
            self.__sentences = sentences
            
            self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3)
            self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3)
            self.__freqDist = FreqDist(words)
            self.__wordCount = len(words)
            self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5)
            
            ### resetting members
            self.__concordanceIndex = None
            self.__bigrams = None
                 
        return encoding
    
    def computeZipf(self, unit):
        
        if unit == 'word':
            self.__logx = np.array([math.log(i, 10) for i in  range(1, len(self.__freqDist.values())+1) ] )
            self.__logfreqDist = np.array([math.log(i, 10) for i in self.__freqDist.values()])
        
        if unit == 'bigram':
            
            bigramFreqDist = dict()
            for first in self.__letters:
                for second in self.__letters:
                    bigramFreqDist[first+second] = 0
            
            for token in self.__freqDist.items():
                for ii in range(len(token[0])-1):
                    try:
                        bigram = token[0][ii]+token[0][ii+1]
                        bigramFreqDist[bigram] += token[1]
                    except KeyError:
                        print "Key error on token: ", token

            self.__sortedBigrams = sorted([x for x in bigramFreqDist.items() if x[1]>0], key=itemgetter(1))
            self.__sortedBigrams.reverse()
            self.__logx = np.array([math.log(i, 10) for i in  range(1, len(self.__sortedBigrams)+1) ] )
            self.__logfreqDist = np.array([math.log(i[1], 10) for i in self.__sortedBigrams])
            
        if unit == 'letter':

            letterFreqDist = dict()
            for letter in self.__letters:
                    letterFreqDist[letter] = 0
                
            for token in self.__freqDist.items():
                for ii in range(len(token[0])):
                    try:
                        letter = token[0][ii]
                        letterFreqDist[letter] += token[1]
                    except KeyError:
                        print "Key Error on token: ", token
                        
            self.__sortedLetters = sorted([x for x in letterFreqDist.items() if x[1]>0], key=itemgetter(1))
            self.__sortedLetters.reverse()
            self.__logx = np.array([math.log(i, 10) for i in  range(1, len(self.__sortedLetters)+1) ] )
            self.__logfreqDist = np.array([math.log(i[1], 10) for i in self.__sortedLetters])
        
        self.__polyFit = np.polyfit(self.__logx, self.__logfreqDist, 1)
        
        poweredPoly = [np.power(10, self.getPoly( self.__logx[i] ) ) for i in  range(len(self.__logx))]
        relativeErrors = [ abs( self.__freqDist.values()[i] - poweredPoly[i] )
                                        / float( self.__freqDist.values()[i] ) for i in  range(len(self.__logx)) ]
        
        self.__relZipfError = np.mean( relativeErrors ) * 100
        
    def prepareFreqDist(self, areBigramsChecked):
        
        if areBigramsChecked:
            return self.__sortedBigrams
        else:
            return self.__sortedLetters            
          
          
    ######### PATTERNS TAB ###########
    def findForeignWords(self, rules):
        foreignWords = []
        if 'consonant' in rules:
            cond = re.compile('.*[qrtpsdfghlzcvbnm]$')
            foreignWords += [item for item in self.__freqDist.items() if cond.match(item[0]) and len(item[0])>2]
            
        if 'wyjkx' in rules:
            cond = re.compile('.*[wyjkx].*')
            foreignWords += [item for item in self.__freqDist.items() if cond.match(item[0]) and len(item[0])>2]
        
        self.__foreignWords = [item for item in foreignWords
                               if item[0] not in self.__allowedForeign]
        self.__foreignWordsCount = sum([word[1] for word in self.__foreignWords])

    def findPatternWords(self, pattern):
        
        try:
            cond = re.compile(unicode(pattern))
            self.__patternWords = [item for item in self.__freqDist.items() if cond.match(item[0])]
            self.__patternWordsCount = sum([word[1] for word in self.__patternWords])
            return 0
        except re.error:
            return -1


    ######## PARTS OF SPEECH TAGGING TAB ##########
    
    def loadPOSCorpus(self, path):
        
        for encoding in self.__encodings:
            try:
                POSfile = codecs.open(path,'r',encoding=encoding)
                POScorpus = []
                for line in POSfile.readlines():
                    words = line.split()
                    if len(words) > 1:
                        if words[1] in {'NOUN', 'ADV', 'ADJ', 'PRON', 'DPREP', 'VERB', 'NUM', 'PREP', 'ART', 'CONJ', 'PRONVERB', 'PUNCT', 'SPECIAL'}:
                            POScorpus.append((words[0], words[1]))
                        else:
                            print 'Unknown tag!: ' + words[1]
                POSfile.close()
                break
            except UnicodeDecodeError:
                print 'UnicodeDecodeError'
                 
            except UnicodeEncodeError:
                print 'UnicodeEncodeError'
                
        self.__taggedCorpus = POScorpus
        
    def applyTaggers(self, taggers, fromPOSCorpus = False):
        
        self.resetTags(fromPOSCorpus)
            
        for tagger in taggers:
            if tagger == 'manual':
                self.applyManualTagger(self.__POStokens if fromPOSCorpus else self.__tokens)
            if tagger == 'regex':
                self.applyRegexTagger(self.__POStokens if fromPOSCorpus else self.__tokens)
            if tagger == 'syntax':
                self.applySyntaxTagger(self.__POStokens if fromPOSCorpus else self.__tokens)
            if tagger == 'probability':
                self.applyProbabilityTagger(self.__POStokens if fromPOSCorpus else self.__tokens)
                
        tagCount = 0
        notTagged = []
        self.__wrongTags = []
        if fromPOSCorpus:
            errorCount = 0
            #wrongTags = []
            for i  in range(len(self.__POStokens)):
                if self.__POStokens[i][1]!=self.__defaultTag:
                    tagCount+=1
                    if self.__POStokens[i][1] != self.__taggedCorpus[i][1]:
                        errorCount+=1
                        #wrongTags.append((i, self.__POStokens[i][0], self.__POStokens[i][1], self.__taggedCorpus[i][1]))
                        self.__wrongTags.append([str(i+1), self.__POStokens[i][0], self.__POStokens[i][1], self.__taggedCorpus[i][1]])
                else:
                    notTagged.append(self.__POStokens[i][0])
                    
            self.__tagErrorCount = errorCount
                   
        else:
            for token in self.__tokens:
                if token[1]!=self.__defaultTag:
                    tagCount+=1
                else:
                    notTagged.append(token[0])
        self.__tagCount = tagCount
            
    def resetTags(self, fromPOSCorpus=False):
        if fromPOSCorpus:
            self.__POStokens = self.__defTagger.tag([token[0] for token in self.__taggedCorpus])
        else:
            self.__tokens = self.__defTagger.tag([token[0] for token in self.__tokens])

    def applyManualTagger(self, tokens):

        for line in codecs.open(self.__tmpPath + 'manualTaggingRules.txt', encoding='utf-8').readlines():
            if len(line)>4 and  line[0]!= '#':
                words = line.split()
                self.__manualTags[words[0]] = self.__manualTags[words[0]].union(set(words[1:]))
            
        for i in range(len(tokens)):
            if tokens[i][1] == self.__defaultTag:
                for tag in self.__manualTags:
                    if tokens[i][0].lower() in self.__manualTags[tag]:
                        tokens[i] = (tokens[i][0], tag)

    def applyRegexTagger(self, tokens):
        
        self.__regexTagRules = dict()
        for line in set(codecs.open(self.__tmpPath + 'regexpTaggingRules.txt', encoding='utf-8').readlines()):
            if len(line)>4 and  line[0]!= '#':
                words = line.split()
                self.__regexTagRules[re.compile(unicode(words[1]))] = (words[0], words[2:])
        
        for i in range(len(tokens)):
            if tokens[i][1] == self.__defaultTag:
                for rule in self.__regexTagRules:
                    if tokens[i][1] == self.__defaultTag:
                        word = tokens[i][0].lower()
                        if word not in self.__regexTagRules[rule][1] and rule.match(word):
                            tokens[i] = (tokens[i][0], self.__regexTagRules[rule][0])

    def parseSyntaxRules(self):
        self.__syntaxTagRules = []
        for line in set(codecs.open(self.__tmpPath + 'syntaxTaggingRules.txt', encoding='utf-8').readlines()):
            if line[0]!= '#':
                words = line.split()
                before = []
                after = []
                insertedTag = ""
                i = 0
                for i in range(0, len(words)):
                    if words[i][0] == '$':
                        insertedTag = words[i][1:]
                        break
                    before.append(words[i])
                for j in range(i+1, len(words)):
                    after.append(words[j])
                if (insertedTag!="" and (before!=[] or after!=[])):
                    self.__syntaxTagRules.append(SyntaxTaggingRule(before, insertedTag, after))

    def applySyntaxTagger(self, tokens):
        self.parseSyntaxRules()
        for i in range(len(tokens)):
            if tokens[i][1] == self.__defaultTag:
                
                for rule in self.__syntaxTagRules:
                    #rule lengths check
                    if i >= len(rule.before) and len(tokens) - i >= len(rule.after):
                        poniechaj = False
                        tagsCount = len(rule.before)

                        for before_it in range(tagsCount):
                            if rule.before[before_it] in self.__tags:
                                if tokens[i - tagsCount + before_it][1] != rule.before[before_it]:
                                    poniechaj = True
                            else:
                                if tokens[i - tagsCount + before_it][0] != rule.before[before_it]:
                                    poniechaj = True
                                    
                        if (poniechaj):
                            continue
                        
                        for after_it in range(len(rule.after)):
                            if rule.after[after_it] in self.__tags:
                                if rule.after[after_it] != tokens[i + 1 + after_it][1]:
                                    poniechaj = True
                            else:
                                if rule.after[after_it] != tokens[i + 1 + after_it][0]:
                                    poniechaj = True
                                
                        if (poniechaj):
                            continue
                                
                        tokens[i] = (tokens[i][0], rule.tag)
                        break
        
    def findMostCommonTag(self):
        tagsFreqDistMap = dict()
        for word in self.__taggedCorpus:
            if word[0] not in tagsFreqDistMap.keys():
                tagsFreqDistMap[word[0]] = FreqDist([word[1]])
            else:
                tagsFreqDistMap[word[0]].inc(word[1])
        
        self.__mostCommonTagMap = dict()
        for word in tagsFreqDistMap.keys():
            self.__mostCommonTagMap[word] = tagsFreqDistMap[word].keys()[0]
        
    def applyProbabilityTagger(self, tokens):
        self.findMostCommonTag()
        
        for i in range(len(tokens)):
            if tokens[i][1] == self.__defaultTag:
                if tokens[i][0] in self.__mostCommonTagMap.keys():
                    tokens[i] = (tokens[i][0], self.__mostCommonTagMap[tokens[i][0]])
        
    def getTaggingRules(self, tagger):
        f = codecs.open(self.__tmpPath + tagger + 'TaggingRules.txt', encoding='utf-8')
        rules = f.read()
        f.close()
        return rules

    def setTaggingRules(self, tagger, rules):
        f = codecs.open(self.__tmpPath + tagger + 'TaggingRules.txt', 'w', encoding='utf-8' )
        f.seek(0)
        f.write(rules)
        f.truncate()
        f.close()
        
    ########## COLLOCATIONS TAB ############
    def collIgnoreListHasChanged(self):
        self.__collIgnoredListChanged = True
        
    def findCollocations(self, test, window, minFreq, count, searchedWord):

        if self.__bigrams == None or self.__collIgnoredListChanged or self.__collCurrentWindow != window or self.__collCurrentSearchedWord != searchedWord or self.__collCurrentMinFreq != minFreq:
            self.prepareBigrams(window, searchedWord)
            self.__collIgnoredListChanged = False
            
        self.__bigrams.apply_freq_filter(minFreq)
        self.__collCurrentWindow = window
        self.__collCurrentSearchedWord = searchedWord
        self.__collCurrentMinFreq = minFreq
        
        bfd = self.__bigrams.getBigramFd()
        scored_bigrams = []
        bigram_measures = nltk.collocations.BigramAssocMeasures()

        if test == 'Raw Frequency':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.raw_freq)[:count]
            
        if test == 'T Student Test':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.student_t)[:count]
            
        if test == 'Pearson\'s Chi Square Test':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.chi_sq)[:count]
            
        if test == 'Pointwise Mutual Information':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.pmi)[:count]
            
        if test == 'Dice':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.dice)[:count]
            
        if test == 'Jaccard':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.jaccard)[:count]

        if test == 'Likelihood Ratio':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.likelihood_ratio)[:count]
            
        if test == 'Variant of Mutual Information':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.mi_like)[:count]
            
        if test == 'Poisson Stirling':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.poisson_stirling)[:count]
            
        if test == 'Phi square':
            scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.phi_sq)[:count]
            
        self.__collocations = [[unicode(x[0]+' '+x[1]), y, bfd[x]] for x,y in scored_bigrams]

    def prepareBigrams(self, window_size, word):
        wfd = FreqDist()
        bfd = FreqDist()
        
        if word == '':
            for sentence in self.__sentences:
                if len(sentence) > 1:
                    for window in ingrams(sentence, window_size, pad_right=True):
                        if window[0] not in self.__ignoredColl:
                            w1 = window[0]
                            try:
                                window = window[:list(window).index(w1, 1)]
                            except ValueError:
                                pass
                            wfd.inc(w1)
                            for w2 in set(window[1:]):
                                if w2 is not None and w2 not in self.__ignoredColl:
                                    bfd.inc((w1, w2))
        else:
            for sentence in self.__sentences:
                if len(sentence) > 1:
                    for window in ingrams(sentence, window_size, pad_right=True):
                        if window[0] not in self.__ignoredColl:
                            w1 = window[0]
                            try:
                                window = window[:list(window).index(w1, 1)]
                            except ValueError:
                                pass
                            bigramOK = False
                            for w2 in set(window[1:]):
                                if w2 is not None and w2 not in self.__ignoredColl and (w1 == word or w2==word):
                                    bfd.inc((w1, w2))
                                    bigramOK = True
                            if bigramOK:
                                wfd.inc(w1)
                                
        self.__bigrams = MyBigramCollFinder(wfd, bfd)

    ######### CONTEXT TAB ###########
    def findWordContext(self, word, lines, wordCount):
        
        if not self.__concordanceIndex:
            self.__concordanceIndex = nltk.ConcordanceIndex([token[0] for token in self.__tokens],
                                                            key=lambda s:s.lower())
            
        contexts = []
        offsets = self.__concordanceIndex.offsets(unicode(word))
  
        if offsets:
            lines = min(lines, len(offsets))
            for i in offsets:
                if lines <= 0:
                    break
                left = (' '.join([token[0] for token in self.__tokens[i-wordCount:i]]))
                right = ' '.join([token[0] for token in self.__tokens[i+1:i+wordCount+1]])
                contexts.append( left + ' ' + self.__tokens[i][0].upper() + ' ' + right)
                lines -= 1
        return contexts