Example #1
def get_word_bigram_scores(pos_words, neg_words):
    pos_words_plain = list(itertools.chain(*pos_words))
    neg_words_plain = list(itertools.chain(*neg_words))

    bigram_finder = BigramCollocationFinder.from_words(pos_words_plain)
    pos_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(neg_words_plain)
    neg_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words_plain + pos_bigrams  # words plus bigram collocations
    neg = neg_words_plain + neg_bigrams
    all_words = pos + neg

    pos_word_fd = FreqDist(pos)
    neg_word_fd = FreqDist(neg)
    word_fd = FreqDist(all_words)

    pos_word_count = pos_word_fd.N()  # number of positive words
    neg_word_count = neg_word_fd.N()  # number of negative words
    #total_word_count = pos_word_count + neg_word_count
    total_word_count = word_fd.N()

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
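
A common follow-up to a scoring function like this is to keep only the highest-scoring features. A minimal sketch, assuming word_scores is the dict returned above and number is a hypothetical cutoff:

def find_best_features(word_scores, number=1000):
    # sort features by informativeness and keep the `number` best ones
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(feature for feature, score in best)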
Example #2
    def work_1():
        file_string = ""
        txt_file = open("trabalho1.txt", "r+")
        csv_file = open("trabalho1.csv", "w+")
        csv_manage = csv.writer(csv_file,
                                delimiter=";",
                                quoting=csv.QUOTE_MINIMAL)
        base_text = txt_file.read()
        tokens = word_tokenize(base_text)
        frequency = FreqDist(tokens)

        print("texto : {0}".format(base_text))

        print("Total de palavras : {0}".format(frequency.N()))
        print("Total de Termos : {0}".format(len(frequency.keys())))
        print("")

        print("Tabela de Frequência de Termos")
        print("")

        for key in frequency.keys():
            csv_manage.writerow([key, str(frequency.get(key))])
            print("Termo: {0}  Total: {1}".format(key,
                                                  str(frequency.get(key))))

        pdfOutput = PdfOutput(frequency, frequency.N(), len(frequency.keys()),
                              base_text)
        servicePdfManager = ServiceManagerPdf()
        servicePdfManager.writePdf(pdfOutput)

        txt_file.close()
        csv_file.close()
Example #3
    def compute_scores(self, collocate_data, role=None, category=None):
        data = collocate_data
        if role:
            data = [x for x in collocate_data if x['role'] == role]
        if category:
            data = [x for x in data if x['category'] == category]

        dialogue_fd = FreqDist([w for d in data for w in d['dialogue_tokens']])
        context_fd = FreqDist([w for d in data for w in d['context_tokens']])
        collocate_fd = FreqDist([(w_c, w_d) for d in data
                                 for w_c in d['context_tokens']
                                 for w_d in d['dialogue_tokens']])

        N_d = dialogue_fd.N()
        N_c = context_fd.N()
        N_cd = collocate_fd.N()

        scores = defaultdict(dict)
        const = np.log(N_c) + np.log(N_d) - np.log(N_cd)
        for pair in collocate_fd:
            w_c, w_d = pair
            s = np.log(collocate_fd[pair]) - np.log(dialogue_fd[w_d]) - np.log(
                context_fd[w_c]) + const
            scores[w_d][w_c] = s

        return dict(scores)
Example #4
class FrequenceVocabulary:
    """
    Vocabulary that contains words frequency estimated from
    words count in files specified.
    """
    def __init__(self, miss_f):
        """
        Construct a new vocabulary with a function that computes the probability
        of words that are absent from the vocabulary. Example usage:

            >>> miss_f = lambda key, N: 10. / (N * 10 ** len(key))

        :param miss_f: function for estimating probability of missing words.
        """
        self.vocab = FreqDist()
        self._miss_f = miss_f

    def load_vocab(self, root='.', files='.*'):
        """
        Load new vocabulary.

        :param root: the root directory for the corpus.
        :param files: A list or regexp specifying the files in this corpus.
        """
        voc = PlaintextCorpusReader(root, files)
        for word in voc.words():
            self.vocab[word.lower()] += 1

    def p(self, key):
        """
        :param key: word to compute the probability of.
        :return: the probability estimated for key.
        """
        if key in self.vocab:
            return 1. * self.vocab[key] / self.vocab.N()
        return self._miss_f(key, self.vocab.N())
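
A possible usage sketch for the class above; the corpus directory and file pattern are placeholders, not taken from the original:

voc = FrequenceVocabulary(lambda key, N: 10. / (N * 10 ** len(key)))
voc.load_vocab(root='corpus', files=r'.*\.txt')
print(voc.p('the'))    # relative frequency of a word seen in the corpus
print(voc.p('zzxq'))   # falls back to miss_f for a word the corpus never saw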
Example #5
    def _prepare(self):
        if self._is_prepared:
            return

        freq_dist_a = FreqDist()
        for a in self._pair.chunks_a:
            freq_dist_a.update(self._tokenize(a))

        freq_dist_b = FreqDist()
        for b in self._pair.chunks_b:
            freq_dist_b.update(self._tokenize(b))

        self._avg_freq_dist = FreqDist()
        n_a = freq_dist_a.N()
        n_b = freq_dist_b.N()
        for a in freq_dist_a:
            self._avg_freq_dist[a] = (freq_dist_a[a] / n_a + freq_dist_b[a] / n_b) / 2.0
        for b in freq_dist_b:
            if self._avg_freq_dist[b] != 0.0:
                continue
            self._avg_freq_dist[b] = (freq_dist_a[b] / n_a + freq_dist_b[b] / n_b) / 2.0

        self._chunks = self._sampler.generate_chunk_pairs(self._pair)

        self.__freq_a = None
        self.__freq_b = None

        self._is_prepared = True
Example #6
def term_ratio(tf1: FreqDist, tf2: FreqDist, c=None, normalize=False):
    if normalize:
        if c is None:
            c = 1e-4
        return {
            word: (tf1[word] / tf1.N()) / (tf2[word] / tf2.N() + c)
            for word in tf1.keys()
        }
    else:
        if c is None:
            c = 1
        return {word: tf1[word] / (tf2[word] + c) for word in tf1.keys()}
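
A small illustration of term_ratio with made-up token lists (not from the original):

tf_a = FreqDist('the cat sat on the mat'.split())
tf_b = FreqDist('the dog sat on the log'.split())
print(term_ratio(tf_a, tf_b))                  # raw count ratios with add-one smoothing in the denominator
print(term_ratio(tf_a, tf_b, normalize=True))  # relative-frequency ratios smoothed with c=1e-4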
Example #7
def freq(inp, outp):
    """Input: a text file
    Output: a table of word frequency with three columns for Word, Count and Percent frequency
    """
    text = open(inp, 'r').read()

    sents = nltk.sent_tokenize(text)
    all_words = []
    for sent in sents:
        words = nltk.word_tokenize(sent)
        all_words += words

    all_words = [x.lower() for x in all_words]

    freq = FreqDist(all_words)
    tot = float(freq.N())

    # output

    o = open(outp, 'w')
    o.write("Word\tCount\tPercent\n")
    for pair in freq.most_common():
        o.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair,
                                                          pc=100 * pair[1] /
                                                          tot))
    o.close()
Example #8
def count_pos(input, language):
    if language == 'english-nltk':
        words = word_tokenize(input)
        pos = pos_tag(words)

    elif language == 'english':
        s = pattern.en.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))

    elif language == 'spanish':
        s = pattern.es.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))

    elif language == 'dutch':
        words = word_tokenize(input, 'dutch')
        tagger = nltk.data.load('taggers/alpino_aubt.pickle')
        pos = tagger.tag(words)

    tags = FreqDist(tag for (word, tag) in pos)
    relative_frequency = []
    for item in tags.items():
        relative_frequency.append((item[0], float(item[1]) / tags.N()))
    return relative_frequency
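
A hypothetical call for the NLTK branch (made-up sentence; assumes the punkt tokenizer and averaged_perceptron_tagger models are installed):

print(count_pos("The quick brown fox jumps over the lazy dog.", 'english-nltk'))
# returns a list of (POS tag, relative frequency) pairs, e.g. ('DT', 0.2) for the two determiners among ten tokens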
Example #9
def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = list(glob_freqs)  # FreqDist.samples() was removed in NLTK 3
    glob_features = [None] * doc_num  # one feature vector per document

    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])

        for (tok, freq) in doc_freqs.items():
            indx = tokens.index(tok)
            doc_features[indx] = freq * doc_freqs.N()

        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()

    return (glob_features, tokens)
Example #10
class Vocab:
    def __init__(self,
                 tokens: List[Tokens],
                 special_symbols: List[str] = None):
        special_symbols = [] if special_symbols is None else special_symbols
        special_symbols = special_symbols + [
            "<eot>", "<response>", "<eos>", "<unk>", "<pad>", "<bos>"
        ]
        self.vocab = FreqDist()
        self.cdf = 0.
        for sample in tokens:
            for token in sample:
                if token not in special_symbols:
                    self.vocab[token] += 1

        print(
            f"total tokens in vocab: {self.vocab.N()}, distinct tokens in vocab: {self.vocab.B()}"
        )
        self.itos = []
        self.stoi = {}

    def fit(self, num_tokens=15000):
        cdf = 0.
        for cdf in self.vocab._cumulative_frequencies(
            [i[0] for i in self.vocab.most_common(num_tokens)]):
            pass
        self.cdf = cdf / self.vocab.N()
        print(
            f"cdf of the {num_tokens} most common tokens in vocab {self.cdf}")
        self.itos = ["<unk>", "<pad>", "<eos>", "<bos>"] + [
            tup[0] for tup in self.vocab.most_common(num_tokens)
        ]
        self.stoi = Counter(
            {key: index
             for index, key in enumerate(self.itos)})
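
A hypothetical usage of the Vocab class above (token lists made up for illustration; assumes Counter and FreqDist are imported as in the original module):

vocab = Vocab([["hello", "world"], ["hello", "there"]])
vocab.fit(num_tokens=10)
print(vocab.itos[:6])        # special symbols followed by the most common tokens
print(vocab.stoi["hello"])   # integer index assigned to "hello"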
Example #11
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)

    occurences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc): occurences[x] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = list(glob_freqs)  # FreqDist.samples() was removed in NLTK 3
    glob_features = [{}]*doc_num


    for i in range(0, doc_num):
        doc_features = [0]*len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])

        for (tok,num) in doc_freqs.items():
            max_doc_freq = doc_freqs.freq(doc_freqs.max())*float(doc_len)

            # augmented
            #tf = 0.5 + (0.5*float(num)) / float(max_doc_freq)
            tf = 1+math.log(num,10)
            idf = math.log( float(doc_num) / (float(occurences[tok])) ,10)
            tfidf = tf*idf

            indx = tokens.index(tok)
            doc_features[indx] = tfidf

        f_tmp = numpy.asarray(doc_features)
        f_tmp = f_tmp/(numpy.linalg.norm(f_tmp)+numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()

    glob_features = numpy.asarray(glob_features)*glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()

    return (glob_features,tokens)
Example #12
def paper_title_NLP(title_corpus):

    # title_corpus is a list of tuples
    # keys like (19, 1) mean 2019/01
    # each value is a list of tokenized paper titles
    # reference: https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    title_dict = {}
    pattern = r'''(?x)            # set flag to allow verbose regexps
            (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)*        # words with optional internal hyphens
            | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
            | \.\.\.              # ellipsis
            | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
            '''
    tokenizer = RegexpTokenizer(pattern)
    for t in title_corpus:
        key = (t[3], t[4])
        if key in title_dict:
            filterdText = tokenizer.tokenize(t[1])
            title_dict[key].append(filterdText)
        else:
            title_dict[key] = []
            filterdText = tokenizer.tokenize(t[1])
            title_dict[key].append(filterdText)

    # extract keywords with year span
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0], )  # year index
        if key in title_years.keys():
            title_years[key].append(v)
        else:
            title_years[key] = []
            title_years[key].append(v)

    deep_freq = []
    for k, v in title_years.items():
        fd = FreqDist()
        vs = [item for sublist in v for item in sublist]
        for v_ in vs:
            for word in v_:
                fd[word] += 1

        print('The keywords for year:20{}'.format(str(k[0])))
        print("Total number of words:{}".format(str(
            fd.N())))  # total number of samples
        print("Total number of unique words:{}".format(str(
            fd.B())))  # number of bins or unique samples
        fd.pprint(50)  # The maximum number of items to display, default is 10
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))
        print(deep_freq)

    plt.plot([2012, 2013, 2014, 2015, 2016, 2017, 2018], deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()
Example #13
    def kneser_ney(self, context, word):
        """
        Return the log probability of a word given a context given
        Kneser Ney backoff
        """

        bgram = (context, word)
        unigram_freq = FreqDist()

        theta = self._kn_concentration
        vocabulary = 1 / len(self._vocab_freq.keys())
        discount_delta = self._kn_discount
        unigram_T = len(self._context_freq.keys())
        bigram_T = self._context_freq[context]

        for i in self._gram_freq:
            unigram_freq[i[1]] += 1  # FreqDist.inc() was removed in NLTK 3

        # Unigram Restaurant
        # C_0,x
        count_unirest_wordTable = unigram_freq[word]
        # C_0,.
        count_unirest_allTable = unigram_freq.N()

        # u_Bigram Restaurant
        # C_u,x
        count_birest_wordTable = self._gram_freq[bgram]

        # C_u,.
        count_birest_allTable = self._context_freq[context]

        existingTable_numer = count_birest_wordTable - discount_delta
        existingTable_denom = theta + count_birest_allTable
        existingTable = existingTable_numer / existingTable_denom

        if existingTable < 0:
            existingTable = 0

        newTable_numer = theta + (bigram_T * discount_delta)
        newTable_denom = theta + count_birest_allTable
        newTable = newTable_numer / newTable_denom

        back_a_numer = count_unirest_wordTable - discount_delta
        back_a_denom = count_unirest_allTable + theta
        back_a = back_a_numer / back_a_denom
        if back_a < 0:
            back_a = 0

        back_b_numer = theta + (unigram_T * discount_delta)
        back_b_denom = count_unirest_allTable + theta
        back_b = back_b_numer / back_b_denom
        back_b = back_b * vocabulary

        result = existingTable + (newTable * (back_a + back_b))
        return lg(result)
Example #14
def extract_ngrams(text,
                   low=1,
                   high=2,
                   lowercase=False,
                   filter_punctuation=True,
                   binary=False,
                   least_common=None,
                   most_common=None,
                   normalize=False,
                   sample=False):
    #text = ' '.join(review.paragraphs)
    tokens = None

    # Make lowercase
    if lowercase:
        tokens = word_tokenize(text.lower())
    else:
        tokens = word_tokenize(text)

    # Remove Punctuation
    if filter_punctuation:
        words = [t for t in tokens if t not in PUNCTUATION]
    else:
        words = [t for t in tokens]

    # Do the N Gram Thing
    ngram_counts = {}
    assert not (sample and binary), \
        "sample and binary cannot both be True; use one, the other, or neither"
    for n in range(low, high + 1):
        ngram_freqdist = FreqDist(ngrams(words, n))
        grams_to_consider = ngram_freqdist
        if least_common:
            assert least_common > 0.0 and least_common <= 1.0, \
                    'Least common must be a proportion, not %.3f' % least_common
            num_least_common = int(least_common * ngram_freqdist.N())
            grams_to_consider = []
            for bleh in ngram_freqdist.most_common()[-1 * num_least_common:]:
                gram, count = bleh
                grams_to_consider.append(gram)
        for gram in grams_to_consider:
            if sample:
                ngram_counts[gram] = ngram_freqdist.freq(gram)
            elif binary:
                ngram_counts[gram] = True
            else:
                ngram_counts[gram] = ngram_freqdist[gram]
    if normalize:
        total_counts = sum(count for ngram, count in ngram_counts.items())
        for gram, count in ngram_counts.items():
            ngram_counts[gram] = count / total_counts
    return ngram_counts
Example #15
def show():
    print(gutenberg.fileids())
    # instantiate a frequency distribution
    fd = FreqDist()
    for word in gutenberg.words('austen-persuasion.txt'):
        fd[word] += 1

    print(fd.N())
    print(fd.B())
    # print the 10 most frequent words
    for word, value in sorted(fd.items(), key=lambda item: -item[1])[:10]:
        print(word, value)
Example #16
def media(entrada):
    '''
    This function computes the average number of characters
    per word of the text given as input.
    '''
    fdist = FreqDist(len(w) for w in entrada)
    somaTotal = 0
    for tam in fdist.most_common():
        somaTotal += tam[0] * tam[1]

    resultadoMedia = somaTotal / fdist.N()
    return resultadoMedia
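
A quick made-up check of media (not part of the original): four words with 3, 5, 2 and 7 characters.

print(media(['uma', 'frase', 'de', 'exemplo']))  # (3 + 5 + 2 + 7) / 4 = 4.25 under Python 3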
Example #17
def get_word_scores(pos_words, neg_words):
    pos_words_plain = list(itertools.chain(*pos_words))
    neg_words_plain = list(itertools.chain(*neg_words))
    word_fd = FreqDist(pos_words_plain + neg_words_plain)  # frequency of every word across both classes
    pos_word_fd = FreqDist(pos_words_plain)
    neg_word_fd = FreqDist(neg_words_plain)

    pos_word_count = pos_word_fd.N()  # number of positive words
    neg_word_count = neg_word_fd.N()  # number of negative words
    #total_word_count = pos_word_count + neg_word_count
    total_word_count = word_fd.N()

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            pos_word_fd[word], (freq, pos_word_count),
            total_word_count)  # chi-square statistic for the positive class; PMI or other measures could be used instead
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)  # likewise for the negative class
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of its positive and negative chi-square scores
    return word_scores  # each word mapped to its informativeness score
Example #18
def recordFrequencyData(corpusname, csvwritter, useLogFreq=False):
    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname)

    for file in processed_corpus_texts:
        print("recording the file: " + file)
        if path.exists(file):
            freqs = collectFreqData(file)
            totalFQ = freqs + totalFQ

    towrite = dict()
    towrite["Subreddit"] = corpusname

    for word in getRegionalisms():
        if totalFQ[word] == 0:
            towrite[word] = 0
        else:
            if useLogFreq:
                towrite[word] = math.log(totalFQ[word] / totalFQ.N())
            else:
                towrite[word] = totalFQ[word] / totalFQ.N()
    csvwritter.writerow(towrite)
Example #19
    def parse(self, response):
        """
		The lines below is a spider contract. For more info see:
		http://doc.scrapy.org/en/latest/topics/contracts.html
		
		@url https://www.google.com/search?q=personal+nutrition
		@scrapes pages to depth<=3, using priority-score based BFS
		"""

        doc = clean_html(response.body_as_unicode())
        words = word_tokenize(doc)
        words = [word.lower() for word in words]
        words = [word for word in words if word not in self.stops]
        fdist = FreqDist(words)

        for word in set(words):
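            # note: fdist.freq(word) * fdist.N() is just the raw count fdist[word]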
            if (fdist.freq(word) * fdist.N()) > 1:
                item = WordCount()
                item['word'] = word
                item['count'] = int(fdist.freq(word) * fdist.N())
                yield item
        #for href in response.css("a::attr('href')"):
        #	url = response.urljoin(href.extract())
        #	yield scrapy.Request(url, callback=self.parse)
Example #20
    def train(self, instances):
        """Remember the labels associated with the features of instances."""

        label_counts = FreqDist()
        feature_counts = defaultdict(FreqDist)
        all_features = set()

        #collect counts: C(feature,label) and C(label)
        for instance in instances:
            if instance.label != '':  #I'm throwing out one blog without a label that is in the corpus for some reason
                label_counts[instance.label] += 1
                features = instance.features()
                for feature in features:
                    all_features.add(feature)
                    feature_counts[instance.label][feature] += 1

        #smoothing, and also making sure that all features are counted for each label
        for label in feature_counts.keys():
            for feature in all_features:
                feature_counts[label][feature] += 1

        #P(label)
        total = label_counts.N()
        label_probs = {
            label: float(label_counts[label]) / total
            for label in label_counts
        }

        #P(feature|label) as a dictionary of dictionaries- C(feature,label)/SUM(C(feature,label) for all the features)
        feature_probs = {}
        for label in feature_counts:
            total = feature_counts[label].N()
            feature_probs[label] = {
                feature: float(feature_counts[label][feature]) / total
                for feature in feature_counts[label]
            }

        #set the model
        self.set_model({
            "label_probs": label_probs,
            "feature_probs": feature_probs,
            "all_features": all_features
        })
Example #21
def frequency(textfiles, out_file):
    """Input: a text file
    Output: a table of word frequency with three columns for Word, Count and Percent frequency
    """

    words = []
    for textfile in textfiles:
        with open(textfile, 'r') as fd:
            text = fd.read()

        words.extend(nltk.word_tokenize(text))

    fdist = FreqDist(words)
    total = float(fdist.N())

    with open(out_file, 'w') as output:
        output.write("Word\tCount\tPercent\n")
        for pair in sorted(fdist.items()):
            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(
                pair=pair, pc=100 * pair[1] / total))
Example #22
def save_ngrams(lyrics_file):
    """Creates a file of N-grams and their occurences for the file with lyrics"""
    all_ngrams = []

    with open(lyrics_file, "r") as f:
        for line in f.readlines():
            if not line.isupper() and line != "\n":
                all_ngrams.extend(
                    list(
                        nltk.bigrams(word_tokenize(line),
                                     pad_left=True,
                                     pad_right=True,
                                     right_pad_symbol='</s>',
                                     left_pad_symbol='<s>')))

    fd = FreqDist(all_ngrams)
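    # fd.N() (total bigram tokens) is >= fd.B() (distinct bigrams), so this returns every bigram with its count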
    ngram_stats = fd.most_common(fd.N())

    with open('ngrams.json', mode='w') as fp:
        json.dump(ngram_stats, fp)
Example #23
def unigramFreqFile(subreddit):
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countFileName = getCountFileName(subreddit)
    with open(countFileName, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countFileName)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for word in line.split():
                        word = word.strip()
                        if word.startswith("http") or word.isnumeric():
                            continue
                        if 0 < len(word) < 23:
                            frequencies[word] += 1

        frequencies["<end_comment>"] = 0
        # write total number of words
        countVectorFile.write(str(frequencies.N()) + "\n")
        for word in frequencies:
            countVectorFile.write(word+" "+str(frequencies[word])+"\n")
Example #24
    def textChanged_inputTextEdit(self):
        inputText = self.inputTextEdit.toPlainText().strip()
        inputText = ''.join(c for c in inputText
                            if not ud.category(c).startswith('P')
                            )  # remove all punctuation (Arabic included)

        inputTokens = functions.tokenization(inputText)
        freqDist = FreqDist(inputTokens)

        self.numWordEdit.setText(str(freqDist.N()))
        self.mostFreqWordEdit.setText(freqDist.max())

        numSentences = len(
            functions.tok_stem(self.inputTextEdit.toPlainText(), False))
        self.numSentenceEdit.setText(str(numSentences))

        has_text = bool(self.inputTextEdit.toPlainText().strip())
        self.inStatsGroup.setEnabled(has_text)
        self.searchWordGroup.setEnabled(has_text)
        self.startPosTagButton.setEnabled(has_text)
Example #25
def bigramFreqFile(subreddit):
    #get filtered files
    filenames = getTextFileNames(subreddit)
    countfilename = getCountFileName(subreddit, unigram=False)
    with open(countfilename, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()

        # good candidate for multithreading: one thread per file, each with its own freq dist, combined after all finish
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countfilename)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for bigram in list(bigrams(line.split())):
                        okayrange = 0 < len(bigram[0])  < 23 and 0 < len(bigram[1]) < 23
                        if okayrange and bigram[1] != "<end_comment>":
                            frequencies[bigram] += 1

        #write total number of words
        countVectorFile.write(str(frequencies.N()) + "\n")

        #note, another good improvement, organize this for faster searching.
        for bigram in frequencies:
            countVectorFile.write(" ".join(bigram)+" "+str(frequencies[bigram]))
Example #26
        stoplist += stopwords.words('english')
        #stoplist.append("'t")

    if args.stop_punctuation:
        stoplist += [x.decode('UTF8') for x in set(list(punctuation))]
        stoplist += [u'\u201d', u'\u201c', u'\u2019', u'\u2014']
        stoplist.append('--')

    words = [word for word in word_tokenize(text) if word not in stoplist]
    if args.stem:
        st = LancasterStemmer()
        words = [st.stem(word) for word in words]

    freq_dist = FreqDist(words)

    print('Total words: ' + str(orig_freq_dist.N()))
    print('Total after filter: ' + str(freq_dist.N()))
    # B() gives the number of unique words (bins)
    print('Unique words: ' + str(freq_dist.B()))
    print('Unique words ratio: ' +
          str(float(freq_dist.B()) / float(freq_dist.N())))
    print('\n')

    if args.words:
        for word in args.words:
            print(word + ': ' + str(freq_dist[word]))
            print(word + ' freq: ' + str(freq_dist.freq(word)))
            print('\n')

    # Show top 30
    print('Top ' + str(args.num_words) + ' words:')
Example #27
        # print x
        # get number of docs that contains word 'w'
        if x > 0:
            return 1
        return 0

    return math.log(N / (reduce(add, map(map0, xx), 1)), 2)


for col in dataset:
    vectorizer = DictVectorizer()
    document_collections = col['sentences']
    pre_matrix = []
    for d in document_collections:
        dc = FreqDist(my_tokenize(d))
        nn = float(dc.N())
        # for ix in dc:
        #     dc[ix] = dc[ix] / nn

        pre_matrix.append(dc)

    tf_matrix = vectorizer.fit_transform(pre_matrix)
    # N_doc = tf_matrix.shape[0]
    # for i in range(tf_matrix.shape[1]):
    #     idfx = idf(tf_matrix[:, i], N_doc)
    #     vv = tf_matrix[:, i]
    #     tf_matrix[:, i].multiply(idfx)
    #     ccc = 0

    col['model'] = MostRelevantSentence(vectorizer=vectorizer,
                                        collection_matrix=tf_matrix)
Example #28
class TrigramHMM(HMM):
    def __init__(self, stemmer=BasicStemmer(), backoff: Model = None):
        super(TrigramHMM, self).__init__(stemmer, backoff)
        self.EMISSION_MATRIX = None
        self.TRANSITION_MATRIX = None
        self.transMatrix_file_save_name = "trigram_transitionTable"
        self.emissMatrix_file_save_name = "trigram_emissionTable"

    def loadTables(self):

        if not bool(self.EMISSION_MATRIX):
            if not os.path.exists('obj/hmm/' +
                                  self.emissMatrix_file_save_name + '.json'):
                print(
                    "Emission table not found on disk, reconstructing and saving ..."
                )
                import glob

                os.chdir(
                    position.replace("\\", "/") +
                    "/../corpus/sources/emission")
                emissionSources = [
                    os.path.abspath(el) for el in list(glob.glob("*.txt"))
                ]

                os.chdir(position)
                self.EMISSION_MATRIX = self.constructEmissionMatrix(
                    emissionSources)
                saveIndex(
                    self.EMISSION_MATRIX,
                    "obj\\hmm\\" + self.emissMatrix_file_save_name + '.pkl')
                saveIndexjson(
                    self.EMISSION_MATRIX,
                    "obj\\hmm\\" + self.emissMatrix_file_save_name + '.json')
            else:
                self.EMISSION_MATRIX = loadIndexJson(
                    "obj/hmm/" + self.emissMatrix_file_save_name + '.json')
                #self.EMISSION_MATRIX = loadIndex("obj/hmm/" + self.emissMatrix_file_save_name + '.pkl')
                print("Emission table loaded from Disk ...")

        if not bool(self.TRANSITION_MATRIX):
            if not os.path.exists(position + '/obj/hmm/' +
                                  self.transMatrix_file_save_name + '.pkl'):
                print(
                    "Transition table not found on disk, reconstructing and saving ..."
                )
                import glob
                os.chdir("../corpus/sources/transition")
                transitionSources = [
                    os.path.abspath(el) for el in list(glob.glob("*.txt"))
                ]
                os.chdir(position)
                self.TRANSITION_MATRIX = self.constructTransitionMatrix(
                    transitionSources)
                #saveIndex(self.TRANSITION_MATRIX,"obj\\hmm\\"+self.transMatrix_file_save_name+'.pkl')
                save_obj(
                    self.TRANSITION_MATRIX,
                    "obj\\hmm\\" + self.transMatrix_file_save_name + '.pkl')
            else:
                self.TRANSITION_MATRIX = load_obj(
                    "obj/hmm/" + self.transMatrix_file_save_name + '.pkl')
                try:

                    convertedCond = [
                        tuple_parser(cond)
                        for cond in self.TRANSITION_MATRIX.conditions()
                    ]
                    cfd = ConditionalFreqDist([
                        (tuple_parser(cond), tag)
                        for cond in self.TRANSITION_MATRIX.conditions()
                        for tag in self.TRANSITION_MATRIX[cond]
                    ])

                    self.TRANSITION_MATRIX = cfd
                except Exception as e:
                    print(e)
                print("Transition table loaded from Disk ...", end=" ")

    def constructEmissionMatrix(self, sourceFilesList: list):
        # construction of the emission matrix
        emission = defaultdict(dict)
        for tag in NE_TAG_lABELS:
            emission[tag] = defaultdict(float)
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding='windows-1256')
            for line in file:
                words = re.split("\s+", line)
                entite = ''
                for word in words:
                    word = self.stemmer.stem(word)
                    if (re.findall('[A-Z]+', word) == []):
                        entite = word

                        continue
                    if not word in emission:
                        emission[word] = defaultdict(float)

                    emission[word][entite] += 1

            file.close()

        for tag in emission.keys():
            somme = 0.0
            for value in emission[tag].values():
                somme += value
            for word in emission[tag].keys():
                emission[tag][word] = round(
                    float("{0:.6f}".format(emission[tag][word] / somme)), 6)

        self.EMISSION_MATRIX = emission
        return emission

    def constructTransitionMatrix(self, sourceFilesList: list):
        #construction of the transition matrix
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding="windows-1256")
            fileFinal = ""
            for line in file:
                line = line.upper()
                if (len(line) > 1):
                    if not line.startswith("<S>"):
                        fileFinal += '<S> ' + line[:-1] + ' <E>\n'
                    else:
                        fileFinal += line[:-1] + '\n'
            file.close()

        tokens = [el for el in re.split(r"[\s\n]+", fileFinal) if el != '']
        self.initialProbabilities = FreqDist([
            tokens[i] for i in range(1, len(tokens)) if tokens[i - 1] == '<S>'
        ])

        self.tags = list(set(tokens))
        self.bigramDist = FreqDist(list(bigrams(tokens)))
        Trigrams = list(trigrams(tokens))
        cfd = ConditionalFreqDist(((el[2], (el[0], el[1])) for el in Trigrams))

        for word in cfd.conditions():
            for bigram in cfd[word]:
                cfd[word][bigram] = round(
                    float("{0:.6f}".format(cfd[word].freq(bigram))), 6)

        self.TRANSITION_MATRIX = cfd
        return cfd

    def __viterbi(self, observations: list, emissionTable: dict,
                  transitionTable: ConditionalFreqDist):

        if not hasattr(self, 'bigramDist'):
            listcouples = []
            for tag in self.TRANSITION_MATRIX.conditions():
                for bigram in self.TRANSITION_MATRIX[tag]:
                    listcouples.append(bigram)
                    if not hasattr(self, 'tags'):
                        self.tags = []
                    if not bigram[0] in self.tags: self.tags.append(bigram[0])
                    if not bigram[1] in self.tags: self.tags.append(bigram[1])
            self.bigramDist = FreqDist(listcouples)
            for key in self.bigramDist:
                self.bigramDist[
                    key] = self.bigramDist[key] / self.bigramDist.N(
                    )  # or simply self.bigramDist.freq(key)
            print("no bigramDist.... Creating bigramDist")

        if not hasattr(self, 'initialProbabilities'):
            print("no inital distribution.... Creating initDist")
            self.initialProbabilities = FreqDist(el[1]
                                                 for el in self.bigramDist
                                                 if el[0] == '<S>')
            for tag in self.initialProbabilities:
                self.initialProbabilities[tag] = self.initialProbabilities[
                    tag] / self.initialProbabilities.N()

        N = len(self.tags)

        T = len(observations)
        viterbi = numpy.zeros((N + 2, T))

        # we remove <S> from the tags because it's just a sentence-start marker
        if "<S>" in self.tags:
            self.tags.remove('<S>')
            N -= 1

        backTrack = []

        for i in range(N):

            if self.tags[i] not in emissionTable:
                emissionTable[self.tags[i]] = defaultdict(float)
            viterbi[i, 0] = round(
                float("{0:.6f}".format(
                    (emissionTable[self.tags[i]][observations[0]] if
                     observations[0] in emissionTable[self.tags[i]] else 0.0) *
                    (self.initialProbabilities[self.tags[i]] if self.tags[i]
                     in self.initialProbabilities else 0.0))), 6)

        for oIndex in range(1, T):
            bestTagIndex = numpy.argmax(
                [viterbi[i, oIndex - 1] for i in range(N)])
            bestTag = self.tags[bestTagIndex]
            bestTag2 = self.tags[numpy.argmax(
                [viterbi[i, oIndex - 2]
                 for i in range(N)])] if oIndex != 1 else "<S>"
            if viterbi[bestTagIndex, oIndex - 1] == 0:
                print("Zero resulting probability. Couldn't tag ",
                      observations[oIndex - 1],
                      "Previous besttag was :",
                      bestTag2,
                      end=" ")
                if self.bacckoff is not None:
                    self.bacckoff.loadTables()
                    best = None
                    max = 0
                    for tag in self.bacckoff.TRANSITION_MATRIX[bestTag2]:

                        if self.bacckoff.TRANSITION_MATRIX[bestTag2][tag] > max:

                            max = self.bacckoff.TRANSITION_MATRIX[bestTag2][
                                tag]
                            best = tag
                            print("Best tag so fat ", best, end=" ")
                    if best is not None:
                        viterbi[
                            bestTagIndex,
                            oIndex - 1] = max * self.EMISSION_MATRIX[best][
                                observations[oIndex - 1]] if observations[
                                    oIndex -
                                    1] in self.EMISSION_MATRIX[best] else 0.0
                        bestTag = self.tags[
                            bestTagIndex]  #viterbi[bestTagIndex, oIndex - 1]

                    if viterbi[bestTagIndex, oIndex - 1] == 0:
                        if self.bacckoff.bacckoff is not None:
                            self.bacckoff.bacckoff.tagword(observations[0])

                        else:
                            viterbi[bestTagIndex, oIndex - 1] = 1
                            bestTag = "OTHER"

                else:
                    viterbi[bestTagIndex, oIndex - 1] = 1
                    bestTag = "OTHER"
                print(" Backoff tag : ", bestTag, "viterbi : ",
                      viterbi[bestTagIndex, oIndex - 1])
            #print([viterbi[i, oIndex-1] for i in range(N) ])
            backTrack.append((observations[oIndex - 1], bestTag))
            for tIndex in range(N):
                if self.tags[tIndex] == '<E>': continue

                viterbi[tIndex, oIndex] = viterbi[bestTagIndex, oIndex - 1] * \
                                          (emissionTable[self.tags[tIndex]][observations[oIndex]] if observations[oIndex] in
                                                                                                emissionTable[self.tags[
                                                                                                    tIndex]] else 0.0) * \
                                          (transitionTable[self.tags[tIndex]][(bestTag,bestTag2)] if (bestTag,bestTag2) in transitionTable[self.tags[tIndex]] else 0.0)

                # if the observation belongs to a tag other than OTHER, we eliminate OTHER; note: the index of the tag OTHER in the TAGS array is 0
                '''if (tIndex > 0 and viterbi[tIndex, oIndex] > 0.0):
                    viterbi[0, oIndex] = 0.0;'''

        # we save the backtrack of the last Observation
        bestTagIndex = numpy.argmax([viterbi[i, T - 1] for i in range(N)])
        bestTag = self.tags[bestTagIndex]
        backTrack.append((observations[T - 1], bestTag))

        for (word, tag) in backTrack:
            if tag == 'UNKNWN':
                if self.bacckoff is None:
                    tag = "OTHER"
                else:
                    self.bacckoff.backOffPrent(backTrack)

        return backTrack

    def tagText(self, text, algorithm="Viterbi"):
        self.loadTables()

        Tokenizer = BasicTokenize()
        tokens = Tokenizer.tokenize(text)
        tokens = [self.stemmer.stem(token) for token in tokens]
        return self.__viterbi(tokens, self.EMISSION_MATRIX,
                              self.TRANSITION_MATRIX)

    def tagTokens(self, tokens: list, algorithm="Viterbi"):
        self.loadTables()
        tokens = [self.stemmer.stem(token) for token in tokens]
        return self.__viterbi(tokens, self.EMISSION_MATRIX,
                              self.TRANSITION_MATRIX)
Example #29
def main(download_settings_filename, parse_settings_filename,
         similarity_settings_filename):
    with open(download_settings_filename, 'r') as f:
        download_config = json.load(f)
    with open(parse_settings_filename, 'r') as f:
        parse_config = json.load(f)
    with open(similarity_settings_filename, 'r') as f:
        similarity_config = json.load(f)
    topic = download_config.get('topic', 'Medicine')
    data_dir = os.path.join(
        download_config.get('save_dir', os.path.join('data', 'wiki')), topic)
    n_pages = download_config.get('min_pages', 500)
    vocab_dir = os.path.join(
        parse_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic,
        'vocab')
    save_dir = os.path.join(
        similarity_config.get('save_dir', os.path.join('artifacts', 'wiki')),
        topic, 'graph')
    vocab_top_k = similarity_config.get('vocab_top_k', 100)
    graph_top_k = similarity_config.get('graph_top_k', 10)
    metric = similarity_config.get('metric', 'euclidean')

    json_files = glob(os.path.join(vocab_dir, '*.json'))

    total_vocab_filename = os.path.join(vocab_dir, 'total_count.json')
    with open(total_vocab_filename, 'r') as f:
        total_vocab = json.load(f)

    total_freq = FreqDist(total_vocab)
    total_number_words = total_freq.N()
    most_freq_words = total_freq.most_common(vocab_top_k)
    percentage_used = 100 * sum([x[1] for x in most_freq_words
                                 ]) / total_number_words
    total_vocab_list = [x[0] for x in most_freq_words]
    all_vocabs = []
    good_json_indices = []
    print(
        'reading in preprocessed vocabulary using {:.2f}% of the total count of words'
        .format(percentage_used))
    i = -1
    for json_file in tqdm(json_files):
        i += 1
        if json_file == total_vocab_filename:
            continue
        with open(json_file, 'r') as f:
            doc_vocab = json.load(f)
            vec = create_count_vector(doc_vocab, total_vocab_list)
            if sum(vec) > 0:
                all_vocabs.append(vec)
                good_json_indices.append(i)
        if len(good_json_indices) >= n_pages:
            print('found at least {} suitable pages - breaking out of loop'.
                  format(n_pages))
            break
    good_json_indices = np.array(good_json_indices)

    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(all_vocabs)

    print('computing similarity matrix')
    similarity_matrix = pairwise_distances(tfidf, metric=metric)

    # NOTE: ignore shortest as this is always "self"
    shortest_indices = np.argsort(similarity_matrix,
                                  axis=-1)[:, 1:graph_top_k + 1]

    print('finding top-{} closest pages'.format(graph_top_k))
    pbar = tqdm(total=graph_top_k * min(shortest_indices.shape[0], n_pages))
    analysis_results = {}
    for i in range(graph_top_k):
        ith_shortest = similarity_matrix[np.arange(shortest_indices.shape[0]),
                                         shortest_indices[:, i]]
        ith_shortest_indices = shortest_indices[:, i]
        for doc_index in range(min(shortest_indices.shape[0], n_pages)):
            doc_name = os.path.basename(
                json_files[good_json_indices[doc_index]])
            doc_name = urllib.parse.unquote(doc_name[:doc_name.rfind('.')])
            if doc_name not in analysis_results:
                analysis_results[doc_name] = {"names": [], "similarities": []}
            ith_shortest_doc_name = os.path.basename(
                json_files[good_json_indices[ith_shortest_indices[doc_index]]])
            ith_shortest_doc_name = urllib.parse.unquote(
                ith_shortest_doc_name[:ith_shortest_doc_name.rfind('.')])
            analysis_results[doc_name]["names"].append(ith_shortest_doc_name)
            analysis_results[doc_name]["similarities"].append(
                ith_shortest[doc_index])
            pbar.update(1)
    pbar.close()

    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, 'raw_graph_info.json'), 'w') as f:
        json.dump(analysis_results, f, indent=4)
Example #30
class DirichletWords(object):
    def initialize_index(self):
        self.word_to_int = {}
        self.int_to_word = {}

    def __init__(self,
                 num_topics,
                 alpha_topic=1.0,
                 alpha_word=1.0,
                 max_tables=50000,
                 sanity_check=False,
                 initialize=False,
                 report_filename="topic_history.txt"):

        self.max_tables = max_tables
        self._alphabet = FreqDist()
        # store all words seen in a list so they are associated with a unique ID.

        self.initialize_index()

        self._words = FreqDist()

        self.alpha_topic = alpha_topic
        self.alpha_word = alpha_word

        self._num_updates = 0
        self._report = None
        if report_filename:
            self._report = open(report_filename, 'w')

        self.num_topics = num_topics
        self._topics = [FreqDist() for x in xrange(num_topics)]

        # the sanity_check flag is for testing only.
        if initialize and sanity_check == True:
            self.deterministic_seed()
        elif initialize:
            self.initialize_topics()

    def deterministic_seed(self):
        ''' if sanity_check = True, this will seed the topics with enough variance
        to evolve but do so in the most basic and deterministic way possible, so a
        user can follow along each step of the algorithm'''

        chars = "abcdefghijklmnopqrstuvwxyz"
        for i in xrange(3):
            word = random.choice(chars)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])

    def initialize_topics(self):
        ''' initializes the topics with some random seed words so that they have
        enough relative bias to evolve when new words are passed in.  '''
        # we are going to create some random string from /dev/urandom. to convert
        # them to a string, we need a translation table that is 256 characters.
        translate_table = (string.letters * 5)[:256]
        # /dev/urandom is technically not as random as /dev/random, but it doesn't
        # block.
        r = open('/dev/urandom')
        # make random 'words' and add them to the topics. they'll never
        # realistically be seen again- which is good since we just want them to
        # seed the bias in the topics.
        for i in xrange(self.num_topics):
            word_length = random.randint(9, 20)
            word = r.read(word_length).translate(translate_table)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])
        r.close()

    def __len__(self):
        return len(self._words)

    def num_words(self):
        return sum(1 for x in self._words if self._words[x] >= 1)

    def as_matrix(self):
        ''' Return a matrix of the probabilities of all words over all topics.
        note that because we are using topic_prob(), this is equivalent to the
        expectation of log beta, i.e. Elogbeta '''

        #  XXX TODO we should store this on the fly instead of recomputing it
        #  all the time!

        # create a numpy array here because that's what the e_step in streamLDA
        # expects

        num_words = self.num_words()
        print("%i words" % num_words)
        lambda_matrix = n.zeros((self.num_topics, num_words))

        for word_index, word in enumerate(x for x in self._words \
                                          if self._words[x] >= 1):
            topic_weights = [log(self.topic_prob(k, word)) \
                             for k in xrange(self.num_topics)]

            # topic weights for this word-- a column vector.
            lambda_matrix[:, word_index] = topic_weights

        self._num_updates += 1
        if self._report:
            self._report.write("%i %i %i %i\n" % (self._num_updates,
                                                  len(self._alphabet), \
                                                  len(self._words),
                                                  sum(x.B() for x in self._topics)))

        return lambda_matrix

    def forget(self, proportion):

        num_tables = len(self._words)
        number_to_forget = int(proportion * num_tables)
        if num_tables > self.max_tables:
            number_to_forget += (num_tables - self.max_tables)

        # change this to weight lower probability
        tables_to_forget = random.sample(xrange(num_tables), number_to_forget)
        words = self._words.keys()

        self.initialize_index()

        word_id = -1
        for ii in words:
            word_id += 1

            if not word_id in tables_to_forget:
                self.index(ii)
                continue

            count = self._words[ii]
            for jj in xrange(self.num_topics):
                self._topics[jj][ii] = 0
                del self._topics[jj][ii]

            for jj in ii:
                self._alphabet[jj] -= count
            self._words[ii] = 0
            del self._words[ii]

    def seq_prob(self, word):
        val = 1.0

        # Weighted monkeys at typewriter
        for ii in word:
            # Add in a threshold to make sure we don't have zero probability sequences
            val *= max(self._alphabet.freq(ii), CHAR_SMOOTHING)

        # Normalize
        val /= 2**(len(word))
        return val

    def merge(self, otherlambda, rhot):
        ''' fold the word counts in another DirichletWords object into this
        one, weighted by rhot. assumes self.num_topics is the same for both
        objects. '''

        all_words = self._words.keys() + otherlambda._words.keys()
        distinct_words = list(set(all_words))

        # combines the probabilities, with otherlambda weighted by rho, and
        # generates a new count by combining the number of words in the old
        # (current) lambda with the number in the new. here we essentially take
        # the same steps as update_count but do so explicitly so we can weight the
        # terms appropriately.
        total_words = float(self._words.N() + otherlambda._words.N())

        self_scale = (1.0 - rhot) * total_words / float(self._words.N())
        other_scale = rhot * total_words / float(otherlambda._words.N())

        for word in distinct_words:
            self.index(word)

            # update word counts
            new_val = (self_scale * self._words[word] +
                       other_scale * otherlambda._words[word])
            if new_val >= 1.0:
                self._words[word] = new_val
            else:
                self._words[word] = 0
                del self._words[word]

            # update topic counts
            for topic in xrange(self.num_topics):
                new_val = (self_scale * self._topics[topic][word] +
                           other_scale * otherlambda._topics[topic][word])
                if new_val >= 1.0:
                    self._topics[topic][word] = new_val
                else:
                    self._topics[topic][word] = 0
                    del self._topics[topic][word]

        # update sequence counts
        all_chars = self._alphabet.keys() + otherlambda._alphabet.keys()
        distinct_chars = list(set(all_chars))

        for ii in distinct_chars:
            self._alphabet[ii] = (self_scale * self._alphabet[ii] +
                                  other_scale * otherlambda._alphabet[ii])

    def word_prob(self, word):
        return (self._words[word] + self.alpha_word * self.seq_prob(word)) / \
               (self._words.N() + self.alpha_word)

    def topic_prob(self, topic, word):
        return (self._topics[topic][word] + \
                self.alpha_topic * self.word_prob(word)) / \
                (self._topics[topic].N() + self.alpha_topic)

    def update_count(self, word, topic, count):
        # create an index for the word
        self.index(word)

        # increment the frequency of the word in the specified topic
        self._topics[topic][word] += count
        # also keep a separate frequency count of the number of times this word has
        # appeared, across all documents.
        self._words[word] += count
        # finally, keep track of the appearance of each character.
        # note that this does not assume any particular character set nor limit
        # recognized characters. if words contain punctuation, etc. then they will
        # be counted here.
        for ii in word:
            self._alphabet[ii] += count

    def index(self, word):
        assert not isinstance(word, int)

        if not word in self.word_to_int:
            self.word_to_int[word] = len(self.word_to_int)
            self.int_to_word[self.word_to_int[word]] = word

        return self.word_to_int[word]

    def dictionary(self, word_id):
        assert isinstance(word_id, int)

        return self.int_to_word[word_id]

    def print_probs(self, word):
        print "----------------"
        print word
        for ii in xrange(self.num_topics):
            print ii, self.topic_prob(ii, word)
        print "WORD", self.word_prob(word)
        print "SEQ", self.seq_prob(word)