import pickle

from nltk.probability import FreqDist, LidstoneProbDist


def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag) as keys
    and smoothed probabilities as values."""
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            # Stop once counts drop below min_freq (the stats file is assumed to be
            # sorted by descending frequency, hence `break` rather than `continue`)
            if int(fields[4]) < min_freq:
                break
            # Get rid of all urls
            if word.startswith("http://"):
                continue
            # # Words that only occur once may only contain letters and hyphens
            # if fields[4] == '1' and any(not (c.isalpha() or c == "-") for c in word):
            #     continue
            # if len(word) > 100:
            #     continue
            simple_msd = fields[1][:fields[1].find('.')] if '.' in fields[1] else fields[1]
            fdist[(word, simple_msd)] += int(fields[4])

    pd = LidstoneProbDist(fdist, smoothingparam, fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as p:
        pickle.dump(pd, p, protocol=protocol)
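A minimal usage sketch, not part of the original snippet: load the pickled model and query a smoothed probability. The pickle path and the (wordform, MSD-tag) key are made-up examples.

import pickle

with open("stats_model.pickle", "rb") as p:   # hypothetical path
    word_prob_dist = pickle.load(p)

# LidstoneProbDist.prob() returns the smoothed probability of a (wordform, MSD-tag)
# key; unseen keys still receive a small non-zero probability due to the smoothing.
print(word_prob_dist.prob(("hund", "NN")))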
Example #2
def make_model(nst_infile, picklefile, protocol=-1):
    """ Train a POS probability model on the NST lexicon and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has compounded POS tags (SUC set) as keys (e.g. "NN+NN")
    and smoothed probabilities as values."""
    # Collect all compounds from nst data
    nst_full_compounds = set()
    with open(nst_infile, encoding='UTF-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            comp = fields[3].replace("!", "")
            pos = fields[4]
            if "+" in comp and "_" not in word and not (comp.startswith("+") or comp.startswith("-")):
                nst_full_compounds.add((word, comp, pos))

    # Build POS probability model
    pos_fdist = FreqDist()
    for _w, _c, pos in nst_full_compounds:
        if '+' in pos:
            pos = re.sub(r"\+LN", "", pos)
            pos_fdist[pos] += 1

    pd = LidstoneProbDist(pos_fdist, 0.001, pos_fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as f:
        pickle.dump(pd, f, protocol=protocol)
Example #3
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # Replace one random interior word with a random vocabulary word
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        # Corrupt one random letter of another random interior word
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(
            lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
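The snippet relies on an ngrams_sentences() helper (and a viterbi() decoder) defined elsewhere. A minimal sketch of what the helper might look like, assuming it simply yields the n-grams of each already-padded sentence:

from nltk.util import ngrams


def ngrams_sentences(sentences, n):
    # Hypothetical helper (not shown in the original example):
    # return the list of n-grams for every sentence.
    return [list(ngrams(sentence, n)) for sentence in sentences]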
Example #4
def paper_title_NLP(title_corpus):

    # title_corpus is a list of tuples
    # keys like (19, 1) mean 2019/01
    # each value is a list of tokenized paper titles
    # reference: https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    title_dict = {}
    pattern = r'''(?x)            # set flag to allow verbose regexps
            (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)*        # words with optional internal hyphens
            | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
            | \.\.\.              # ellipsis
            | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
            '''
    tokenizer = RegexpTokenizer(pattern)
    for t in title_corpus:
        key = (t[3], t[4])
        filtered_text = tokenizer.tokenize(t[1])
        title_dict.setdefault(key, []).append(filtered_text)

    # group the monthly title lists by year
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0], )  # year index
        title_years.setdefault(key, []).append(v)

    deep_freq = []
    for k, v in title_years.items():
        fd = FreqDist()
        vs = [item for sublist in v for item in sublist]
        for v_ in vs:
            for word in v_:
                fd[word] += 1

        print('The keywords for year:20{}'.format(str(k[0])))
        print("Total number of words:{}".format(str(
            fd.N())))  # total number of samples
        print("Total number of unique words:{}".format(str(
            fd.B())))  # number of bins or unique samples
        fd.pprint(50)  # The maximum number of items to display, default is 10
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))
        print(deep_freq)

    plt.plot([2012, 2013, 2014, 2015, 2016, 2017, 2018], deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()
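A quick illustration of the tokenizer pattern used above; the sample title is made up.

from nltk.tokenize import RegexpTokenizer

pattern = r'''(?x)            # set flag to allow verbose regexps
        (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.              # ellipsis
        | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        '''
tokenizer = RegexpTokenizer(pattern)
print(tokenizer.tokenize("Deep Learning for U.S.A. Elections: a $12.40 (82%) case-study..."))
# ['Deep', 'Learning', 'for', 'U.S.A.', 'Elections', ':', 'a',
#  '$12.40', '(', '82%', ')', 'case-study', '...']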
Example #5
from nltk import FreqDist
from nltk.corpus import gutenberg


def show():
    print(gutenberg.fileids())
    # instantiate a frequency distribution
    fd = FreqDist()
    for word in gutenberg.words('austen-persuasion.txt'):
        fd[word] += 1

    print(fd.N())
    print(fd.B())
    # the 10 most frequent words
    for word, value in fd.most_common(10):
        print(word, value)
Example #6
    def get_stop_words(input_file="data/tickets_word2vec.model",
                       threshold=0.02) -> List[str]:
        """
        Get a list of step words base on relative frequency.
        The input could either be the raw CSV file or word2vec model build with genism.
        The input format will be determined by the input_file extension <filename>.[csv|model].
        The `eval` method is a function which takes a float variable,
        word frequency, as a single argument and return a boolean value
        which represent whether a word is a stop word or not.
        By default, we consider the words within the top 2 percentile as stop words.
            >>> from canosp2020.preprocessing import Preprocess
            >>> stopwords = Preprocess.get_stop_words(input_file="data/tickets_word2vec.model", eval=lambda x: x <= 0.2)
        :param input_file: Path to tickets data csv file or genism word2vec model.
        :param eval: A function to evaluate whether a word is stop word of not
        :rtype: A list of words.
        """
        _, extension = os.path.splitext(os.path.basename(input_file))

        if extension == ".csv":
            nlp = spacy.load("en_core_web_sm")

            # Load csv file and merge title and content column
            df = pd.read_csv(input_file)
            df[TITLE_CONTENT] = df["title"] + " " + df["content"]
            df[TITLE_CONTENT].replace("", np.nan, inplace=True)
            df.dropna(subset=[TITLE_CONTENT], inplace=True)
            docs = list(
                nlp.pipe(df[TITLE_CONTENT],
                         disable=["tagger", "parser", "ner"]))
            sents = [[token.text for token in doc] for doc in docs]
            big_words = itertools.chain(*sents)

            # Build frequency distribution
            fdist = FreqDist(big_words)

        elif extension == ".model":
            model = Word2Vec.load(input_file)
            counter = {
                word: vocab.count
                for word, vocab in model.wv.vocab.items()
            }
            counter = dict(
                sorted(counter.items(), key=lambda x: x[1], reverse=True))
            fdist = FreqDist(counter)

        # stopwords = [word for word in fdist if eval(fdist.freq(word))]
        stopwords = [
            each[0] for each in fdist.most_common(int(threshold * fdist.B()))
        ]

        return stopwords
Example #7
    def __init__(self, data, vocab=None, min=10000):
        '''
        By default, the vocabulary size (vocab) is taken to be twice the number of
        observed items, with a minimum size (min) of 10,000. This is somewhat ad hoc,
        but given Zipf's law we would expect the true vocabulary size to be unbounded.
        '''
        if type(data) == Counter or type(data) == dict:
            data = FreqDist(data)
        if vocab is None:
            vocab = max(2 * len(data), min)
        assert vocab >= data.B()
        self._freqdist = data
        self._bins = vocab
        r, nr = self._r_Nr()
        self.find_best_fit(r, nr)
        self._switch(r, nr)
        self.log_renormalise(r, nr)
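A small illustration of the vocabulary heuristic above; the toy data is made up.

from collections import Counter
from nltk import FreqDist

data = FreqDist(Counter("abracadabra"))   # 5 observed types: a, b, r, c, d
vocab = max(2 * len(data), 10000)         # the 10,000 floor dominates for small samples
print(data.B(), vocab)                    # -> 5 10000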
Example #8
def save_parameters_to_file(read_filename, write_filename):
    corrected_text = read_file(read_filename, 0)
    corrected_tokens = [c for word in corrected_text for c in word]

    max_variance = 4
    extra_tokens = 2
    corrected_freq = FreqDist(corrected_tokens)
    max_decoder_seq_length = max(len(x) for x in corrected_text) + extra_tokens
    num_decoder_tokens = corrected_freq.B() + extra_tokens

    max_encoder_seq_length = max_decoder_seq_length - extra_tokens + max_variance
    num_encoder_tokens = num_decoder_tokens

    input_characters = sorted(
        set(corrected_tokens).union(chars).union({'\n', '\t'}))
    target_characters = sorted(set(corrected_tokens).union({'\n', '\t'}))
    # print(input_characters)

    input_token_index = dict([(char, i)
                              for i, char in enumerate(input_characters)])
    target_token_index = dict([(char, i)
                               for i, char in enumerate(target_characters)])

    with open(write_filename, 'w') as file:
        file.write(str(max_decoder_seq_length))
        file.write("\n")
        file.write(str(num_decoder_tokens))
        file.write("\n")
        file.write(str(max_encoder_seq_length))
        file.write("\n")
        file.write(str(num_encoder_tokens))
        file.write("\n")
        file.write(str(input_characters))
        file.write("\n")
        file.write(str(target_characters))
        file.write("\n")
        file.write(str(input_token_index))
        file.write("\n")
        file.write(str(target_token_index))
        file.write("\n")
Example #9
from nltk.corpus import gutenberg
from nltk import FreqDist
import matplotlib.pyplot as plt

fd = FreqDist()

for word in gutenberg.words('bible-kjv.txt'):
    fd[word] += 1

print(fd.N())

print(fd.B())

for word in list(fd.keys()):
    print(word, fd[word])

fd2 = FreqDist()
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd2[word] += 1

ranks = []
freqs = []

for rank, word in enumerate(fd2):
    ranks.append(rank + 1)
    freqs.append(fd2[word])

plt.loglog(ranks, freqs)
plt.xlabel('rank (r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency (f)', fontsize=14, fontweight='bold')
plt.show()
Example #10
class PosNgram:
    def __init__(self, deg=1):
        self.order = deg
        self.__sentence = ""

        # storing tokens and frequency
        self.train_data = FreqDist()
        self.test_sents = None

        # guard against an illegal argument
        if deg < 1:
            self.order = 1

    def poses2tokens(self, pos_terms, include_freq=False, default_dict=None):
        """
        pos_terms must be an element of an n-gram model of the same
        order as the current one.
        """
        if default_dict is None:
            default_dict = self.train_data

        for (tokens, poses), freq in default_dict.items():
            if pos_terms == poses:
                yield (tokens, freq) if include_freq else tokens

    def tokens2poses(self, token_terms, include_freq=False, default_dict=None):
        """
        token_terms must be an element of an n-gram model of the same
        order as the current one.
        """
        if default_dict is None:
            default_dict = self.train_data

        for (tokens, poses), freq in default_dict.items():
            if token_terms == tokens:
                yield (poses, freq) if include_freq else poses

    def pre_process(self, file_id, training_size=90):

        start_processing = time.time()
        self.train_data = FreqDist()

        sents = gutenberg.sents(file_id)
        t_size = floor((training_size / 100) * len(sents))

        train_sents = sents[:t_size]
        self.test_sents = sents[t_size:]

        p_title = "file_id = <{}>, ngram's order = {}, split_ratio = {}-{}"
        print(
            p_title.format(file_id, self.order, training_size,
                           100 - training_size))
        with ICB('Processing...', max=len(train_sents),
                 suffix='%(percent)d%%') as bar:

            for sent in train_sents:
                bar.next()
                self.__sentence = " ".join(sent)
                self.train_data.update(self._token_pos_pairs)

        print('dict_size = {}'.format(self.train_data.B()))
        print("loading time = {}".format(time.time() - start_processing))

    def _is_subcontent(self, w1, w2):
        assert len(w1) <= len(w2)
        w1 = list(w1)
        w2 = list(w2)
        for w in w1:
            if w not in w2:
                return False
            w2.remove(w)
        return True

    def fetch_if(self, cond, term, pos_is_target=True, include_pair=False):

        tmp_freq_dist = FreqDist()

        conditions = {
            ng_prefix: ["pos[:-1] == term", "token[:-1] == term"],
            ng_suffix:
            ["pos[-len(term):] == term", "token[-len(term):] == term"],
            ng_contain: [
                "self._is_subcontent(term, pos)",
                "self._is_subcontent(term , token)"
            ],
            ng_equal: ["pos == term", "token == term"]
        }

        if cond not in conditions:
            cond = ng_prefix

        # Fetching Choice Configuration
        p_key, t_key = "", ""
        if include_pair:
            p_key = "(pos, token)"
            t_key = "(token, pos)"
        else:
            p_key = "pos"
            t_key = "token"
        cmp_p = compile(p_key, '<string>', 'eval')
        cmp_t = compile(t_key, '<string>', 'eval')

        if pos_is_target:
            cmp_cond = compile(conditions[cond][0], '<string>', 'eval')
            for (token, pos), freq in self.train_data.items():
                if eval(cmp_cond):
                    tmp_freq_dist.update({eval(cmp_p): freq})
        else:
            cmp_cond = compile(conditions[cond][1], '<string>', 'eval')
            for (token, pos), freq in self.train_data.items():
                if eval(cmp_cond):
                    tmp_freq_dist.update({eval(cmp_t): freq})

        return tmp_freq_dist

    @property
    def _token_pos_pairs(self):
        """
        This function maps terms to POS
        (The previous version's name was phi1)
        """
        for elems in self._ngram_tokens_pos:
            poses = [elem[1] for elem in elems]

            tokens = [elem[0] for elem in elems]
            yield (tuple(tokens), tuple(poses))

    @property
    def _sent2pos_tag(self):
        sent = self.__sentence
        tokens = word_tokenize(sent)
        return pos_tag(tokens)

    @property
    def _ngram_tokens_pos(self):
        # this returns the tuples of token pos pair
        return ngrams(self._sent2pos_tag, self.order)
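A minimal usage sketch; the corpus id and the n-gram order are arbitrary choices, not from the original code.

model = PosNgram(deg=2)
model.pre_process('austen-emma.txt', training_size=90)

# Tokens observed with the POS-tag bigram ('DT', 'NN'), together with their frequencies
for tokens, freq in model.poses2tokens(('DT', 'NN'), include_freq=True):
    print(tokens, freq)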
Example #11
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t)
                                       for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist,
                                      vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) /
        float(len(bigram_length_probabilities[length]))
        for length in bigram_length_probabilities.keys()
    }
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) /
        float(len(trigram_length_probabilities[length]))
        for length in trigram_length_probabilities.keys()
    }

    random_sentences = [[
        words[random.randint(0,
                             len(words) - 1)].lower() for i in range(key)
    ] for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()),
                         color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()),
                          color='blue')
    random_bigram = plt.scatter(
        list(random_bigram_length_probabilities.values()),
        list(random_bigram_length_probabilities.keys()),
        color='green')
    random_trigram = plt.scatter(
        list(random_trigram_length_probabilities.values()),
        list(random_trigram_length_probabilities.keys()),
        color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the bigram model produced this text of length 30: {}'
        .format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the trigram model produced this text of length 30: {}'
        .format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(
        cpd_bigram, test_bigrams)
    print(
        'Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'
        .format(bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(
        cpd_trigram, test_trigrams)
    print(
        'Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'
        .format(trigram_entropy, trigram_perplexity))
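The example calls predict_word() and centropy_perplexity(), which are defined elsewhere. A minimal sketch of what predict_word() might look like, assuming it returns the most probable continuation of the seed, or None for an unseen context:

def predict_word(cpd, seed, mode):
    # Hypothetical helper (not shown in the original example).
    history = seed.split()
    context = tuple(history[-1:]) if mode == 'bigram' else tuple(history[-2:])
    if context not in cpd:
        return None
    # ProbDistI.max() returns the sample with the highest probability
    return cpd[context].max()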
Example #12
def recordStatsData(corpusname, csvwriter):
    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname, filtered=False)

    numcomments_pnm = 0
    numcomments_pm = 0
    numcomments_cm = 0
    numcomments_cnm = 0

    # post no mention
    if path.exists(processed_corpus_texts[0]):
        print("reading: " + processed_corpus_texts[0])
        freqs_pnm = collectFreqData(processed_corpus_texts[0])
        totalFQ = totalFQ + freqs_pnm
        junk, numcomments_pnm = collectAudienceFreqData(
            processed_corpus_texts[0])
    # post mention
    if path.exists(processed_corpus_texts[1]):
        print("reading: " + processed_corpus_texts[1])
        freqs_pm = collectFreqData(processed_corpus_texts[1])
        totalFQ = totalFQ + freqs_pm
        junk, numcomments_pm = collectAudienceFreqData(
            processed_corpus_texts[1])
    # comment no mention
    if path.exists(processed_corpus_texts[2]):
        print("reading: " + processed_corpus_texts[2])
        freqs_cnm = collectFreqData(processed_corpus_texts[2])
        totalFQ = totalFQ + freqs_cnm
        junk, numcomments_cnm = collectAudienceFreqData(
            processed_corpus_texts[2])
    # comment mention
    if path.exists(processed_corpus_texts[3]):
        print("reading: " + processed_corpus_texts[3])
        freqs_cm = collectFreqData(processed_corpus_texts[3])
        totalFQ = totalFQ + freqs_cm
        junk, numcomments_cm = collectAudienceFreqData(
            processed_corpus_texts[3])

    print("writing")

    towrite = dict()
    towrite["Subreddit"] = corpusname
    towrite["N"] = totalFQ.N()
    towrite["B"] = totalFQ.B()
    towrite[
        "Num Utterences"] = numcomments_pnm + numcomments_pm + numcomments_cm + numcomments_cnm
    towrite["Num Utterences - Post NM"] = numcomments_pnm
    towrite["Num Utterences - Post M"] = numcomments_pm
    towrite["Num Utterences - Comment"] = numcomments_cnm
    towrite["Num Utterences - Comment M"] = numcomments_cm

    if path.exists(processed_corpus_texts[0]):
        towrite["N-Post"] = freqs_pnm.N()
        towrite["B-Post"] = freqs_pnm.B()
    else:
        towrite["N-Post"] = 0
        towrite["B-Post"] = 0

    if path.exists(processed_corpus_texts[1]):
        towrite["N-Post with Mention"] = freqs_pm.N()
        towrite["B-Post with Mention"] = freqs_pm.B()
    else:
        towrite["N-Post with Mention"] = 0
        towrite["B-Post with Mention"] = 0

    if path.exists(processed_corpus_texts[2]):
        towrite["N -Comment"] = freqs_cnm.N()
        towrite["B -Comment"] = freqs_cnm.B()
    else:
        towrite["N -Comment"] = 0
        towrite["B -Comment"] = 0

    if path.exists(processed_corpus_texts[3]):
        towrite["N -Comment with Mention"] = freqs_cm.N()
        towrite["B -Comment with Mention"] = freqs_cm.B()
    else:
        towrite["N -Comment with Mention"] = 0
        towrite["B -Comment with Mention"] = 0
    csvwriter.writerow(towrite)
Example #13
# tokenize words and add the label, shuffle the order, and close the db
comments = [(word_tokenize(c[0]), c[1]) for c in cur]
random.shuffle(comments)
db.close()

# Gather all words from both labels
all_words = []
for c in comments:
    for w in c[0]:
        word = w
        if word[:2] != "//":
            if '*' in word:
                word = word.replace('*','')
            all_words.append(word.lower())
all_words = FreqDist(all_words)
print(all_words.B())

# Use the first 4,000 observed words as features
word_features = list(all_words.keys())[:4000]

# make feature sets from each comment and mark it with a label
# function returns a feature set of form {"example" : True, "word" : False}
# it will be the length of word_features
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
featuresets = [(find_features(comment), label) for (comment, label) in comments]
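A possible next step, not shown in the snippet: train and evaluate an NLTK Naive Bayes classifier on the feature sets. The 80/20 split is an arbitrary choice.

import nltk

split = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:split], featuresets[split:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Accuracy:", nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(15)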
Example #14
from nltk import FreqDist

from common.books import text1

t1 = text1()
fdist1 = FreqDist(t1)
print(fdist1)
print("moby dick has {0} words, {1} unique ones.".format(
    fdist1.N(), fdist1.B()))

voc1 = fdist1.keys()
# the type is dict_keys
# print(type(voc1))

# change back to a normal list
# voc1 = list(voc1)
# print(voc1[:50])

# i = 0
# for k in voc1:
#     if i < 50:
#         print(k)
#         i += 1
#     else:
#         break

print(fdist1.most_common(50))

print("'whale' has {0} occurences.".format(fdist1['whale']))

fdist1.plot(50, cumulative=True)
Example #15
#!/usr/bin/env python3

from nltk.corpus import gutenberg
from nltk import FreqDist

# Count each token in austen-persuasion.txt of the Gutenberg collection
list_of_words = gutenberg.words("austen-persuasion.txt")
fd = FreqDist(list_of_words)  # Frequency distribution object

print("Total number of tokens: " + str(fd.N()))  # number of words: 98171
print("Number of unique tokens: " + str(fd.B()))  # unique words: 6132
print("Top 10 tokens:")  # third common most token is `to`

for token, freq in fd.most_common(10):
    print(token + "\t" + str(freq))
Example #16
    if args.stop_punctuation:
        stoplist += list(punctuation)
        stoplist += [u'\u201d', u'\u201c', u'\u2019', u'\u2014']
        stoplist.append('--')

    words = [word for word in word_tokenize(text) if word not in stoplist]
    if args.stem:
        st = LancasterStemmer()
        words = [st.stem(word) for word in words]

    freq_dist = FreqDist(words)

    print('Total words: ' + str(orig_freq_dist.N()))
    print('Total after filter: ' + str(freq_dist.N()))
    # B() gives the number of unique words (bins)
    print('Unique words: ' + str(freq_dist.B()))
    print('Unique words ratio: ' +
          str(float(freq_dist.B()) / float(freq_dist.N())))
    print('\n')

    if args.words:
        for word in args.words:
            print(word + ': ' + str(freq_dist[word]))
            print(word + ' freq: ' + str(freq_dist.freq(word)))
            print('\n')

    # Show the top `args.num_words` words
    print('Top ' + str(args.num_words) + ' words:')
    freq_dist.tabulate(args.num_words)
Example #17
#!/local/bin/python3

from nltk.corpus import gutenberg
from nltk import FreqDist

list_of_words = gutenberg.words("austen-persuasion.txt")
fd = FreqDist(list_of_words)

print("Total number of tokens: " + str(fd.N()))  #98171
print("Number of unique tokens: " + str(fd.B()))  #6132
print("Top 10 tokens:")  #to
for token, freq in fd.most_common(10):
    print(token + "\t" + str(freq))
Example #18
    sWordFreq = FreqDist(word_tokenize(i))  # word-frequency distribution for this sentence
    for j in sWordFreq:
        if j in unigramsDist:
            unigramsDist[j] += sWordFreq[j]
        else:
            unigramsDist[j] = sWordFreq[j]
# add unseen (out-of-vocabulary) words from the test set
for i in testset:
    word = word_tokenize(i)  # tokens of this sentence
    for j in word:
        if j not in unigramsDist:
            unigramsDist[j] = 0

# Convert counts to probabilities using add-one (Laplace) smoothing;
# unigramsDist.B() is the total amount added when every word gets +1

s = unigramsDist.N() + unigramsDist.B()
unigramsFreq = FreqDist()
for i in unigramsDist:
    unigramsFreq[i] = (unigramsDist[i] + 1) / s

X = sum(unigramsFreq.values())

ppt = []
for sentence in testset:
    logprob = 0
    wt = 0
    for word in word_tokenize(sentence):
        if word in unigramsFreq:
            logprob += log(unigramsFreq[word], 2)
            wt += 1
    if wt > 0:
Example #19
removed_stopword_count = all_word_count - interesting_word_count
removed_stopword_percentage = round(
    (100 * removed_stopword_count) / all_word_count, 2)
print("Removed {} stopwords from the corpus ({}%)".format(
    removed_stopword_count, removed_stopword_percentage))

removed_vocab_count = all_vocab_count - interesting_vocab_count
removed_stopword_vocab_percentage = round(
    (100 * removed_vocab_count) / all_vocab_count, 2)
print("Removed {} stopwords from the vocab ({}%)".format(
    removed_vocab_count, removed_stopword_vocab_percentage))

# Get a frequency distribution for the interesting words.
fd = FreqDist(interesting_word_list)
print("Number of words: {}".format(fd.N()))
print("Number of distinct words: {}".format(fd.B()))

# The most common words.
fd.most_common(10)

freq_list = []
for word in fd.keys():
    freq_list.append([word, fd[word]])

# Sort the words by frequency, from high to low.
sorted_freq_list = sorted(freq_list, key=lambda t: t[1], reverse=True)

# Create a word rank-frequency list, saving each element as a sublist.
rank = 1
freq_rank_list = []
for word in sorted_freq_list:
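    # The snippet is cut off here; a hedged completion, assuming each sublist
    # stores [word, rank, frequency] before the rank counter advances:
    freq_rank_list.append([word[0], rank, word[1]])
    rank += 1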
Example #20
from nltk.corpus import gutenberg
from nltk import FreqDist

import matplotlib
import matplotlib.pyplot as plt

fd = FreqDist()

for text in gutenberg.fileids():
    print(text, end=' ')
    for word in gutenberg.words(text):
        fd[word] += 1
    print("......done")

samples = fd.most_common()

freqs = [freq for _, freq in samples]
ranks = [i for i in range(1, fd.B() + 1)]
# print(freqs)
# print(ranks)

plt.loglog(ranks, freqs)
plt.xlabel('rank (r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency (f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
Example #21
words = reduce(word_split, corpus)

# compute word frequencies and build an index
fd = FreqDist(words)

index = bidict()
pos = 0
for k, c in fd.items():
    index[k] = pos
    pos = pos + 1

# ===== use NLTK's bigrams function to build the bigram count matrix =====
grams = list(bigrams(words))

gc = np.zeros((fd.B(), fd.B()), dtype=np.int32)

# count bigram occurrences
for p1, p2 in grams:
    gc[index[p1], index[p2]] += 1

# estimate bigram probabilities
gp = np.zeros((fd.B(), fd.B()))


# smoothing coefficient
ratio = 0.9

for row in range(0, fd.B()):
    for col in range(0, fd.B()):
        gp[row, col] = ratio * (gc[row, col] / fd[index.inv[row]]) + (
Example #22
# Which corpora are in this collection?
print(gutenberg.fileids())
# ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

# import the FreqDist class
from nltk import FreqDist

# instantiate a frequency distribution
fd = FreqDist()
# count the tokens in the text
for word in gutenberg.words('austen-persuasion.txt'):
    fd[word] += 1

print(fd.N())  # total number of samples
# 98171
print(fd.B())  # number of bins or unique samples
# 6132
# the 10 most frequent words
for word, freq in fd.most_common(10):
    print(word, freq)

# ================ runtime timing ================
run_time = time.time() - start_time
if run_time < 60:  # seconds with two decimals
    print("Elapsed time: {:.2f} s".format(run_time))
elif run_time < 3600:  # whole minutes and seconds
    print("Elapsed time: {:.0f} min {:.0f} s".format(run_time // 60, run_time % 60))
else:  # whole hours, minutes, and seconds
    print("Elapsed time: {:.0f} h {:.0f} min {:.0f} s".format(run_time // 3600, run_time % 3600 // 60, run_time % 60))