Example #1
from nltk import FreqDist
from nltk.corpus import brown, stopwords


def chapter2_exercise17():
    """Return the 50 most frequent non-stopword words in the Brown 'romance' category."""
    stopword_set = set(stopwords.words("english"))
    freq_dist = FreqDist(brown.words(categories='romance'))
    # Iterate over a snapshot of the keys so entries can be removed while looping.
    for word in list(freq_dist):
        if word in stopword_set:
            freq_dist.pop(word)
    return freq_dist.most_common(50)
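
The function above can be exercised directly once the required NLTK data packages are installed; the short driver below is a hypothetical sketch, not part of the original example.

import nltk

nltk.download('brown')       # Brown corpus used by the exercises
nltk.download('stopwords')   # stopword list used for filtering

print(chapter2_exercise17()[:10])   # ten most frequent content words in 'romance'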
Example #2
import nltk
from nltk import FreqDist
from nltk.corpus import brown, stopwords


def chapter2_exercise18():
    """Return the 50 most frequent bigrams in the Brown 'humor' category,
    ignoring stopwords and non-alphabetic tokens."""
    freq_dist = FreqDist(brown.words(categories='humor'))
    stopword_set = set(stopwords.words("english"))
    # Drop stopwords and punctuation/numeric tokens; iterate over a copy
    # so entries can be removed while looping.
    for word in freq_dist.copy():
        if not word.isalpha() or word in stopword_set:
            freq_dist.pop(word)
    # Count only bigrams whose two members both survived the filtering.
    bigrams_dist = FreqDist(
        (item1, item2)
        for item1, item2 in nltk.bigrams(brown.words(categories='humor'))
        if item1 in freq_dist and item2 in freq_dist
    )
    return bigrams_dist.most_common(50)
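
For reference, nltk.bigrams simply yields adjacent token pairs, which is what the comprehension above filters against the cleaned frequency distribution:

import nltk

list(nltk.bigrams(['one', 'fine', 'day']))
# -> [('one', 'fine'), ('fine', 'day')]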
Example #3
import pickle
from collections import defaultdict

import torch
from torch import Tensor
from torch.autograd import Variable
from nltk import FreqDist


class Vocab(object):
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object.

        The special-token constants (PAD_ID/PAD_TOKEN, UNK_ID/UNK_TOKEN,
        SOS_ID/SOS_TOKEN, EOS_ID/EOS_TOKEN) and the tensor helpers
        to_tensor / to_var are assumed to come from the surrounding project.
        """
        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer

    def update(self, max_size=None, min_freq=1):
        """
        Initialize id2word & word2id based on self.freqdist;
        max_size includes the 4 special tokens.
        """

        # {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.id2word = {
            PAD_ID: PAD_TOKEN,
            UNK_ID: UNK_TOKEN,
            SOS_ID: SOS_TOKEN,
            EOS_ID: EOS_TOKEN
        }
        # {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.word2id = defaultdict(
            lambda: UNK_ID)  # Not in vocab => return UNK
        self.word2id.update({
            PAD_TOKEN: PAD_ID,
            UNK_TOKEN: UNK_ID,
            SOS_TOKEN: SOS_ID,
            EOS_TOKEN: EOS_ID
        })
        # self.word2id = {
        #     PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID,
        #     SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID
        # }

        vocab_size = 4
        min_freq = max(min_freq, 1)

        # Reset frequencies of special tokens
        # [...('<eos>', 0), ('<pad>', 0), ('<sos>', 0), ('<unk>', 0)]
        freqdist = self.freqdist.copy()
        special_freqdist = {
            token: freqdist[token]
            for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]
        }
        freqdist.subtract(special_freqdist)

        # Sort by frequency (descending), breaking ties alphabetically
        # Ex) freqdist = { 'a': 4,   'b': 5,   'c': 3 }
        #  =>   sorted = [('b', 5), ('a', 4), ('c', 3)]
        sorted_frequency_counter = sorted(freqdist.items(),
                                          key=lambda k_v: k_v[0])
        sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True)

        for word, freq in sorted_frequency_counter:

            if freq < min_freq or vocab_size == max_size:
                break
            self.id2word[vocab_size] = word
            self.word2id[word] = vocab_size
            vocab_size += 1

        self.vocab_size = vocab_size

    def __len__(self):
        return len(self.id2word)

    def load(self, word2id_path=None, id2word_path=None):
        if word2id_path:
            with open(str(word2id_path), 'rb') as f:
                word2id = pickle.load(f)
            # Can't pickle lambda function
            self.word2id = defaultdict(lambda: UNK_ID)
            self.word2id.update(word2id)
            self.vocab_size = len(self.word2id)

        if id2word_path:
            with open(str(id2word_path), 'rb') as f:
                id2word = pickle.load(f)
            self.id2word = id2word

    def add_word(self, word):
        assert isinstance(word, str), 'Input should be str'
        self.freqdist.update([word])

    def add_sentence(self, sentence, tokenized=False):
        if not tokenized:
            sentence = self.tokenizer(sentence)
        for word in sentence:
            self.add_word(word)

    def add_dataframe(self, conversation_df, tokenized=True):
        for conversation in conversation_df:
            for sentence in conversation:
                self.add_sentence(sentence, tokenized=tokenized)

    def pickle(self, word2id_path, id2word_path):
        with open(str(word2id_path), 'wb') as f:
            pickle.dump(dict(self.word2id), f)

        with open(str(id2word_path), 'wb') as f:
            pickle.dump(self.id2word, f)

    def to_list(self, list_like):
        """Convert list-like containers to list"""
        if isinstance(list_like, list):
            return list_like

        if isinstance(list_like, Variable):
            return list(to_tensor(list_like).numpy())
        elif isinstance(list_like, Tensor):
            return list(list_like.numpy())

    def id2sent(self, id_list):
        """list of id => list of tokens (Single sentence)"""
        id_list = self.to_list(id_list)
        sentence = []
        for id in id_list:
            word = self.id2word[id]
            if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]:
                sentence.append(word)
            if word == EOS_TOKEN:
                break
        return sentence

    def sent2id(self, sentence, var=False):
        """list of tokens => list of id (Single sentence)"""
        id_list = [self.word2id[word] for word in sentence]
        if var:
            id_list = to_var(torch.LongTensor(id_list), eval=True)
        return id_list

    def decode(self, id_list):
        sentence = self.id2sent(id_list)
        return ' '.join(sentence)
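
A minimal usage sketch for the class above, assuming the project's special-token constants (PAD_ID, UNK_ID, SOS_ID, EOS_ID and their token strings) are defined; the whitespace tokenizer and the ids in the comments are illustrative only.

vocab = Vocab(tokenizer=str.split)
vocab.add_sentence("hello world hello")
vocab.update(max_size=10000, min_freq=1)
ids = vocab.sent2id(["hello", "world"])   # e.g. [4, 5]: ids 0-3 are the special tokens
print(vocab.decode(ids))                  # -> "hello world"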
Example #4
class Vocab(object):
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Vocabulary with five special tokens (pad/unk/sos/eos/sep); the
        *_ID and *_TOKEN constants come from the surrounding project."""
        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer
        self.pad_id = PAD_ID

    def update(self, max_size=None, min_freq=1):
        self.id2word = {
            PAD_ID: PAD_TOKEN,
            UNK_ID: UNK_TOKEN,
            SOS_ID: SOS_TOKEN,
            EOS_ID: EOS_TOKEN,
            SEP_ID: SEP_TOKEN,
        }
        self.word2id = defaultdict(
            lambda: UNK_ID)  # Not in vocab => return UNK
        self.word2id.update({
            PAD_TOKEN: PAD_ID,
            UNK_TOKEN: UNK_ID,
            SOS_TOKEN: SOS_ID,
            EOS_TOKEN: EOS_ID,
            SEP_TOKEN: SEP_ID,
        })

        vocab_size = 5
        min_freq = max(min_freq, 1)

        freqdist = self.freqdist.copy()
        special_freqdist = {
            token: freqdist[token]
            for token in
            [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, SEP_TOKEN]
        }
        freqdist.subtract(special_freqdist)

        sorted_frequency_counter = sorted(freqdist.items(),
                                          key=lambda k_v: k_v[0])
        sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True)

        for word, freq in sorted_frequency_counter:
            if freq < min_freq or vocab_size == max_size:
                break
            self.id2word[vocab_size] = word
            self.word2id[word] = vocab_size
            vocab_size += 1

        self.vocab_size = vocab_size

    def __len__(self):
        return len(self.id2word)

    def load(self, word2id_path=None, id2word_path=None, ptb=False):
        if word2id_path:
            with open(word2id_path, 'rb') as f:
                word2id = pickle.load(f)
            self.word2id = defaultdict(lambda: UNK_ID)
            self.word2id.update(word2id)
            self.vocab_size = len(self.word2id)

        if id2word_path:
            with open(id2word_path, 'rb') as f:
                id2word = pickle.load(f)
            self.id2word = id2word

        if ptb:
            self.word2id['<sep>'] = self.vocab_size
            self.id2word[self.vocab_size] = '<sep>'
            self.vocab_size += 1

    def add_word(self, word):
        assert isinstance(word, str), 'Input should be str'
        self.freqdist.update([word])

    def add_sentence(self, sentence, tokenized=False):
        if not tokenized:
            sentence = self.tokenizer(sentence)
        for word in sentence:
            self.add_word(word)

    def add_dataframe(self, conversation_df, tokenized=True):
        for conversation in conversation_df:
            for sentence in conversation:
                self.add_sentence(sentence, tokenized=tokenized)

    def pickle(self, word2id_path, id2word_path):
        with open(word2id_path, 'wb') as f:
            pickle.dump(dict(self.word2id), f)

        with open(id2word_path, 'wb') as f:
            pickle.dump(self.id2word, f)

    def to_list(self, list_like):
        if isinstance(list_like, list):
            return list_like

        if isinstance(list_like, Variable):
            return list(to_tensor(list_like).numpy())
        elif isinstance(list_like, Tensor):
            return list(list_like.numpy())

    def id2sent(self, id_list):
        id_list = self.to_list(id_list)
        sentence = []
        for id in id_list:
            word = self.id2word[id]
            if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]:
                sentence.append(word)
            if word == EOS_TOKEN:
                break
        return sentence

    def sent2id(self, sentence, var=False):
        id_list = [self.word2id[word] for word in sentence]
        if var:
            id_list = to_var(torch.LongTensor(id_list), eval=True)
        return id_list

    def decode(self, id_list):
        sentence = self.id2sent(id_list)
        return ' '.join(sentence)
Example #5
import os
import pickle
from collections import defaultdict

import numpy as np
import torch
from torch import Tensor
from torch.autograd import Variable
from nltk import FreqDist


class Vocab(object):
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Vocabulary object that also builds a GloVe-initialised embedding matrix.

        The special-token constants (PAD_ID/PAD_TOKEN, UNK_ID/UNK_TOKEN,
        SOS_ID/SOS_TOKEN, EOS_ID/EOS_TOKEN) and the helpers to_tensor / to_var
        are assumed to come from the surrounding project.
        """
        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer

    def update(self, glove_dir, max_size=None, min_freq=1):
        """
        Initialize id2word & word2id based on self.freqdist and build the
        GloVe embedding matrix; max_size includes the 4 special tokens.
        """

        # {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.id2word = {
            PAD_ID: PAD_TOKEN, UNK_ID: UNK_TOKEN,
            SOS_ID: SOS_TOKEN, EOS_ID: EOS_TOKEN
        }
        # {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.word2id = defaultdict(lambda: UNK_ID)  # Not in vocab => return UNK
        self.word2id.update({
            PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID,
            SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID
        })
        # self.word2id = {
        #     PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID,
        #     SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID
        # }

        vocab_size = 4
        min_freq = max(min_freq, 1)

        # Reset frequencies of special tokens
        # [...('<eos>', 0), ('<pad>', 0), ('<sos>', 0), ('<unk>', 0)]
        freqdist = self.freqdist.copy()
        special_freqdist = {token: freqdist[token]
                            for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]}
        freqdist.subtract(special_freqdist)

        # Sort by frequency (descending), breaking ties alphabetically
        # Ex) freqdist = { 'a': 4,   'b': 5,   'c': 3 }
        #  =>   sorted = [('b', 5), ('a', 4), ('c', 3)]
        sorted_frequency_counter = sorted(freqdist.items(), key=lambda k_v: k_v[0])
        sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True)

        # Load glove vector
        word_emb_dict = self.get_glove_emb(glove_dir)

        for word, freq in sorted_frequency_counter:

            if freq < min_freq or vocab_size == max_size:
                break
            self.id2word[vocab_size] = word
            self.word2id[word] = vocab_size
            vocab_size += 1

        self.vocab_size = vocab_size


        # Create embedding matrix: GloVe vector if available, else random init
        self.embedding_matrix = embedding_matrix = np.zeros((self.vocab_size, 300))

        for word, idx in self.word2id.items():
            if word.lower() in word_emb_dict:
                embedding_matrix[idx] = word_emb_dict[word.lower()]
            else:
                embedding_matrix[idx] = np.random.uniform(-0.25, 0.25, 300)

    def get_glove_emb(self, glove_dir):
        """Read GloVe vectors into a dict: lowercased word -> np.ndarray of dim 300."""
        embeddings_index = {}
        with open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'rb') as f:
            for line in f:
                # Split off the last 300 fields as the vector; the remaining prefix
                # is the token itself (some glove.840B tokens contain spaces).
                values = line.rstrip().rsplit(b' ', 300)
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word.decode().lower()] = coefs
        return embeddings_index


    def __len__(self):
        return len(self.id2word)


    def load(self, word2id_path=None, id2word_path=None, word_emb_path=None):
        if word2id_path:
            with open(word2id_path, 'rb') as f:
                word2id = pickle.load(f)
            # Can't pickle lambda function
            self.word2id = defaultdict(lambda: UNK_ID)
            self.word2id.update(word2id)
            self.vocab_size = len(self.word2id)

        if id2word_path:
            with open(id2word_path, 'rb') as f:
                id2word = pickle.load(f)
            self.id2word = id2word
        
        if word_emb_path:
            with open(word_emb_path, 'rb') as f:
                embedding_matrix = pickle.load(f)
            self.embedding_matrix = embedding_matrix

    def add_word(self, word):
        assert isinstance(word, str), 'Input should be str'
        self.freqdist.update([word])

    def add_sentence(self, sentence, tokenized=False):
        if not tokenized:
            sentence = self.tokenizer(sentence)
        for word in sentence:
            self.add_word(word)

    def add_dataframe(self, conversation_df, tokenized=True):
        for conversation in conversation_df:
            for sentence in conversation:
                self.add_sentence(sentence, tokenized=tokenized)

    def pickle(self, word2id_path, id2word_path, word_emb_path):
        with open(word2id_path, 'wb') as f:
            pickle.dump(dict(self.word2id), f)

        with open(id2word_path, 'wb') as f:
            pickle.dump(self.id2word, f)

        with open(word_emb_path, 'wb') as f:
            pickle.dump(self.embedding_matrix, f)

    def to_list(self, list_like):
        """Convert list-like containers to list"""
        if isinstance(list_like, list):
            return list_like

        if isinstance(list_like, Variable):
            return list(to_tensor(list_like).numpy())
        elif isinstance(list_like, Tensor):
            return list(list_like.numpy())

    def id2sent(self, id_list):
        """list of id => list of tokens (Single sentence)"""
        id_list = self.to_list(id_list)
        sentence = []
        for id in id_list:
            word = self.id2word[id]
            if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]:
                sentence.append(word)
            if word == EOS_TOKEN:
                break
        return sentence

    def sent2id(self, sentence, var=False):
        """list of tokens => list of id (Single sentence)"""
        id_list = [self.word2id[word] for word in sentence]
        if var:
            id_list = to_var(torch.LongTensor(id_list), eval=True)
        return id_list

    def decode(self, id_list):
        sentence = self.id2sent(id_list)
        return ' '.join(sentence)
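
As a hypothetical follow-up (not part of the original example), the GloVe-initialised matrix built by update() could seed a PyTorch embedding layer, with PAD_ID as the padding index assumed from the surrounding project:

import torch
import torch.nn as nn

embedding = nn.Embedding(vocab.vocab_size, 300, padding_idx=PAD_ID)
embedding.weight.data.copy_(torch.from_numpy(vocab.embedding_matrix))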