Example #1
    def __init__(self, lda_based_context):
        self.lda_based_context = lda_based_context
        self.doc =\
            Document(Constants.ITEM_TYPE + '-topic-models-nouns-complete')
        self.num_cols = 5
        self.num_topics = Constants.LDA_NUM_TOPICS
        self.rgb_tuples = None
        self.automatic_context_topic_colors = None
        self.keyword_context_topic_colors = None
        self.manual_context_topic_colors = None
        self.automatic_context_topic_ids = None
        self.keyword_context_topic_ids = None
        self.manual_context_topic_ids = None
        self.automatic_context_topic_words = None
        self.keyword_context_topic_words = None
        self.manual_context_topic_words = None
        self.headers = None
        self.topic_words_map = None
        self.table_format = '|c|' + 'c|' * (self.num_cols + 1)
        self.tagger = nltk.PerceptronTagger()
        self.tag_count_map = {'NN': 0, 'JJ': 0, 'VB': 0}

        self.init_colors()
        self.init_headers()
        self.init_topic_words()
        self.init_topic_ids()
        new_comm = UnsafeCommand(
            'newcommand', r'\exampleCommand', options=4,
            extra_arguments=r'\colorbox[rgb]{#1,#2,#3}{#4} \color{black}')
        self.doc.append(new_comm)
        new_comm2 = UnsafeCommand('tiny')
        self.doc.append(new_comm2)
Example #2
def create_vocab_directional(corpus):
    vocab = {}
    tagger = nltk.PerceptronTagger()
    # only nouns, verbs and adjectives
    with open(corpus, 'r') as f:
        for line in f:
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                sentence = alt_alpha(sentence).lower()
                words = sentence.split()
                pos_words = tagger.tag(words)
                words = []
                for word, pos in pos_words:
                    if pos.startswith(('NN', 'JJ', 'VB')):
                        words.append(word)
                words = [
                    lemmatizer.lemmatize(word) for word in words
                    if word not in stopwords
                ]
                for word in words:
                    if word not in vocab:
                        vocab[word] = len(vocab)
                        vocab[str(word) + '/l'] = len(vocab)
                        vocab[str(word) + '/r'] = len(vocab)
    return vocab
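
# Illustrative usage (a sketch; 'corpus.txt' is a hypothetical path and the ids follow insertion order):
#     vocab = create_vocab_directional('corpus.txt')
#     vocab['dog'], vocab['dog/l'], vocab['dog/r']   # three consecutive ids, e.g. 0, 1, 2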
Example #3
class APTaggerUtils(object):
    tagger = nltk.PerceptronTagger()

    def tag(self, tokens, tagset=None):
        tagged_tokens = APTaggerUtils.tagger.tag(tokens)
        if tagset:
            tagged_tokens = [(token, nltk.map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
        return tagged_tokens
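
# Illustrative usage (a sketch; the exact tags depend on the pretrained model):
#     APTaggerUtils().tag(['the', 'cat', 'sat'])
#     # e.g. [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
#     APTaggerUtils().tag(['the', 'cat', 'sat'], tagset='universal')
#     # e.g. [('the', 'DET'), ('cat', 'NOUN'), ('sat', 'VERB')]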
Example #4
def create_vocab_count(corpus):
    vocab_count = {}
    tagger = nltk.PerceptronTagger()
    # only nouns, verbs and adjectives
    with open(corpus, 'r') as f:
        for line in f:
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                sentence = clean_sentence(sentence).lower()
                words = sentence.split()
                pos_words = tagger.tag(words)
                words = []
                for word, pos in pos_words:
                    if pos.startswith(('NN', 'JJ', 'VB')):
                        words.append(word)
                words = [word for word in words if word not in stopwords]
                for word in words:
                    if word in vocab_count:
                        vocab_count[word] += 1
                    else:
                        vocab_count[word] = 1
    return vocab_count
Example #5
def tag_words(text, tagger=None):
    """
    Tags the words contained in the given text using part-of-speech tags. The
    text is split into sentences and it returns a list of lists with the tagged
    words. One list for every sentence.

    :param tagger: a part-of-speech tagger. This parameter is useful in order to
    avoid the initialization of the tagger every time this method is called,
    since the initialization can take a long time.
    :param text: the text to tag
    :return: a list of lists with pairs, in the form of (word, tag)
    """
    sentences = get_sentences(text)
    tokenized_sentences = [
        get_words_from_sentence(sent.lower()) for sent in sentences
    ]
    if tagger is None:
        tagger = nltk.PerceptronTagger()

    tagged_words = []
    for sent in tokenized_sentences:
        tagged_words.extend(tagger.tag(sent))
    return tagged_words
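
# Illustrative usage: create the tagger once and reuse it across calls to avoid repeated
# model loading ('documents' is a hypothetical iterable of strings; get_sentences and
# get_words_from_sentence are the helpers assumed by the function above):
#     shared_tagger = nltk.PerceptronTagger()
#     for text in documents:
#         pairs = tag_words(text, tagger=shared_tagger)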
Example #6
def create_bag_of_words(document_list):
    """
    Creates a bag of words representation of the document list given. It removes
    the punctuation and the stop words.

    :type document_list: list[str]
    :param document_list:
    :rtype: list[list[str]]
    :return:
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tagger = nltk.PerceptronTagger()
    cached_stop_words = set(stopwords.words("english"))
    cached_stop_words |= {
        't', 'didn', 'doesn', 'haven', 'don', 'aren', 'isn', 've', 'll',
        'couldn', 'm', 'hasn', 'hadn', 'won', 'shouldn', 's', 'wasn', 'wouldn'
    }
    body = [document.lower() for document in document_list]
    processed = []

    for entry in body:
        row = tokenizer.tokenize(entry)
        tagged_words = tagger.tag(row)

        nouns = []
        for tagged_word in tagged_words:
            if tagged_word[1].startswith('NN'):
                nouns.append(tagged_word[0])

        nouns = [word for word in nouns if word not in cached_stop_words]
        processed.append(nouns)

    return processed
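
# Illustrative usage (a sketch; the exact output depends on the tagger, since only nouns are kept):
#     docs = ['The food was great but the service was slow.', 'Lovely rooms and friendly staff.']
#     create_bag_of_words(docs)
#     # e.g. [['food', 'service'], ['rooms', 'staff']]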
Example #7
    def __init__(self):
        super().__init__(nltk.PerceptronTagger(), self.name)
print()

start_time = time.process_time()
print("* TnT tagger (not in NLTK book)")
tnt_tagger = nltk.TnT()
tnt_tagger.train(train_sents)
print("Seen:", show_example(tnt_tagger.tag(seen_example)))
print("Unseen:", show_example(tnt_tagger.tag(unseen_example)))
print("Evaluation: {:.1%}".format(tnt_tagger.evaluate(test_sents)))
print("Elapsed time: {:.0f} s".format(time.process_time() - start_time))
print()

start_time = time.process_time()
print("* HMM tagger (not in NLTK book)")
hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
print("Seen:", show_example(hmm_tagger.tag(seen_example)))
print("Unseen:", show_example(hmm_tagger.tag(unseen_example)))
print("Evaluation: {:.1%}".format(hmm_tagger.evaluate(test_sents)))
print("Elapsed time: {:.0f} s".format(time.process_time() - start_time))
print()

start_time = time.process_time()
print("* Perceptron tagger (not in NLTK book)")
perp_tagger = nltk.PerceptronTagger(load=False)
perp_tagger.train(train_sents)
print("Seen:", show_example(perp_tagger.tag(seen_example)))
print("Unseen:", show_example(perp_tagger.tag(unseen_example)))
print("Evaluation: {:.1%}".format(perp_tagger.evaluate(test_sents)))
print("Elapsed time: {:.0f} s".format(time.process_time() - start_time))
print()
def perceptron_tag(train_):
    perc_tagger = nltk.PerceptronTagger(load=False)
    perc_tagger.train(train_)
    return perc_tagger
Example #10
        Notes:
            Can raise an exception if Java Development Kit is not installed or not properly configured.

        Examples:
            >>> try:
            ...    StanfordPOSTagger.check('path/to/model', 'path/to/stanford.jar')
            ... except ValueError as e:
            ...    print(e)
            Could not find stanford-postagger.jar jar file at path/to/stanford.jar

        """
        try:
            cls(path_to_model, path_to_jar).tag(())
        except OSError as e:
            raise ValueError(
                'Either Java SDK not installed or some of the files are invalid.\n'
                + str(e))
        except LookupError as e:
            raise ValueError(str(e).strip(' =\n'))

    def __str__(self):
        return "{} (model: {})".format(self.name, self._stanford_model)


taggers = [
    POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'),
    POSTagger(
        nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
        'Treebank POS Tagger (MaxEnt)'),
]
Example #11
# 5.1. Explain in your own words how the Averaged Perceptron algorithm works

# The Averaged Perceptron predicts the tag of a word from features built over the word itself and its
# context (surrounding words and previously predicted tags).
# Its predict function scores every candidate tag by taking the dot product of the feature vector with
# that tag's weights and chooses the highest-scoring tag.
# During training we predict a tag for each word; if the prediction is wrong, we increase the weights of
# the features for the correct tag and decrease them for the wrongly predicted tag. The final weights are
# the average over all update steps, which makes the model less sensitive to the order of the examples.
#
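
# A minimal, self-contained sketch (not NLTK's implementation) of the idea described above:
# every candidate tag is scored by a dot product between the feature vector and that tag's
# weights, the highest-scoring tag is predicted, and on a mistake the weights of the correct
# tag are increased while those of the wrongly predicted tag are decreased. The "averaged"
# part keeps running totals so the final weights are the average over all update steps.

from collections import defaultdict


class TinyAveragedPerceptron:
    def __init__(self, tags):
        self.tags = list(tags)
        self.weights = defaultdict(float)   # (feature, tag) -> current weight
        self._totals = defaultdict(float)   # (feature, tag) -> accumulated weight for averaging
        self._stamps = defaultdict(int)     # (feature, tag) -> step of the last change
        self._steps = 0

    def predict(self, features):
        # features: dict of feature name -> value, e.g. {'word=dog': 1.0, 'prev_tag=DT': 1.0}
        scores = {tag: sum(self.weights[(f, tag)] * v for f, v in features.items())
                  for tag in self.tags}
        return max(self.tags, key=lambda tag: scores[tag])

    def update(self, gold, guess, features):
        self._steps += 1
        if gold == guess:
            return
        for f in features:
            for tag, delta in ((gold, 1.0), (guess, -1.0)):
                key = (f, tag)
                # credit the old weight for the steps it was in effect, then change it
                self._totals[key] += (self._steps - self._stamps[key]) * self.weights[key]
                self._stamps[key] = self._steps
                self.weights[key] += delta

    def finalize(self):
        # replace each weight with its average over all training steps
        for key, w in list(self.weights.items()):
            total = self._totals[key] + (self._steps - self._stamps[key]) * w
            self.weights[key] = total / max(self._steps, 1)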

# 5.2. Train the averaged perceptron tagger on the Brown dataset (full stratified training dataset with 90% of the sentences).

# In[38]:

from nltk.tag.perceptron import PerceptronTagger

# In[39]:

PerceptronAv = nltk.PerceptronTagger(load=False)
PerceptronAv.train(stratified_split_train)

# 5.3. Report on accuracy, and per tag Precision, Recall, F and confusion matrix.
#

# The PerceptronTagger uses a different set of tags (Penn TreeBank) and all our previous work uses the Universal tagset,
# so we want to map the Penn TreeBank tagset to the Universal tagset. This mapping will help us re-use previous code without change.

# In[40]:

from nltk.tag import mapping
tag_dict = mapping.tagset_mapping('en-ptb', 'universal')
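
# Illustrative use of the mapping (a sketch; assumes Penn Treebank tags as stated above):
#     tagged = [('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]
#     universal = [(word, tag_dict[tag]) for word, tag in tagged]
#     # -> [('The', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')]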

# In[41]:

class TextBayes:
    """
    Naive Bayes Classifier for text.
    """

    tagger = nltk.PerceptronTagger()
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = stopwords.words('english')

    UNKNOWN_TOKEN = -1

    def __init__(self, smoothing='add-one'):
        self._smoothing = smoothing
        if self._smoothing not in [None, 'add-one']:
            raise Exception('Unknown smoothing option: {}'.format(smoothing))

        self._classes = []
        """ A list of classes that need to be distinguished """

        self._priors = {}
        """ Dictionary from class (string) to prior, sum of all priors 1 """

        self._cond_probabilities = {}
        """ Dictionary from class (string) to dictionary from token (string) to probability,
         for example: _cond_probabilities['formation']['together'] = P[token together | class formation]

         If smoothing is not None, the inner dictionaries have one extra key: UNKNOWN_TOKEN,
          which has its own probability """

    def train(self, paragraphs, classes):
        """
        :param paragraphs: A list of paragraphs (strings), one training example per paragraph
        :param classes: A list, the same length as `paragraphs`, where each entry is the class name (string) of the corresponding paragraph
        """
        if len(paragraphs) != len(classes):
            raise Exception(
                'Parameters `paragraphs` and `classes` should match in size ({}, {}).'
                .format(len(paragraphs), len(classes)))
        class_counts = Counter(classes)
        self._classes = list(class_counts.keys())
        for c in self._classes:
            self._priors[c] = class_counts[c] / len(classes)
        for c in self._classes:
            self._cond_probabilities[c] = {}

        # create a bag of words for each class
        word_bags = {}
        for clazz in self._classes:
            word_bags[clazz] = []
        for paragraph_i in range(len(paragraphs)):
            paragraph_strip = TextBayes.break_down(paragraphs[paragraph_i])
            clazz = classes[paragraph_i]
            word_bags[clazz].extend(paragraph_strip)

        # create a multiset for each bag of words
        for clazz in self._classes:
            word_bags[clazz] = Counter(word_bags[clazz])

        # compute the conditional probability of each word in each bag
        for clazz in self._classes:
            bag_size = sum(word_bags[clazz].values())
            types_count = len(word_bags[clazz])
            for token, count in word_bags[clazz].items():
                if self._smoothing is None:
                    self._cond_probabilities[clazz][token] = count / bag_size
                else:  # add-one smoothing
                    self._cond_probabilities[clazz][token] = (count + 1) / (
                        bag_size + types_count)
            if self._smoothing == 'add-one':
                self._cond_probabilities[clazz][
                    TextBayes.UNKNOWN_TOKEN] = 1 / (bag_size + types_count)

    def conditional_probability(self, clazz, token):
        if len(self._classes) == 0:
            raise Exception('The classifier has not been trained yet.')
        if clazz not in self._classes:
            raise Exception('Unknown class.')
        if self._smoothing is None:
            return self._cond_probabilities[clazz].get(token, 0)
        else:  # add-one
            return self._cond_probabilities[clazz].get(
                token, self._cond_probabilities[clazz][TextBayes.UNKNOWN_TOKEN])

    def predict(self, paragraph):
        """
        Calculates the most probable class that the paragraph belongs to
        :param paragraph: A string made up of one or more sentences
        :return: A prediction of the class that the paragraph belongs to
        """
        probabilities = self.belong_probabilities(paragraph)
        return argmax(probabilities)

    def belong_probabilities(self, paragraph):
        """
        :param paragraph: A string made up of one or more sentences
        :return: A dictionary from class names to probability, stating the probability of the paragraph belonging to
         each class
        """

        # To prevent underflow we work with log-likelihoods instead of likelihoods, so we add up
        # log-probabilities instead of multiplying probabilities
        loglikelihoods = {}
        for clazz in self._classes:
            cur_likelihood = 0
            for token in TextBayes.break_down(paragraph):
                cond_probability = self.conditional_probability(clazz, token)
                cur_likelihood += log2(cond_probability)
            cur_likelihood += log2(self._priors[clazz])
            loglikelihoods[clazz] = cur_likelihood

        # Because we only care about the ratios between the likelihoods, we can divide them all by a constant,
        # which is the same as subtracting a constant from the log-likelihoods (here, the maximum log-likelihood).
        # We then raise 2 to the shifted values to obtain likelihoods on a comparable scale, normalized below.
        likelihoods_normalized = {}
        max_likelihood = max(loglikelihoods.values())
        for clazz, loglike in loglikelihoods.items():
            likelihoods_normalized[clazz] = 2**(loglike - max_likelihood)

        # Compute the ratios between the likelihoods to get probabilities
        ans = {}
        sum_norm_likelihoods = sum(likelihoods_normalized.values())
        for clazz, norm_likelihood in likelihoods_normalized.items():
            ans[clazz] = norm_likelihood / sum_norm_likelihoods
        return ans

    @staticmethod
    def break_down(paragraph):
        def break_down_weak(paragraph):
            """
            Use natural language processing tools to break down the paragraph into a sequence of tokens

            :param paragraph: A string made up of one or more sentences
            :return: A list of tokens (strings) from the paragraph
            """
            tokens = word_tokenize(paragraph)
            return tokens

        def break_down_strong(paragraph):
            """
            Use natural language processing tools to break down the paragraph into a sequence of lemmatized words.
            Removes English stop words, punctuation, and numbers.

            :param paragraph: A string made up of one or more sentences
            :return: A list of words (strings) from the paragraph
            """
            tokens = word_tokenize(paragraph)
            parts_of_speech = TextBayes.tagger.tag(tokens)
            parts_of_speech = [(t[0], get_wordnet_tag(t[1]))
                               for t in parts_of_speech]
            lemmatized = [
                TextBayes.lemmatizer.lemmatize(t[0], pos=t[1])
                for t in parts_of_speech
            ]
            lowercase = [t.lower() for t in lemmatized]
            return [
                t for t in lowercase if t not in string.punctuation
                and t not in TextBayes.stop_words and not is_number(t)
            ]

        return break_down_weak(paragraph)

    @staticmethod
    def from_file(xml_file, smoothing='add-one'):
        """
        Create a TextBayes classifier from a given corpus
        :param xml_file: XML file path, that has the corpus. The file's structure: corpus > lexelt > instances
        :param smoothing: Smoothing technique for the classifier
        :return: Trained TextBayes object
        """
        instance_list = get_instance_list(xml_file)
        paragraphs = [get_paragraph(instance) for instance in instance_list]
        senses = [get_sense(instance) for instance in instance_list]

        ans = TextBayes(smoothing=smoothing)
        ans.train(paragraphs, senses)
        return ans
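
# A minimal usage sketch with hypothetical data (helpers such as argmax and get_instance_list
# are assumed to exist elsewhere in the project; only train/predict are exercised here):
#
#     clf = TextBayes(smoothing='add-one')
#     clf.train(['the striker scored a late goal', 'parliament passed the new bill'],
#               ['sports', 'politics'])
#     clf.belong_probabilities('the bill was debated')   # dict of class -> probability
#     clf.predict('the bill was debated')                # most likely 'politics'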
Example #13
    def __call__(self, tokens):
        if self._tagger is None:
            self._tagger = nltk.PerceptronTagger()
        return [tag for _, tag in self._tagger.tag(tokens)]
Example #14
    def __init__(self):
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
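
    # Hypothetical sketch (not part of the snippet above) of how translation_dict is typically
    # used downstream: map the first letter of each Penn Treebank tag to a WordNet POS so that
    # the WordNet lemmatizer picks the right lemma.
    #
    #     tagged = self.tagger.tag(tokens)
    #     lemmas = [self.lem.lemmatize(word, self.translation_dict.get(tag[0], wn.NOUN))
    #               for word, tag in tagged]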
print(f"Training data size: {len(training_data)}")
print(f"Validation data size: {len(test_data)}")
print(
    f"Expected size: {len(training_data)+len(test_data)}, actual size: {len(brown_news_tagged)}"
)

# Instantiate and train the following taggers: Unigram; TnT; Perceptron; CRF
print("\nTRAINING MODELS")

unigram = nltk.UnigramTagger(training_data)
print("Unigram Tagger trained")
TnT = nltk.TnT()
TnT.train(training_data)
print("TnT Tagger trained")
perceptron = nltk.PerceptronTagger(load=False)
perceptron.train(training_data)
print("Perceptron Tagger trained")
#CRF = nltk.CRFTagger()
#CRF.train(training_data, "model.crf.tagger")
#print("CRF Tagger trained")

# Save the trained taggers (in a LABA Taggers Map), overwrite existing
import pickle
from pickle import dump
print("\nSAVING MODELS")

ugOutput = open("Unigram.pkl", 'wb')
dump(unigram, ugOutput, -1)
ugOutput.close()
print("Trained Unigram Tagger Saved")
Example #16
    def train(self):
        # train_data = nltk.corpus.brown.tagged_sents(categories=['news','science_fiction'])
        self.tagger = nltk.PerceptronTagger()
        self._trained = True
        return None
Example #17
# Import packages
import nltk
import spacy
import itertools
from collections import Counter
import os
from scipy.sparse import csr_matrix, dok_matrix, save_npz
import argparse
import re
import gc
import numpy as np
from tqdm import tqdm
import pickle
tagger = nltk.PerceptronTagger()
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')


# Pre-procesing
def clean_sentence(sentence):
    new_sent = []
    words = sentence.split()
    words = list(itertools.chain.from_iterable([w.split(',') for w in words]))
    words = list(itertools.chain.from_iterable([w.split('-') for w in words]))
    for word in words:
        new_sent.append(''.join(w for w in word if w.isalpha()))
    return re.sub(r'\s\s+', ' ', ' '.join(new_sent).strip())
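
# Illustrative behaviour (commas and hyphens become word boundaries, non-alphabetic
# characters are dropped, and repeated spaces are collapsed):
#     clean_sentence('Hello, world-wide   web!')   # -> 'Hello world wide web'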

training = brown_tagged[int(length / 5):]
test = brown_tagged[:100]

already_trained = os.path.isfile('unigram_tagger.pkl') and os.path.isfile(
    'tnt_tagger.pkl') and os.path.isfile('perceptron_tagger.pkl')

if (not already_trained):
    # Training
    unigram = nltk.UnigramTagger(training)
    print("Trained Unigram.")

    tnt = nltk.TnT()
    tnt.train(training)
    print("Trained TnT.")

    perceptron = nltk.PerceptronTagger()
    perceptron.train(training)
    print("Trained Perceptron.")

    # CRF skipped due to lack of time to train.
    # crf = nltk.CRFTagger()
    # crf.train(training, 'model.crf.tagger')
    # print("Trained CRF.")

    # Dump trained models as files for later use.
    unigram_output = open('unigram_tagger.pkl', 'wb')
    tnt_output = open('tnt_tagger.pkl', 'wb')
    perceptron_output = open('perceptron_tagger.pkl', 'wb')

    dump(unigram, unigram_output, -1)
    unigram_output.close()
    def generateList(self, command):
        if command == '':
            command = self.command
        tokens = nltk.word_tokenize(command)
        pos_tag = nltk.PerceptronTagger().tag(tokens)
        return pos_tag