Example #1
 def test_tokenise_nt(self):
     cases = {
         "Don't": [WordToken("do"), WordToken("not")],
         "hasn't": [WordToken("has"), WordToken("not")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #2
 def test_tokenise_comma(self):
     cases = {
         "I, for one.": [WordToken("i"), PunctuationToken(","), WordToken("for"), WordToken("one"),
                         PunctuationToken(".")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #3
 def test_tokenise_ve(self):
     # I've -> I have, as there is no ambiguity
     cases = {
         "I've": [WordToken("i"), WordToken("have")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #4
 def test_tokenise_ll(self):
     cases = {
         "I'll": [WordToken("i"), WordToken("will")],
         "Sam'll": [WordToken("sam"), WordToken("will")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #5
 def test_tokenise_d(self):
     # he'd -> he had / he would is ambiguous, so just leave the 'd as is
     cases = {
         "It'd": [WordToken("it"), WordToken("'d")],
         "He'd": [WordToken("he"), WordToken("'d")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #6
 def test_tokenise_bracket(self):
     cases = {
         "(I, for one.)": [PunctuationToken("("), WordToken("i"), PunctuationToken(","), WordToken("for"),
                           WordToken("one"),
                           PunctuationToken("."), PunctuationToken(")")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #7
 def test_tokenise_s(self):
     # 's could be has, a genitive marker or is, so just leave it as is
     cases = {
         "It's": [WordToken("it"), WordToken("'s")],
         "He's": [WordToken("he"), WordToken("'s")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #8
 def test_tokenise_id(self):
     # B456F7-3
     cases = {
         "like B456F7-3": [WordToken("like"), WordToken("B456F7-3")],
         "like B456F7-3-like": [WordToken("like"), WordToken("B456F7-3-like")],
         "8-years-old": [WordToken("8-years-old")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #9
 def test_tokenise_hyphen(self):
     # "eight-year-old-child"
     # 8-year-old ? cf tokenise_id
     cases = {
         "eight-year-old child": [WordToken("eight"), PunctuationToken("-"), WordToken("year"),
                                  PunctuationToken("-"), WordToken("old"), WordToken("child")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #10
 def test_tokenise_capital_middle(self):
     # I thought it was GREAT. -> GREAT keeps its capitals
     # I love Paris -> Paris keeps its capital
     # Paris I love -> paris + i + love (sentence-initial Paris and "I" go lowercase)
     cases = {
         "I thought it was GREAT": [WordToken("i"), WordToken("thought"), WordToken("it"), WordToken("was"),
                                    WordToken("GREAT")],
         "I love Paris": [WordToken("i"), WordToken("love"), WordToken("Paris")],
         "Paris I love": [WordToken("paris"), WordToken("i"), WordToken("love")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #11
 def test_tokenise_slash(self):
     # love/hate relationship
     cases = {
         "love/hate relationship": [WordToken("love"), PunctuationToken("/"), WordToken("hate"),
                                    WordToken("relationship")],
         "this love/ hate relationship": [WordToken("this"), WordToken("love"), PunctuationToken("/"),
                                          WordToken("hate"),
                                          WordToken("relationship")],
         "weird-love /hate relationship": [WordToken("weird"), PunctuationToken("-"), WordToken("love"),
                                           PunctuationToken("/"), WordToken("hate"),
                                           WordToken("relationship")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #12
    def parse_text(self, text):
        """
        Parses a text and creates tokens.

        Args:
            text (str): A string representing a sentence.

        Returns:
            [token]: List of word and punctuation tokens.
        """

        raw_tokens = nltk.word_tokenize(text)
        tokens = []

        for raw_token in raw_tokens:
            if raw_token in self.punctuation_mapping:
                punctuation_type = self.punctuation_mapping[raw_token]
                tokens.append(PunctuationToken(raw_token, punctuation_type))
            else:
                word_token = self.process_word(raw_token)
                if word_token is None:
                    continue
                tokens.append(WordToken(word_token))

        if self.POS_TAGGING:
            self.pos_tag(tokens)

        return tokens
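For orientation, here is a minimal sketch of the two collaborators parse_text relies on. The class name, the punctuation_mapping values and the process_word rule below are illustrative assumptions; only the attribute and method names are taken from the code above.

class TextParser(object):
    POS_TAGGING = False

    # raw punctuation string -> punctuation type handed to PunctuationToken
    # (illustrative values only)
    punctuation_mapping = {
        ",": "comma",
        ".": "period",
        "(": "open_bracket",
        ")": "close_bracket",
    }

    def process_word(self, raw_token):
        # illustrative normalisation: keep fully-uppercased words, lowercase
        # everything else; returning None makes parse_text skip the token
        if raw_token.isupper() and len(raw_token) > 1:
            return raw_token
        return raw_token.lower()

    # parse_text (shown above) and pos_tag would be the remaining methods

With collaborators along these lines, parse_text("I, for one.") would produce the word/punctuation sequence asserted in test_tokenise_comma above.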
Example #13
 def test_period(self):
     # Ph.D -> Ph.D
     # U.S.A. -> U.S.A + period if end of sentence, U.S.A if middle
     cases = {
         "I have a Ph.D.": [WordToken("i"), WordToken("have"), WordToken("a"), WordToken("Ph.D"),
                            PunctuationToken(".")],
         "Make U.K. great again.": [WordToken("make"), WordToken("U.K"), PunctuationToken("."), WordToken("great"),
                                    WordToken("again"),
                                    PunctuationToken(".")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #14
    def train(self,
              training_docs,
              classes_count,
              params={
                  "smooth": 0.2,
                  "neg_scope": compute_neg_punc,
                  "bulk": False
              }):
        """all_docs: all_docs[class_index] = array of paths to a document classified as class_index"""
        self.__smooth_constant = params["smooth"]
        try:
            use_stopwords = params["stopwords"]
        except KeyError:
            use_stopwords = False
        stopwords = compute_stopwords_list()
        # p(c) -> count of documents classified as c / all docs
        # p(f|c) ->
        #   (count of f in documents of class c + smooth)
        #   / (total tokens in documents of class c + (vocab size of class c + unseen vocab) * smooth)
        # As we keep vocabulary for each class separate, have an array of dictionaries
        # vocabs[class][word] = frequency of word in the class
        count_docs_per_class = [0] * classes_count
        total_docs = len(training_docs)

        # total tokens per class
        total_tokens = [0] * classes_count

        # Array of vocabularies for each class
        # note: [{}] * n would give n references to the same dict, hence the comprehension
        vocabs = [{} for i in xrange(0, classes_count)]

        # Array of vocabulary size for each class
        vocab_sizes = [0] * classes_count

        # populate
        # total_tokens[i] - increment for each token
        # vocabs[i][token] - 1 if unseen, increment if seen
        # vocab_sizes - increment for each unseen token

        for tokens, label in training_docs:
            vocab = vocabs[label]
            count_docs_per_class[label] += 1
            other_label = [l for l in xrange(0, classes_count) if l != label]
            neg_array = params["neg_scope"](tokens, params["neg_words"],
                                            *params["scope_arg"])
            assert len(tokens) == len(neg_array)
            for i in xrange(0, len(tokens)):
                token = tokens[i]
                if isinstance(token, PunctuationToken):
                    continue
                if use_stopwords and token.value in stopwords:
                    continue
                negated = neg_array[i]
                if negated:
                    neg_token = token
                    token = WordToken("NOT_{}".format(token.value))
                else:
                    neg_token = WordToken("NOT_{}".format(token.value))
                freq_so_far = 0
                try:
                    freq_so_far = vocab[token]
                except KeyError:
                    vocab_sizes[label] += 1
                vocab[token] = freq_so_far + 1
                total_tokens[label] += 1
                if params["augment"]:
                    for l in other_label:
                        other_vocab = vocabs[l]
                        neg_freq_so_far = 0
                        try:
                            neg_freq_so_far = other_vocab[neg_token]
                        except KeyError:
                            vocab_sizes[l] += 1
                        other_vocab[neg_token] = neg_freq_so_far + 1
                        total_tokens[l] += 1
        p_c = map(lambda x: x / float(total_docs), count_docs_per_class)

        self.total_tokens = total_tokens
        self.vocabs = vocabs
        self.vocab_sizes = vocab_sizes
        self.classes_count = classes_count
        self.p_c = p_c
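To make the smoothing comment above concrete, here is the p(f|c) estimate worked through with made-up counts (the numbers are purely illustrative):

# one class c, smoothing constant 0.2
smooth = 0.2
freq_in_c = 3          # "great" counted 3 times in documents of class c
total_tokens_c = 100   # non-punctuation tokens seen in class c
vocab_size_c = 40      # distinct words seen in class c
unseen_count = 10      # words of the test document never seen in class c

w = vocab_size_c + unseen_count
p_f_c = (freq_in_c + smooth) / float(total_tokens_c + smooth * w)
# = 3.2 / 110.0, roughly 0.029

This mirrors the Decimal computation in classify below, where w is self.vocab_sizes[i] + unseen_vocabs_count[i].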
Example #15
    def classify(self,
                 tokens,
                 params={
                     "smooth": 0.2,
                     "neg_scope": compute_neg_punc
                 }):
        try:
            use_stopwords = params["stopwords"]
        except KeyError:
            use_stopwords = False
        stopwords = compute_stopwords_list()
        best_prob = 0
        best_class = 0
        # token -> frequency in file
        vocabs_in_file = {}
        # unseen_vocabs_count[i] = number of distinct tokens in this document
        # that were never seen in training documents of class i
        unseen_vocabs_count = [0] * self.classes_count
        neg_array = params["neg_scope"](tokens, params["neg_words"],
                                        *params["scope_arg"])
        assert len(neg_array) == len(tokens)

        for j in xrange(0, len(tokens)):
            token = tokens[j]
            if isinstance(token, PunctuationToken):
                continue
            if use_stopwords and token.value in stopwords:
                continue
            if neg_array[j]:
                token = WordToken("NOT_{}".format(token.value))
            # if we have seen this before in this document, increment frequency
            try:
                vocabs_in_file[token] += 1
            # if we haven't, then
            # - initialise its frequency in this document to 1
            # - check if it was seen in a document of class i. If not then increment unseen_vocabs_count[i]
            except KeyError:
                vocabs_in_file[token] = 1
                for i in xrange(0, self.classes_count):
                    if token not in self.vocabs[i]:
                        unseen_vocabs_count[i] += 1
        for i in xrange(0, self.classes_count):
            prob = Decimal(self.p_c[i])
            w = self.vocab_sizes[i] + unseen_vocabs_count[i]
            for j in xrange(0, len(tokens)):
                token = tokens[j]
                # skip punctuation and stopwords here too, mirroring the counting loop above
                if isinstance(token, PunctuationToken):
                    continue
                if use_stopwords and token.value in stopwords:
                    continue
                if neg_array[j]:
                    token = WordToken("NOT_{}".format(token.value))
                try:
                    freq_in_c = self.vocabs[i][token]
                except KeyError:
                    freq_in_c = 0
                # p(f|c) ->
                #   (count of f in documents of class c + smooth)
                #   / (total tokens in documents of class c + (vocab size of class c + unseen vocab) * smooth)
                p_f_c = Decimal(
                    (freq_in_c + self.__smooth_constant) /
                    float(self.total_tokens[i] + self.__smooth_constant * w))
                prob *= p_f_c
            if prob > best_prob:
                best_class = i
                best_prob = prob
        return best_class
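A minimal end-to-end usage sketch for train and classify. The import paths, the classifier class name and the no-op negation-scope helper are assumptions for illustration; note that the default params dicts above omit the neg_words and scope_arg keys the methods read, so a caller has to supply a complete params dict, as here.

# hypothetical import paths; adjust to the real module layout
from tokeniser import WordToken
from classifier import NaiveBayes  # assumed name of the class holding train/classify above

def no_negation(tokens, neg_words, *args):
    # stand-in for compute_neg_punc: mark nothing as negated
    return [False] * len(tokens)

params = {
    "smooth": 0.2,
    "neg_scope": no_negation,
    "neg_words": [],
    "scope_arg": (),
    "augment": False,
}

# (tokens, label) pairs; WordToken must be hashable, since tokens are used
# as dictionary keys in the per-class vocabularies
training_docs = [
    ([WordToken("great"), WordToken("film")], 1),
    ([WordToken("boring"), WordToken("plot")], 0),
]

nb = NaiveBayes()
nb.train(training_docs, 2, params)
label = nb.classify([WordToken("great"), WordToken("film")], params)  # -> 1 with these toy counts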
Example #16
 def test_tokenise_whitespace(self):
     cases = {
         "an    apple.": [WordToken("an"), WordToken("apple"), PunctuationToken(".")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example #17
 def test_tokenise_i(self):
     cases = {
         "well I think": [WordToken("well"), WordToken("i"), WordToken("think")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected