Example #1
    def test_simple_program_constructor_load(self):
        # the program is passed both to the constructor and to load()
        input_program = "()"
        t = tokeniser(input_program)
        correct_output = [("PAREN", "("), ("PAREN", ")")]
        t.load(input_program)
        t.tokenise()
        self.assertListEqual(correct_output, t.output)
Example #2
from collections import Counter


def category_wordcount(cls, data, name, weighted=False, tau=20):
    # cls holds the class label of each sample in data; the helpers
    # (calculate_weights, make_value_dict, normalize_wc, plot_dist,
    # get_labels, tokeniser) come from the surrounding project
    if weighted:
        cls_weights = calculate_weights(cls)

    # Split sentences into lower-cased tokens
    lemmas = [[u.lower() for u in tokeniser(str(t))] for t in data]

    # Get label -> int mappings
    cls_dict = make_value_dict(cls)

    # Count all word occurrences
    prior_wc = Counter()
    # Count word occurrences per class
    per_class_wc = [Counter() for _ in cls_dict]
    for lemma, label in zip(lemmas, cls):
        lemma_counter = Counter(lemma)
        if weighted:
            # scale each count by the weight of the sample's class
            for word in lemma_counter:
                lemma_counter[word] *= cls_weights[label]
        prior_wc += lemma_counter
        per_class_wc[cls_dict[label]] += lemma_counter

    # Normalize by the number of all occurrences
    norm_per_class_wc = normalize_wc(per_class_wc, prior_wc, tau)

    # Plot normalized distributions
    plot_dist(norm_per_class_wc, get_labels(cls_dict), name)
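The normalize_wc helper is project-specific and not shown in this listing. As a rough sketch only, a tau-based normalisation consistent with the call above could look like this (the project's actual formula may well differ):

from collections import Counter

def normalize_wc(per_class_wc, prior_wc, tau):
    # hypothetical: score each word by its per-class share of all
    # occurrences, skipping words seen fewer than tau times overall
    normalised = []
    for class_wc in per_class_wc:
        scores = Counter()
        for word, count in class_wc.items():
            if prior_wc[word] >= tau:
                scores[word] = count / prior_wc[word]
        normalised.append(scores)
    return normalised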
Example #3
    def transform(self, X, y=None, *args, **kwargs):
        # needs: import numpy as np; from re import sub
        # remove all non alphanumeric characters
        # no_non_alphanumeric_chars = map(lambda t: sub(r'[^a-zA-Z0-9]+', ' ', str(t)), X[:, 0])

        # a letter repeated three or more times most likely emphasises the
        # text, so such runs are shortened to a single occurrence
        no_triple_chars = map(lambda t: sub(r'(\w)\1{2,}', r'\1', str(t)),
                              X[:, 0])

        weighted_questions = map(lambda t: self.isQuestion(t.lower()),
                                 no_triple_chars)

        # tokenise each text and pair it with its POS tags, each tag
        # prefixed with "a" and truncated to two characters
        def tokens_and_tags(t):
            text = str(t)
            tokens = [u.lower() for u in tokeniser(text)]
            tags = " ".join("a" + u[:2]
                            for u in self.lemmatizer.tagger(text)[1])
            return np.array([tokens, tags])

        tokens = np.array([tokens_and_tags(t) for t in weighted_questions])
        return np.append(tokens, X[:, 1].reshape(-1, 1), axis=1)
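The repeated-character rule is easy to check in isolation with only the standard library:

from re import sub

# runs of three or more identical characters collapse to one
print(sub(r'(\w)\1{2,}', r'\1', "it was sooo goooood"))  # it was so god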
Example #4
    def get_response(self, message):
        req, question, options = self.state.req, self.state.question, self.state.options

        # closing
        if message.lower() == "bye":
            return "Goodbye!"

        # send input to tokenizer
        tokens = tokeniser(message)[1]

        if len(tokens) < 1:
            return "..?"

        # keep track of the current conversation: create and update the
        # Conversation object
        self.state.add_line(tokens[0])

        # look at the prepositions to better locate where each piece of
        # information sits
        # TODO: move this into a submodule; for now check everything,
        # which works well in practice
        # at [theatre], watch|see [movie], at [time]

        tags = get_tags(tokens, Bot.ntm, Bot.ntt, question)

        # if there is more than one candidate for any of the above, narrow
        # it down: fill the MovieRequest object from the current state of
        # the tags, then return the next question needed to finish the
        # request and the statement (the question text) to send back
        question, statement = narrow(self.state, tags, Bot.ntm, Bot.ntt)

        # if we are still on the same question, bump the timeout counter
        # (the comparison works because question is an immutable int)
        if self.state.question == question:
            self.state.timeout += 1
        else:
            self.state.timeout = 0
        self.state.question = question

        self.state.starting = False
        return statement
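A minimal interactive loop around get_response might look like the sketch below; the Bot() construction is hypothetical, and only the "bye"/"Goodbye!" behaviour is taken from the code above:

bot = Bot()  # hypothetical: construction is not shown in the listing
while True:
    reply = bot.get_response(input("> "))
    print(reply)
    if reply == "Goodbye!":  # returned once the user says "bye"
        break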
Example #5
    def test_creation_arg_load(self):
        input_program = "()"
        t = tokeniser(input_program)
        self.assertIsInstance(t, tokeniser)
Example #6
    def setUp(self):
        self.tokeniser = tokeniser()
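Taken together, the tests in Examples #1, #5 and #6 pin down a small surface: an optional constructor argument, load(), tokenise(), and an output list of (type, value) pairs. A minimal sketch that satisfies just these tests (the real tokeniser presumably recognises far more than parentheses) could be:

class tokeniser:
    def __init__(self, program=None):
        # the program may be supplied here or later via load()
        self.program = program
        self.output = []

    def load(self, program):
        self.program = program

    def tokenise(self):
        # hypothetical minimal rule: only parentheses become tokens
        self.output = [("PAREN", c) for c in self.program if c in "()"]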