def test_simple_program_constructor_load(self):
    """Tokenising "()" after an explicit load() yields the two PAREN tokens."""
    source = "()"
    expected = [("PAREN", "("), ("PAREN", ")")]
    tok = tokeniser(source)
    tok.load(source)
    tok.tokenise()
    self.assertListEqual(expected, tok.output)
def category_wordcount(cls, data, name, weighted=False, tau=20):
    """Plot per-class normalized word-count distributions.

    Args:
        cls: sequence of class labels, one per entry in ``data``.
        data: sequence of raw texts to tokenise and count.
        name: name passed through to ``plot_dist`` for the output plot.
        weighted: if True, scale each text's counts by its class weight
            from ``calculate_weights(cls)``.
        tau: smoothing/normalization parameter forwarded to ``normalize_wc``.
    """
    if weighted:
        cls_weights = calculate_weights(cls)
    # Split texts into lower-cased tokens
    lemmas = [[token.lower() for token in tokeniser(str(text))] for text in data]
    # Get label -> int mappings
    cls_dict = make_value_dict(cls)
    # Count all word occurrences
    prior_wc = Counter()
    # Count word occurrences per class
    per_class_wc = [Counter() for _ in cls_dict]
    # NOTE: loop variable renamed — the original rebound the `data` parameter,
    # shadowing it inside the loop.
    for lemma, label in zip(lemmas, cls):
        lemma_counter = Counter(lemma)
        if weighted:
            for word in lemma_counter:
                lemma_counter[word] *= cls_weights[label]
        prior_wc += lemma_counter
        # `label` is the class of this text, so index the per-class counter
        # directly instead of re-reading cls[i].
        per_class_wc[cls_dict[label]] += lemma_counter
    # Normalize by number of all occurrences
    norm_per_class_wc = normalize_wc(per_class_wc, prior_wc, tau)
    # Plot normalized distributions
    plot_dist(norm_per_class_wc, get_labels(cls_dict), name)
def transform(self, X, y=None, *args, **kwargs):
    """Tokenise the text column of X, append coarse POS tags, and re-attach
    the second column of X.

    Returns an array whose rows are [token list, tag string, original X[:, 1]].
    """
    # A letter repeated 3+ times is most likely emphasis, so collapse it
    # to a single occurrence.
    deduped = (sub(r'(\w)\1\1*', r'\1', str(raw)) for raw in X[:, 0])
    # Question-aware preprocessing of the lower-cased text.
    sentences = (self.isQuestion(text.lower()) for text in deduped)
    rows = []
    for sentence in sentences:
        lowered = [word.lower() for word in tokeniser(str(sentence))]
        # "a" + first two chars of each tag gives a coarse tag vocabulary.
        tags = " ".join(
            list("a" + tag[:2] for tag in self.lemmatizer.tagger(str(sentence))[1]))
        rows.append(np.array([lowered, tags]))
    tokens = np.array(rows)  # tokens plus tags, one row per input text
    return np.append(tokens, X[:, 1].reshape(-1, 1), axis=1)
def get_response(self, message):
    """Produce the bot's reply to one user message, updating conversation state.

    Args:
        message: raw user input string.

    Returns:
        The bot's reply: "Goodbye!" on "bye", "..?" when no tokens are
        recognised, otherwise the next clarifying question/statement.
    """
    # Original unpacked req/question/options from state, but only the
    # current question is actually read here.
    question = self.state.question
    # closing
    if message.lower() == "bye":
        return "Goodbye!"
    # send input to tokenizer
    tokens = tokeniser(message)[1]
    if len(tokens) < 1:
        return "..?"
    # track of current conversation-create and update
    # Conversation object
    self.state.add_line(tokens[0])
    # understand the prepositions to better find where the info is
    # todo submodule, for now check everything, which works pretty well tbh
    # at [theatre], watch|see [movie], at [time]
    tags = get_tags(tokens, Bot.ntm, Bot.ntt, question)
    # logic for what to do if there is more than one of the above,
    # must narrow it down
    # input items into the MovieRequest object based on the current
    # state of the tags
    # returns the new question that it needs to know to finish the request
    # returns statement, the question itself
    question, statement = narrow(self.state, tags, Bot.ntm, Bot.ntt)
    # if we are still on the same question, add to the counter
    # works because question is an immutable Int
    if self.state.question == question:
        self.state.timeout += 1
    else:
        self.state.timeout = 0
    self.state.question = question
    self.state.starting = False
    return statement
def test_creation_arg_load(self):
    """Constructing with a program string still yields a tokeniser instance."""
    src = "()"
    instance = tokeniser(src)
    self.assertIsInstance(instance, tokeniser)
def setUp(self):
    """Create a fresh, empty tokeniser before each test."""
    self.tokeniser = tokeniser()