from flask import jsonify, request

# Parser, GeoCode, WikiSearch and BOT are assumed to be provided by the
# app's own modules.


def process():
    """Process the user input."""
    content = {}
    if "message" in request.form and ":help" in request.form["message"]:
        # Display the instructions if the user types :help
        content = BOT.instructions
    elif "message" in request.form:
        # Parse the user input
        parser = Parser()
        parser_response = parser.parse(request.form["message"])
        if "parsed_input" in parser_response:
            # Send the parsed input to the geocoding API
            geo_code = GeoCode()
            geo_response = geo_code.api_request(
                parser_response["parsed_input"])
            if "place_name_fr" in geo_response:
                content["map"] = geo_response
                content["map"].update(BOT.found_place)
                # Send the coordinates to Wikipedia
                wiki_search = WikiSearch()
                wiki_response = wiki_search.geo_search_article(
                    content["map"]["latitude"],
                    content["map"]["longitude"])
                if "url" in wiki_response:
                    content["article"] = wiki_response
                    content["article"].update(BOT.found_article)
            else:
                content = BOT.not_found
        else:
            content = BOT.parse_error
    return jsonify(content)
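# A minimal sketch of how process() might be exposed as a Flask endpoint.
# The application object, the URL rule and the POST method are assumptions
# for illustration, not part of the original source.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/process", view_func=process, methods=["POST"])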
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8

from app.mailer import Mailer
from app.parser import Parser
from app.template import MailTemplate
from app.url_fetcher import URLFetcher

fetcher = URLFetcher()
fetcher.fetch()
if fetcher.content:
    parser = Parser(fetcher.content)
    parser.parse()
    if parser.has_new_content:
        Mailer.send(MailTemplate(fetcher.url, parser.links))
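# A hedged sketch of the duck-typed interface the script above relies on:
# URLFetcher.fetch() populates .content and .url, Parser(content).parse()
# populates .has_new_content and .links, and Mailer.send() delivers a
# MailTemplate built from that url and those links. The stub below is an
# illustrative stand-in, not the project's real parser.
class StubParser:
    def __init__(self, content):
        self.content = content
        self.has_new_content = False
        self.links = []

    def parse(self):
        # Illustrative extraction: collect double-quoted href targets.
        self.links = [part.split('"')[0]
                      for part in self.content.split('href="')[1:]]
        self.has_new_content = bool(self.links)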
def test_parser_return_expected_string(sentence, sentence_parsed):
    parser = Parser(sentence)
    assert parser.parse() == sentence_parsed
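# One possible way to feed the test above, using pytest's parametrize
# decorator. The sentence/expected-output pairs are made-up placeholders,
# not values from the real test suite.
import pytest


@pytest.mark.parametrize("sentence, sentence_parsed", [
    ("Salut, où se trouve la Tour Eiffel ?", "tour eiffel"),
    ("Bonjour, je cherche le Louvre", "louvre"),
])
def test_parser_return_expected_string(sentence, sentence_parsed):
    parser = Parser(sentence)
    assert parser.parse() == sentence_parsed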
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

# Parser is assumed to be provided by the project's own parsing module.


class Classifier:
    """
    A classifier based on the calculation of distances between words.

    It can classify a text into forbidden/restricted classes (i.e. 'armas',
    'cigarro', 'prostituição', 'remédios', 'serviços') or into the permitted
    class.

    # Attributes:
        model (KeyedVectors, None): a word2vec model trained on the
            Portuguese Wikipedia. If it is None, the model is loaded during
            the initialization of the classifier. Default value: None.
        status (str): indicates the text's level of processing during
            classification.
    """

    def __init__(self, model=None):
        self.parser = Parser()
        if model is None:
            # TODO: download the model if it is not present locally
            self.model = KeyedVectors.load_word2vec_format(
                'wiki.pt/wiki.pt.vec')
        else:
            self.model = model
        self.status = "extracting words from website"

    def calc_dists(self, word, kws):
        """
        Calculates the distances between a word and a list of keywords based
        on the similarity of their word2vec representations.

        # Input:
            - word (str): the word that will be compared (based on
              similarity) to a list of keywords.
            - kws (list): list of Keywords.

        # Output:
            - dists (np.ndarray): the distances (float) calculated between
              the word and each keyword in kws.
        """
        dists = []
        for kw in kws:
            dists += [self.model.similarity(word, kw.word)]
        return np.array(dists)

    def check_in_vocab(self, word):
        """
        Checks whether a word is in the vocabulary of the model or not.

        # Input:
            - word (str or Keyword): the word to be verified.

        # Output:
            - boolean value indicating whether the word is part of the
              model's vocabulary.
        """
        if isinstance(word, str):
            return word in self.model.wv.vocab
        return word.word in self.model.wv.vocab

    def rm_unseen(self, words):
        """
        Given a list of words, returns those that are part of the model's
        vocabulary.

        # Input:
            - words (list): a list of strings or Keywords.

        # Output:
            - list of the words that are present in the model's vocabulary.
        """
        return [word for word in words if self.check_in_vocab(word)]

    def prepare_result(self, result, url, thresh, kw_result):
        """
        Prepares the answer structure to be displayed to the user on the
        website.

        # Input:
            - result (dict): maps each label to its verdict.
            - url (str): a URL string.
            - thresh (float): minimum similarity for a keyword to be
              considered present in the content.
            - kw_result (dict): maps each label name to a pandas Series
              which maps keywords to similarities.

        # Output:
            - answer (dict): contains the URL, the classification and the
              reasons (keywords and label).
        """
        answer = dict()
        answer['url'] = url
        # Keep the label with the highest verdict, if any exceeds the
        # threshold.
        max_res = (None, thresh)
        for label, res in result.items():
            if res > max_res[1]:
                max_res = (label, res)
        restrict = max_res[0] is not None
        permit_ans = "not very correlated to any restricted categories"
        answer['restrict'] = restrict
        reason = ("highly correlated to " + max_res[0].name
                  if restrict else permit_ans)
        other_reason = (kw_result[max_res[0].name].to_dict()
                        if restrict else dict())
        other_reason = {key.word: value
                        for key, value in other_reason.items()}
        answer['reasons'] = [reason, other_reason if restrict else dict()]
        answer['label'] = max_res[0].name if restrict else 'permitted'
        return answer

    def classify(self, url, kws, labels, dist_thresh=0.20, kws_thresh=0.49):
        """
        Classifies a URL based on the word2vec similarity between the words
        extracted from its HTML content and a set of keywords. This is a
        generator: it yields status strings while it runs, and its final
        value is the output of the prepare_result function.

        # Input:
            - url (str): a URL string.
            - kws (list): a list of Keywords from the Keyword database.
            - labels (list): a list of Labels from the Label database.

            *Optional:*
            - dist_thresh (float, 0.20): minimum similarity for a label to
              be considered present in the content.
            - kws_thresh (float, 0.49): minimum similarity for a keyword to
              be considered present in the content.
        """
        kws = self.rm_unseen(kws)
        yield self.status
        words = self.parser.parse(url)
        if len(words) == 0 or (len(words) == 1 and words[0] == ""):
            yield "error"
            return
        words = self.rm_unseen(words)
        for label in labels:
            label.keywords = self.rm_unseen(label.keywords)
        self.status = "calculating"
        yield self.status
        # One row per extracted word, one column per keyword.
        dists = []
        for word in words:
            dists += [self.calc_dists(word, kws)]
        dists = np.array(dists)
        df = pd.DataFrame(dists, columns=kws)
        result = dict()
        key_results = dict()
        for label in labels:
            # Mean similarity of each of the label's keywords over all the
            # extracted words, then the fraction of keywords above the
            # threshold.
            key_mean = df[label.keywords].mean(axis=0)
            key_results[label.name] = key_mean
            result[label] = (key_mean > dist_thresh).mean()
        self.status = "formulating answer"
        yield self.status
        yield self.prepare_result(result, url, kws_thresh, key_results)
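# A minimal usage sketch for Classifier.classify(). Because classify() is a
# generator, the caller iterates over it: the intermediate items are status
# strings and the final item is the answer dict from prepare_result(). The
# SimpleNamespace stand-ins, the keyword/label values and the URL below are
# hypothetical; the real Keyword and Label objects come from the project's
# database, and running this needs the wiki.pt word2vec model and the
# project's Parser to be available.
from types import SimpleNamespace

kws = [SimpleNamespace(word="arma"), SimpleNamespace(word="cigarro")]
labels = [SimpleNamespace(name="armas", keywords=[kws[0]]),
          SimpleNamespace(name="cigarros", keywords=[kws[1]])]

classifier = Classifier()
answer = None
for step in classifier.classify("http://example.com", kws, labels):
    answer = step  # keep the last yielded item: prepare_result's dict
print(answer)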