def prediction():
    """Flask view: read a user query from the request body and return generated text.

    Expects a JSON request body of the form ``{"input": "<query>"}``.
    Generation is only attempted when the query is detected as English;
    any other detected language yields the sentinel value ``"lang_det_err"``
    in both the ``result`` and ``generated`` fields.

    :return: ``(flask.Response, int)`` — JSON payload with the original
        query, the generated continuation, and the raw model result,
        plus HTTP status 200.
    """
    income_query = json.loads(flask.request.data)['input']
    # Lazy %-style logging args: the message is only formatted when the
    # log level is actually enabled (avoids eager str.format() work).
    logging.info('Query received: %s', income_query)
    lang = lang_detect(income_query)
    logging.info('Lang detected: %s', lang)
    if lang == 'en':
        res = generate(income_query, size=N_TOKENS)
        # The model echoes the prompt; keep only the continuation after it.
        generated = res.split(income_query)[-1]
    else:
        # Non-English (or undetectable) input: signal via sentinel value.
        res = "lang_det_err"
        generated = "lang_det_err"
    logging.info('Result: %s', res)
    return flask.jsonify({
        'version': 'v1.0.0',
        'body': {
            'query': income_query,
            'generated': generated,
            'result': res
        }
    }), 200
def __is_not_deleted_or_not_non_english(comment):
    """Classify a comment as keep/discard for the English-only corpus.

    :param comment: str comment text to classify
    :return: ``[comment, True]`` when *comment* is not the ``"[deleted]"``
        placeholder and is detected as English, else ``[comment, False]``.
    """
    # BUG FIX: the original used `is not` / `is` for string comparison,
    # which tests object *identity*, not equality.  "[deleted]" is a
    # multi-character literal and is not reliably interned, so deleted
    # comments slipped through; the per-character `is 'e'` / `is 'n'`
    # checks only worked by accident of CPython's single-char interning
    # (hence the old "normal string matching did not work" comment).
    if comment != "[deleted]":
        try:
            lang = lang_detect(comment)
        except lang_detect_exception.LangDetectException:
            # Not enough signal to detect a language; treat as non-English.
            lang = " "
        if lang[:2] == "en":
            return [comment, True]
    return [comment, False]
def _crawl_entity(self, url_, crawl_info, crawl_store, domain_depth=2,
                  page_limit=80, metas_=None, mode=entire,
                  page_size=PAGE_SIZE_LIMIT):
    """ Gather information about an entity.

    Crawls the website rooted at *url_*, records every fetched page into
    *crawl_store*, and accumulates a per-language histogram plus a summary
    of relevant text into a WebEntity report.

    :param url_: candidate URL of the entity's website
    :param crawl_info: crawl session record; only ``.id`` is read here
    :param crawl_store: storage backend; receives one CrawlData per page
    :param domain_depth: how deep to follow links within the domain
    :param page_limit: maximum number of pages to fetch
    :param metas_: unused here — NOTE(review): the crawl call below passes
        ``metas=None`` literally instead of this parameter; confirm intent
    :param mode: crawler mode (defaults to module-level ``entire``)
    :param page_size: maximum page size accepted by the crawler
    :return: a normalized WebEntity report
    """
    url = None
    report = WebEntity()
    report["summary"] = []
    report["countpage"] = 0
    report["main_lang"] = ""
    report["languages"] = {}
    if not self._is_url(url_):
        self.logger.warn("[%s] is not an URL" % url_)
        # Turns report from a list to an empty histogram
        report.normalize(self.stemmer)
        return report
    else:
        url = url_
    dom = self._get_domain(url)
    report["url"] = url
    report["domain"] = dom
    self.logger.info("Launching crawl on url [%s] at depth %d"
                     % (url, domain_depth))
    # ---
    # Crawling
    # ---
    vacuum = crawler.Crawler(seedlist=[(0, url)], debug=False,
                             proxy=self.proxy, mode=mode,
                             max_page_size=page_size)
    for p in vacuum.crawl(proc=None, domain_depth=domain_depth,
                          crawl_depth=0, page_limit=page_limit,
                          wait_courtesy=0.5, html2txt=False, metas=None):
        # Reset per-page; stays "" when the page has no usable text or
        # detection fails, so CrawlData below always gets a value.
        lang = ""
        if p.relevant_txt is not None:
            if len(p.relevant_txt) > 0:
                # In some cases, langdetecter has not enough features in
                # text to detect language
                # (Ex. : {"url":"http://nwglobalvending.be","country":"BE"})
                try:
                    lang = lang_detect(p.relevant_txt).upper()
                except LangDetectException:
                    self.logger.warning(
                        "Impossible to detect language in page %s"
                        % p.url)
                # Manage f****d up languages
                # (AF/Afrikaans is a frequent misdetection of Dutch here)
                if lang == "AF":
                    lang = "NL"
                # Counts lang repartition in website
                if lang in report["languages"]:
                    report["languages"][lang] += 1
                else:
                    report["languages"][lang] = 1
                report["summary"].append((lang, p.relevant_txt))
        # Persist every fetched page, even those without detectable text.
        page = CrawlData(crawl_id=crawl_info.id,
                         url=p.url,
                         domain=self._get_domain(p.url),
                         charset=p.charset,
                         http_status=p.http_status,
                         headers=p.headers,
                         depth=p.depth,
                         content_type=p.content_type,
                         crawl_date=datetime.now(),
                         title=p.title,
                         content=p.html,
                         relevant_txt=p.relevant_txt,
                         lang=lang)
        crawl_store.push_page(page)
        report["countpage"] += 1
    # Crawl is over, let's normalize website
    # Stemmer is heavy to instanciate, that's why we pass it as a reference
    report.normalize(self.stemmer)
    return report
def language_detect(s):
    """Detect the language of *s*, falling back to 'non-en' when detection fails."""
    try:
        detected = lang_detect(s)
    except LangDetectException:
        # Detector could not extract enough features from the text.
        return 'non-en'
    return detected