Code example #1
import json
import logging

import flask

# Assumed project helpers (not shown in the snippet): lang_detect for
# language detection, generate for text generation, and N_TOKENS for
# the generation length.


def prediction():
    """
    Receive a user query and return the generated text.

    :return: Generated text
    """

    income_query = json.loads(flask.request.data)['input']

    logging.info('Query received: {}'.format(income_query))

    lang = lang_detect(income_query)
    logging.info('Lang detected: {}'.format(lang))

    if lang == 'en':
        res = generate(income_query, size=N_TOKENS)
        # Keep only the text produced after the prompt itself.
        generated = res.split(income_query)[-1]
    else:
        res = "lang_det_err"
        generated = "lang_det_err"

    logging.info('Result: {}'.format(res))

    return flask.jsonify({
        'version': 'v1.0.0',
        'body': {
            'query': income_query,
            'generated': generated,
            'result': res
        }
    }), 200
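
A minimal wiring sketch for this handler, assuming a standard Flask application object; the route path and port are illustrative, not taken from the original.

import flask

app = flask.Flask(__name__)

# Hypothetical registration; the original snippet does not show the route.
app.add_url_rule('/predict', view_func=prediction, methods=['POST'])

if __name__ == '__main__':
    # Try it with:
    #   curl -X POST http://localhost:5000/predict -d '{"input": "Once upon a time"}'
    app.run(port=5000)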
Code example #2
# Imports assumed: the snippet's names match the langdetect package.
from langdetect import detect as lang_detect
from langdetect import lang_detect_exception


def __is_not_deleted_or_not_non_english(comment):
    # Skip the "[deleted]" placeholder left in place of removed comments.
    if comment != "[deleted]":
        try:
            lang = lang_detect(comment)
        except lang_detect_exception.LangDetectException:
            # Detection fails on text with too few features; treat it
            # as non-English.
            lang = ""
        # The original compared characters one by one with `is`, which
        # only worked by accident of string interning; plain equality
        # is the correct test.
        if lang == "en":
            return [comment, True]
    return [comment, False]
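
A short usage sketch; the name-mangled prefix suggests the function originally lived inside a class, but at module scope it can be called directly. langdetect is probabilistic, so very short strings may be misclassified.

comments = ["This thread was really helpful, thanks!",
            "[deleted]",
            "C'est magnifique"]
for comment, is_english in map(__is_not_deleted_or_not_non_english, comments):
    if is_english:
        print(comment)  # only the English, non-deleted comment survives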
Code example #3
    # Assumed context: datetime, lang_detect / LangDetectException
    # (langdetect), and the project's crawler, WebEntity, CrawlData and
    # PAGE_SIZE_LIMIT definitions are imported in the enclosing module.
    def _crawl_entity(self,
                      url_,
                      crawl_info,
                      crawl_store,
                      domain_depth=2,
                      page_limit=80,
                      metas_=None,
                      mode=entire,
                      page_size=PAGE_SIZE_LIMIT):
        """
        Gather information about an entity
        """
        url = None

        report = WebEntity()
        report["summary"] = []
        report["countpage"] = 0
        report["main_lang"] = ""
        report["languages"] = {}

        if not self._is_url(url_):
            self.logger.warning("[%s] is not a URL" % url_)
            # normalize() turns the report's summary list into a (here empty) histogram
            report.normalize(self.stemmer)
            return report
        else:
            url = url_

        dom = self._get_domain(url)
        report["url"] = url
        report["domain"] = dom

        self.logger.info("Launching crawl on url [%s] at depth %d" %
                         (url, domain_depth))

        # ---
        # Crawling
        # ---
        vacuum = crawler.Crawler(seedlist=[(0, url)],
                                 debug=False,
                                 proxy=self.proxy,
                                 mode=mode,
                                 max_page_size=page_size)
        for p in vacuum.crawl(proc=None,
                              domain_depth=domain_depth,
                              crawl_depth=0,
                              page_limit=page_limit,
                              wait_courtesy=0.5,
                              html2txt=False,
                              metas=None):
            lang = ""
            if p.relevant_txt is not None:
                if len(p.relevant_txt) > 0:
                    # In some cases the language detector does not have
                    # enough features in the text to identify a language
                    # (e.g. {"url":"http://nwglobalvending.be","country":"BE"})
                    try:
                        lang = lang_detect(p.relevant_txt).upper()
                    except LangDetectException:
                        self.logger.warning(
                            "Impossible to detect language in page %s" % p.url)

                # Work around misdetected languages: Afrikaans is a
                # frequent false positive for Dutch pages.
                if lang == "AF":
                    lang = "NL"

                # Count the per-language page distribution for the website
                if lang in report["languages"]:
                    report["languages"][lang] += 1
                else:
                    report["languages"][lang] = 1

                report["summary"].append((lang, p.relevant_txt))

            page = CrawlData(crawl_id=crawl_info.id,
                             url=p.url,
                             domain=self._get_domain(p.url),
                             charset=p.charset,
                             http_status=p.http_status,
                             headers=p.headers,
                             depth=p.depth,
                             content_type=p.content_type,
                             crawl_date=datetime.now(),
                             title=p.title,
                             content=p.html,
                             relevant_txt=p.relevant_txt,
                             lang=lang)

            crawl_store.push_page(page)
            report["countpage"] += 1

        # The crawl is over; normalize the report.
        # The stemmer is heavy to instantiate, which is why a shared
        # instance is passed in by reference.
        report.normalize(self.stemmer)

        return report
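
The report initializes "main_lang" but the snippet never fills it in; presumably the dominant language is derived from the "languages" histogram once the crawl ends, perhaps inside normalize(). A minimal sketch of that derivation, assuming a plain dict of page counts:

def dominant_language(languages):
    """Return the most frequent language code, or "" for an empty histogram."""
    if not languages:
        return ""
    return max(languages, key=languages.get)

# e.g. report["main_lang"] = dominant_language(report["languages"])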
Code example #4
# Imports assumed: langdetect's detect and its exception type.
from langdetect import detect as lang_detect
from langdetect.lang_detect_exception import LangDetectException

def language_detect(s):
    # Treat undetectable text (too short, no features) as non-English.
    try:
        return lang_detect(s)
    except LangDetectException:
        return 'non-en'
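
A quick usage sketch; langdetect is probabilistic, so results on short strings can vary between runs.

print(language_detect("The quick brown fox jumps over the lazy dog"))  # likely 'en'
print(language_detect("12345 !!!"))  # no textual features, so 'non-en'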