Example #1
def watson_translate(text, languages='el-en'):
    """ Translates the text using the watson API.
    Makes sure to not pass the monthly limit by updating and checking on the
    cache.
    """
    monthly_limit = 998000  # presumably just under the monthly quota, leaving headroom
    # Look up the remaining-characters counter; (re)initialise it for a new
    # month when it is missing or expired.
    c = cache.get_item('watson_characters_remaining')
    if c is None or c.is_expired():
        c = cache.set_item('watson_characters_remaining', monthly_limit)
        c.set_expiration_date(timezone.now() + relativedelta(months=+1))

    chars_remaining = int(c.value)
    chars_to_send = len(text)

    if chars_remaining > chars_to_send:
        try:
            language_translator = init_watson_translator()
        except Exception as err:  # Exception, not BaseException, so Ctrl-C still works
            update_log.error('Error in init_watson_translator()')
            update_log.error(err)
        else:
            translation = language_translator.translate(
                text=text, model_id=languages).get_result()
            translated_text = translation['translations'][0]['translation']

            chars_remaining -= int(translation['character_count'])
            c.set_value(chars_remaining)

            return translated_text
    else:
        update_log.warning('Watson reached the monthly limit.')
        print('Watson reached the monthly limit.')
        return None

    return None
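
The helper init_watson_translator() is not shown in these examples. As a rough sketch, assuming the project uses IBM's official ibm-watson SDK, it might look like the following; the API key, service URL and version string are placeholders, not values from the original code:

from ibm_watson import LanguageTranslatorV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator


def init_watson_translator():
    """ Hypothetical sketch: build an authenticated Language Translator
    client. The key, URL and version below are placeholders.
    """
    authenticator = IAMAuthenticator('YOUR_IAM_API_KEY')
    translator = LanguageTranslatorV3(version='2018-05-01',
                                      authenticator=authenticator)
    translator.set_service_url(
        'https://api.eu-de.language-translator.watson.cloud.ibm.com')
    return translator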
Example #2
def edit_article(article):
    """ Gets an article db record reference edits it and saves it.
    """
    update_log.info('Editing {}'.format(article.original_title))
    summary = summarize(article.original_text)

    if article.original_language != EN:
        translate_langs = "{}-{}".format(article.original_language, EN)
        try:
            title = translate_this(article.original_title, translate_langs)
            summary = translate_this(summary, translate_langs)
        except Exception as err:
            update_log.error(err)
            return None
    else:
        # Already in English, so no translation needed; only original_title
        # is set at this point (see example #4).
        title = article.original_title

    if summary is not None:
        article.title = title
        html_summary = ""
        for sent in textcleaner.split_sentences(summary):
            html_summary += "<p>{}</p>".format(sent)
        article.summary = html_summary
        article.keywords = gn_keywords(summary).replace("\n", ", ")
        article.status = READY
        article.save()
        update_log.info('Editing finished successfully!')
    else:
        update_log.error('Could not finish editing the article.')

    return article
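
translate_this() is not defined in these examples. Given example #1, one plausible reading is a thin wrapper around watson_translate() that raises instead of returning None, so edit_article() can abort via its except clause; the sketch below is an assumption, not the project's actual helper:

def translate_this(text, languages):
    """ Hypothetical wrapper: delegate to watson_translate() and raise when
    translation fails (monthly limit reached or API error), since
    edit_article() handles a raised exception rather than a None return.
    """
    translated = watson_translate(text, languages=languages)
    if translated is None:
        raise RuntimeError('Translation failed for pair {}'.format(languages))
    return translated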
Example #3
def parse_article(url, min_words_count=jg.MIN_WORDS_TO_SCRAPE):
    """ We download an article by ourselves so that we do it behind the Tor
    network and with a random user agent (Don't let Newspaper do it!).
    Then we fool Newspaper to think that it was the one who downloaded it so we
    can parse it and return the article.

    Returns None if the article is smaller than min_words_count.
    """

    try:
        response = get_page(url)
    except Exception as err:
        update_log.error('Error in get_page()')
        update_log.error(err)
        return None

    if response is not None:
        # Feed the pre-downloaded HTML to Newspaper under a dummy URL, and
        # mark the download as done (2) so that parse() will not re-download.
        article = ArticleParser(url="http://something")
        article.html = response.content
        article.download_state = 2

        try:
            article.parse()
        except Exception as err:
            update_log.error('Error in article.parse()')
            update_log.error(err)
            return None
        else:
            add_url_to_blacklist(url)
            # split() without arguments collapses runs of whitespace when counting words.
            if len(article.text.split()) >= min_words_count:
                return article

    return None
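
ArticleParser above is presumably newspaper's Article class imported under an alias. A short usage sketch (the import alias and URL are assumptions):

from newspaper import Article as ArticleParser  # assumed alias

parsed = parse_article('https://example.com/some-news-story')
if parsed is not None:
    print(parsed.title)        # headline extracted by Newspaper
    print(parsed.text[:200])   # first 200 characters of the body text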
Example #4
def get_articles_from_topics(topics):
    """ This is where it all starts. Scrapes articles from all URLS fount in
    the topics. Edits them and saves them to the database.
    """

    update_log.info('Testing connection.')
    test_connection()

    update_log.info('Looking for latest articles.')

    articles_urls = get_urls()

    if articles_urls:
        for topic_name, url_list in articles_urls.items():
            update_log.info('Found {} new URLs to scrape for: {}'.format(
                len(url_list), topic_name))
            for url, lang in url_list:
                try:
                    article = Article.objects.get(source=url)
                    update_log.warning('Article already exists: {}'.format(
                        article.original_title))
                except ObjectDoesNotExist:
                    article_parsed = parse_article(url)
                    if article_parsed is not None:
                        try:
                            new_article = Article.objects.create(
                                source=url,
                                topic_id=int(topics[topic_name]),
                                original_title=article_parsed.title,
                                original_text=article_parsed.text,
                                original_language=lang
                            )
                        except IntegrityError as err:
                            update_log.error('Error in saving new article.')
                            update_log.error(err)
                            continue
                        else:
                            update_log.info('Saved new article: {}'.format(
                                article_parsed.title))
                            article_editor.edit_article(new_article)
                else:
                    # The article already exists; re-edit it if it never
                    # reached the READY state. (The original referenced
                    # new_article here, which is unbound in this branch.)
                    if article.status != READY:
                        article_editor.edit_article(article)

    update_log.info('Finished scraping.')

    return None
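
get_urls() is not shown; from the loop above, its return value appears to be a dict mapping each topic name to a list of (url, language) tuples. An illustrative, made-up example of that shape:

# Inferred shape of get_urls()'s return value; every value is illustrative.
articles_urls = {
    'technology': [
        ('https://example.com/tech/story-1', 'en'),
        ('https://example.gr/texnologia/arthro-2', 'el'),
    ],
    'politics': [
        ('https://example.com/politics/story-3', 'en'),
    ],
}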
Example #5
def get_page(url):
    """ Sends a request to the url and returns a response or None.
    Rotates random user agents every random intervals.
    """
    global CURRENT_USER_AGENT
    global REQUESTS_COUNT

    headers = {'User-Agent': CURRENT_USER_AGENT}
    session = get_tor_session()

    try:
        response = session.get(url, headers=headers)
    except InvalidSchema as e:  # raised by requests when SOCKS support is missing
        update_log.error('Error in session.get()')
        update_log.error(e)
        return None
    else:
        # Use a user agent for a random amount of requests
        REQUESTS_COUNT += 1
        r_limit = random.randint(5, 10)
        if REQUESTS_COUNT >= r_limit:
            update_log.info("Changing user agent at {} requests.".format(
                REQUESTS_COUNT))

            # Make sure the new user agent is not the one we already have.
            for _ in range(10):
                new_ua = get_random_user_agent()
                if new_ua != CURRENT_USER_AGENT:
                    CURRENT_USER_AGENT = new_ua
                    REQUESTS_COUNT = 0
                    break

        if response.status_code == 200:
            return response
        else:
            update_log.warning('Got response code ({})'.format(
                response.status_code))
            return None
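
get_tor_session() is not shown either. The common pattern is a requests session routed through the local Tor SOCKS proxy, which would also explain why InvalidSchema is caught above: requests raises it for socks proxies when the requests[socks] extra is not installed. A sketch assuming a default Tor daemon on port 9050:

import requests


def get_tor_session():
    """ Plausible implementation: route all traffic through Tor's local
    SOCKS proxy. The socks5h:// scheme resolves DNS inside Tor as well.
    Requires `pip install requests[socks]`; 9050 is Tor's default port.
    """
    session = requests.Session()
    session.proxies = {
        'http': 'socks5h://127.0.0.1:9050',
        'https': 'socks5h://127.0.0.1:9050',
    }
    return session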