Example #1
def watson_translate(text, languages='el-en'):
    """ Translates the text using the watson API.
    Makes sure to not pass the monthly limit by updating and checking on the
    cache.
    """
    monthly_limit = 998000
    c = cache.get_item('watson_characters_remaining')
    if c is None or c.is_expired():
        c = cache.set_item('watson_characters_remaining', monthly_limit)
        c.set_expiration_date(timezone.now() + relativedelta(months=+1))

    chars_remaining = int(c.value)
    chars_to_send = len(text)

    if chars_remaining > chars_to_send:
        try:
            language_translator = init_watson_translator()
        except Exception as err:
            update_log.error('Error in init_watson_translator()')
            update_log.error(err)
        else:
            translation = language_translator.translate(
                text=text, model_id=languages).get_result()
            translated_text = translation['translations'][0]['translation']

            chars_remaining -= int(translation['character_count'])
            c.set_value(chars_remaining)

            return translated_text
    else:
        update_log.warning('Watson reached the monthly limit.')
        print('Watson reached the monthly limit.')
        return None

    return None
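
The init_watson_translator() helper is not shown here; a minimal sketch of what it might look like with the ibm_watson SDK, assuming the credentials live in WATSON_API_KEY and WATSON_URL (both hypothetical names):

from ibm_watson import LanguageTranslatorV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

def init_watson_translator():
    # Hypothetical sketch: WATSON_API_KEY and WATSON_URL are placeholders for
    # however the project actually stores its credentials.
    authenticator = IAMAuthenticator(WATSON_API_KEY)
    translator = LanguageTranslatorV3(version='2018-05-01',
                                      authenticator=authenticator)
    translator.set_service_url(WATSON_URL)
    return translator
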
Example #2
def translate(sentence):
    params = {"q": sentence, "langpair": langpair}
    session = get_tor_session()
    response_object = session.post(url, params)
    response = json.loads(response_object.text)
    if response['responseStatus'] != 200:
        update_log.warning('MyMemory responded with {}'.format(
            response['responseStatus']))
    else:
        return response['responseData']['translatedText']
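
get_tor_session() is defined elsewhere in this project; a common way to build such a session with requests, assuming Tor exposes its SOCKS proxy on 127.0.0.1:9050 and requests[socks] is installed (a sketch, not necessarily the project's implementation):

import requests

def get_tor_session():
    # Route both HTTP and HTTPS traffic through the local Tor SOCKS proxy.
    # socks5h makes DNS resolution happen inside Tor as well.
    session = requests.Session()
    session.proxies = {
        'http': 'socks5h://127.0.0.1:9050',
        'https': 'socks5h://127.0.0.1:9050',
    }
    return session
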
Example #3
def get_articles_from_topics(topics):
    """ This is where it all starts. Scrapes articles from all URLS fount in
    the topics. Edits them and saves them to the database.
    """

    update_log.info('Testing connection.')
    test_connection()

    update_log.info('Looking for latest articles.')

    articles_urls = get_urls()

    if len(articles_urls) > 0:
        for topic_name, url_list in articles_urls.items():
            update_log.info('Found {} new URLs to scrape for: {}'.format(
                len(url_list), topic_name))
            for url, lang in url_list:
                try:
                    article = Article.objects.get(source=url)
                    update_log.warning('Article already exists: {}'.format(
                        article.original_title))
                except ObjectDoesNotExist:
                    article_parsed = parse_article(url)
                    if article_parsed is not None:
                        try:
                            new_article = Article.objects.create(
                                source=url,
                                topic_id=int(topics[topic_name]),
                                original_title=article_parsed.title,
                                original_text=article_parsed.text,
                                original_language=lang
                            )
                        except IntegrityError as err:
                            update_log.error('Error in saving new article.')
                            update_log.error(err)
                            continue
                        else:
                            update_log.info('Saved new article: {}'.format(
                                article_parsed.title))
                            article_editor.edit_article(new_article)
                else:
                    if article.status != READY:
                        article_editor.edit_article(article)

    update_log.info('Finished scraping.')

    return None
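
The topics argument is assumed to map topic names to Topic primary keys, since topics[topic_name] is cast to int and stored as topic_id; a hypothetical call could look like this:

# Hypothetical mapping of topic names to database ids.
topics = {'economy': 1, 'technology': 2}
get_articles_from_topics(topics)
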
Example #4
def mymemory_translate(text, languages="el-en"):
    daily_limit = 1000
    c = cache.get_item('mymemory_words_remaining')
    if c is None or c.is_expired():
        c = cache.set_item('mymemory_words_remaining', daily_limit)
        c.set_expiration_date(timezone.now() + relativedelta(days=+1))

    url = "http://api.mymemory.translated.net/get"
    langpair = languages.replace("-", "|")

    words_to_send = len(re.findall(r'\w+', text))
    words_remaining = int(c.value)
    print('==> Words to send:')
    print(words_to_send)

    def translate(sentence):
        params = {"q": sentence, "langpair": langpair}
        session = get_tor_session()
        response_object = session.post(url, params)
        response = json.loads(response_object.text)
        if response['responseStatus'] != 200:
            update_log.warning('MyMemory responded with {}'.format(
                response['responseStatus']))
        else:
            return response['responseData']['translatedText']

    if words_remaining > words_to_send:
        translated_text = ""
        # The limit of characters for each request is 500
        if len(text) > 500:
            sentences = textcleaner.split_sentences(text)
            for sent in sentences:
                sentence = translate(sent)
                if sentence is not None:
                    translated_text += sentence + "\r\n"
                else:
                    return None
        else:
            translated_text = translate(text)

        words_remaining -= int(words_to_send)
        c.set_value(words_remaining)
        return translated_text
    else:
        update_log.warning('MyMemory reached the daily limit.')
        return None
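
Both translators rely on a project-specific cache module whose items expose value, is_expired(), set_expiration_date() and set_value(). Purely as an assumption about the shape of that interface, a minimal item could look like this:

from django.utils import timezone

class CacheItem:
    # Hypothetical sketch of the object cache.get_item() / cache.set_item()
    # appear to return; the real project may back this with a model or Redis.
    def __init__(self, key, value):
        self.key = key
        self.value = value
        self.expiration_date = None

    def set_expiration_date(self, when):
        self.expiration_date = when

    def is_expired(self):
        return (self.expiration_date is not None
                and timezone.now() >= self.expiration_date)

    def set_value(self, value):
        self.value = value
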
Example #5
def get_page(url):
    """ Sends a request to the url and returns a response or None.
    Rotates random user agents every random intervals.
    """
    global CURRENT_USER_AGENT
    global REQUESTS_COUNT

    headers = {'User-Agent': CURRENT_USER_AGENT}
    session = get_tor_session()

    try:
        response = session.get(url, headers=headers)
    except InvalidSchema as e:
        update_log.error('Error in session.get()')
        update_log.error(e)
        return None
    else:
        # Use a user agent for a random amount of requests
        REQUESTS_COUNT += 1
        r_limit = random.randint(5, 10)
        if REQUESTS_COUNT >= r_limit:
            update_log.info("Changing user agent at {} requests.".format(
                REQUESTS_COUNT))

            # Make sure our new user agent is not the one we already have.
            for a in range(0, 10):
                new_ua = get_random_user_agent()
                if new_ua != CURRENT_USER_AGENT:
                    CURRENT_USER_AGENT = new_ua
                    REQUESTS_COUNT = 0
                    break

        if response.status_code == 200:
            return response
        else:
            update_log.warning('Got response code ({})'.format(
                response.status_code))
            return None
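
get_random_user_agent() is assumed to pick from a hardcoded list of user-agent strings; a hypothetical version:

import random

# Hypothetical list; the project presumably keeps its own, longer one.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
]

def get_random_user_agent():
    return random.choice(USER_AGENTS)
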
Example #6
def get_urls_from_source(source):
    """ Returns a set of tuples: {(url, lang), (url, lang) ...} so that every
    url-language combination is unique.
    Checks the url_blacklist so we don't scrape a URL more than once.
    """

    urls = set()

    blacklist = cache.get_item('url_blacklist')
    if blacklist is None:
        blacklist = cache.set_item('url_blacklist', "")

    update_log.info('Checking {}'.format(source['root_url']))
    response = get_page(source['root_url'])
    if response is not None:
        soup = None
        if source['root_url'].endswith('.xml'):

            # XML Parsing =====================================================
            xml = response.text
            print('==> Response:')
            print(xml)
            soup = BeautifulSoup(xml, 'html.parser')
            if soup is not None:
                links = soup.find_all('loc')
                if len(links) == 0:
                    update_log.warning('No links were found in this page.')
                for link in links:
                    link_text = link.text
                    print(link_text)
                    if (
                        link_text.startswith(source['url_filter']) and
                        link_text > source['url_filter'] and
                        link_text not in blacklist.value
                    ):
                        urls.add((link_text, source['language']))
            else:
                update_log.warning('Cannot parse this page.')
        else:

            # HTML parsing ====================================================
            soup = BeautifulSoup(response.content, 'html.parser')
            if soup is not None:
                links = soup.find_all('a', href=True)
                if len(links) == 0:
                    update_log.warning('No links were found in this page.')
                for link in links:
                    if (
                        link['href'].startswith(source['url_filter']) and
                        link['href'] > source['url_filter'] and
                        link['href'] not in blacklist.value
                    ):
                        urls.add((link['href'], source['language']))
            else:
                update_log.warning('Cannot parse this page.')

    if len(urls) == 0:
        update_log.warning('Found nothing new in {}'.format(
            source['root_url']))

    return urls
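
For reference, the source dict is expected to carry at least root_url, url_filter and language; a hypothetical call:

# Hypothetical source definition; the real ones are loaded elsewhere.
source = {
    'root_url': 'https://example.com/sitemap.xml',
    'url_filter': 'https://example.com/articles/',
    'language': 'el',
}
new_urls = get_urls_from_source(source)  # {(url, 'el'), ...}
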