Ejemplo n.º 1
0
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs

    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'Ettoday url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict


# Driver for the 'ETtoday' source: query the unprocessed data, run it through
# ettoday_content_processor, send an error email if anything was recorded in
# content_parser.errors, then close the parser's cursor and connection.
content_parser = ContentParser('ETtoday')
try:
    # Query the data with source name
    unprocessed_data = content_parser.content_query()

    content_parser.content_processor(
        unprocessed_data, ettoday_content_processor)
    # ettoday_content_processor appends to content_parser.errors when a URL
    # yields no usable content.
    if content_parser.errors:
        content_parser.sent_error_email()
finally:
    # Always release the cursor and the connection, even when querying,
    # processing or mailing raises; otherwise a failed run leaks both.
    # The nested try makes sure mydb is closed even if closing the cursor
    # itself fails.
    try:
        content_parser.encoding_cursor.close()
    finally:
        content_parser.mydb.close()
# NOTE(review): `start` is not assigned in this part of the script; it is
# expected to have been set (time.time()) earlier in the file -- confirm.
content_parser.logger.info(
    "Processed Ettoday {} examples in {} seconds".format(
        len(unprocessed_data),
        time.time() - start))
Ejemplo n.º 2
0
        else:
            prefix = ''
        content = prefix + '\n'.join(temp_content)  #.replace('。 ', '。\n')
        res_dict['news'] = html.unescape(content)
        return res_dict
    else:
        content_parser_1.logger.error(
            'Yahoo url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return


# Driver for the 'Yahoo Source 1' source: time the run, query the unprocessed
# data, run it through yahoo_content_processor, send an error email if any
# errors were recorded, then close the parser's cursor and connection.
start = time.time()
content_parser_1 = ContentParser('Yahoo Source 1')
try:
    unprocessed_data_1 = content_parser_1.content_query()
    content_parser_1.content_processor(
        unprocessed_data_1, yahoo_content_processor)
    if content_parser_1.errors:
        content_parser_1.sent_error_email()
finally:
    # Always release the cursor and the connection, even when querying,
    # processing or mailing raises; otherwise a failed run leaks both.
    # The nested try makes sure mydb is closed even if closing the cursor
    # itself fails.
    try:
        content_parser_1.encoding_cursor.close()
    finally:
        content_parser_1.mydb.close()
# Only reached on success: report how many items were handled and how long
# the whole run took.
content_parser_1.logger.info(
    "Processed Yahoo Source 1 {} examples in {} seconds".format(
        len(unprocessed_data_1),
        time.time() - start))

start = time.time()
content_parser_2 = ContentParser('Yahoo奇摩新聞')
unprocessed_data_2 = content_parser_2.content_query()
content_parser_2.content_processor(unprocessed_data_2, yahoo_content_processor)
if content_parser_2.errors: