Example No. 1
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs

    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'Ettoday url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict


content_parser = ContentParser('ETtoday')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, ettoday_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info(
    "Processed Ettoday {} examples in {} seconds".format(
        len(unprocessed_data),
        time.time() - start))
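The ETtoday fragment above implies a processor callback that takes a fetched page, collects related links plus their descriptions, joins the paragraph text, and returns a dict (or None when nothing usable was found). A minimal, self-contained sketch of that extraction step follows; the generic find_all('a') / find_all('p') selectors and the function name are illustrative assumptions, not ETtoday's actual markup or the project's real helper.

import html

from bs4 import BeautifulSoup


def extract_links_and_text(page_html):
    # Sketch only: collect outbound links with visible text, then join
    # paragraph text into the 'news' field, as the fragment above does.
    soup = BeautifulSoup(page_html, 'html.parser')
    links, links_descs, paragraphs = [], [], []
    for a in soup.find_all('a'):
        href = a.get('href', '')
        text = a.get_text().strip()
        if text and 'www' in href and href != '#':
            links.append(href)
            links_descs.append(html.unescape(text))
    for p in soup.find_all('p'):
        paragraphs.append(p.get_text())
    content = '\n'.join(paragraphs).strip()
    if not content:
        return None
    return {
        'news': html.unescape(content),
        'news_related_url': links,
        'news_related_url_desc': links_descs,
    }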
Example No. 2
        content = article_body_tag.text.strip()
        if content:
            res_dict['news'] = html.unescape(content)
    elif article_body_tag_2:
        content = article_body_tag_2.text.strip()
        if content:
            res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'PTS url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return

    return res_dict


content_parser = ContentParser('公視新聞網')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, pts_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed PTS {} examples in {} seconds".format(
    len(unprocessed_data),
    time.time() - start))
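The PTS fragment checks a primary article-body tag and then a second, fallback tag. The same primary/fallback lookup can be written as a single expression; the class names below are placeholders, since the real PTS selectors are in the part of the code cut off above.

import html

from bs4 import BeautifulSoup


def extract_pts_body(page_html):
    # Placeholder class names, not PTS's actual markup.
    soup = BeautifulSoup(page_html, 'html.parser')
    body_tag = (soup.find('div', attrs={'class': 'post-article'})
                or soup.find('div', attrs={'class': 'article-content'}))
    if body_tag is None:
        return None
    content = body_tag.text.strip()
    return {'news': html.unescape(content)} if content else None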
Example No. 3
            prefix = ''
        content = prefix + '\n'.join(temp_content)  #.replace('。 ', '。\n')
        res_dict['news'] = html.unescape(content)
        return res_dict
    else:
        content_parser_1.logger.error(
            'Yahoo url: {} did not process properly'.format(url))
        content_parser_1.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return


start = time.time()
content_parser_1 = ContentParser('Yahoo Source 1')
unprocessed_data_1 = content_parser_1.content_query()
content_parser_1.content_processor(unprocessed_data_1, yahoo_content_processor)
if content_parser_1.errors:
    content_parser_1.sent_error_email()
content_parser_1.encoding_cursor.close()
content_parser_1.mydb.close()
content_parser_1.logger.info(
    "Processed Yahoo Source 1 {} examples in {} seconds".format(
        len(unprocessed_data_1),
        time.time() - start))

start = time.time()
content_parser_2 = ContentParser('Yahoo奇摩新聞')
unprocessed_data_2 = content_parser_2.content_query()
content_parser_2.content_processor(unprocessed_data_2, yahoo_content_processor)
if content_parser_2.errors:
    content_parser_2.sent_error_email()
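Both Yahoo sources run the same query/process/notify/close/log sequence with different constructor arguments. If ContentParser exposes the attributes used in these fragments, that boilerplate could be wrapped in a small driver like the hypothetical helper below (it assumes ContentParser and yahoo_content_processor are already defined, as they are in the surrounding code).

import time


def run_source(source_name, processor):
    # Hypothetical refactor of the repeated driver code; relies only on the
    # ContentParser attributes visible in the fragments above.
    start = time.time()
    parser = ContentParser(source_name)
    data = parser.content_query()
    parser.content_processor(data, processor)
    if parser.errors:
        parser.sent_error_email()
    parser.encoding_cursor.close()
    parser.mydb.close()
    parser.logger.info('Processed {} {} examples in {} seconds'.format(
        source_name, len(data), time.time() - start))


# run_source('Yahoo Source 1', yahoo_content_processor)
# run_source('Yahoo奇摩新聞', yahoo_content_processor)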
Example No. 4
                    if a.get_text().strip() and 'www' in a['href']:
                        links.append(a['href'])
                        links_descs.append(a.get_text().strip())
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('Epoch url: {} did not process properly'.format(url))
        content_parser.errors['process_error_(rss_id)'].append([rss_id, url])
        return

    return res_dict


content_parser = ContentParser('大紀元')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, epoch_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed Epoch {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
Example No. 5
            for a in a_tags:
                if len(a):
                    if a['href'] == '#':
                        continue
                    if a.get_text().strip() and 'www' in a['href']:
                        links.append(a['href'])
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('NewsTalk url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append([rss_id, url])
        return
        
    return res_dict


content_parser = ContentParser('新頭殼要聞')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, newstalk_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed NewsTalk {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
Example No. 6
                date_res = d2.strftime(db_date_format)
                res_dict['published_date'] = date_res
            except Exception as e2:
                print(e2)
                content_parser.logger.info('PChome date error {}, URL: {}'.format(e2, url))

    article_body_tag = soup.find('div', attrs={'class': 'article_text'})
    if article_body_tag:
        content = article_body_tag.text.strip()
        a_tags = article_body_tag.find_all('a')
        if content:
            content = re.sub(r'\n+', '\n', html.unescape(content))
            content = re.sub(r'(相關新聞[\s\S]+)', '', content)
            res_dict['news'] = html.unescape(content)
            
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('PChome url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append([rss_id, url])
        return
        
    return res_dict


content_parser = ContentParser('PCHOME')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, pchome_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed PChome {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
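The PChome fragment formats the parsed date with strftime and then collapses newline runs and drops everything from the 相關新聞 (related news) marker onward. A standalone illustration of that cleanup follows; the sample text and the '%Y-%m-%d %H:%M:%S' format string are assumptions, since db_date_format is defined outside the fragment.

import html
import re
from datetime import datetime

# Assumed format string; the real db_date_format is defined outside the fragment.
db_date_format = '%Y-%m-%d %H:%M:%S'
published_date = datetime(2020, 1, 1, 8, 30).strftime(db_date_format)

raw = 'first paragraph\n\n\nsecond paragraph\n相關新聞\nlink 1\nlink 2'
content = re.sub(r'\n+', '\n', html.unescape(raw))
content = re.sub(r'(相關新聞[\s\S]+)', '', content)
print(published_date)   # 2020-01-01 08:30:00
print(repr(content))    # 'first paragraph\nsecond paragraph\n'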