Example #1
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs

    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'Ettoday url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict
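
# A minimal, hypothetical sketch of the part of this processor that is cut off above.
# Assumptions (not taken from the original code): the processor receives (rss_id, url),
# fetches the page itself with requests, and the CSS selector below is only a placeholder.
import requests
from bs4 import BeautifulSoup

def ettoday_processor_head_sketch(rss_id, url):
    res_dict = {}
    temp_content = []
    resp = requests.get(url, timeout=10)
    soup = BeautifulSoup(resp.text, 'html.parser')
    body = soup.find('div', attrs={'class': 'story'})  # placeholder selector, not the real one
    if body:
        for p in body.find_all('p'):                   # collect the story paragraphs
            text = p.get_text().strip()
            if text:
                temp_content.append(text)
    # the visible tail above then joins temp_content into res_dict['news'] and logs empty results
    return res_dict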


content_parser = ContentParser('ETtoday')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, ettoday_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info(
    "Processed Ettoday {} examples in {} seconds".format(
        len(unprocessed_data),
        time.time() - start))
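
# Hypothetical sketch of the ContentParser interface these scripts rely on.
# Not the original class: the method names are taken from the calls above; the MySQL
# connection, logger setup, and errors structure are assumptions.
import logging
from collections import defaultdict

class ContentParserSketch:
    def __init__(self, source_name, batch_size=None):
        self.source_name = source_name
        self.batch_size = batch_size
        self.errors = defaultdict(list)   # e.g. errors['process_empty_content_(rss_id)'].append([rss_id, url])
        self.logger = logging.getLogger(source_name)
        self.mydb = None                  # assumed: MySQL connection
        self.encoding_cursor = None       # assumed: cursor configured for utf8mb4

    def content_query(self):
        """Return rows for this source that still lack processed content (assumed)."""
        return []

    def content_processor(self, unprocessed_data, processor_fn):
        """Run processor_fn over each row and persist the returned dict (assumed)."""

    def sent_error_email(self):
        """Mail the accumulated self.errors to the maintainers (assumed)."""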
Example #2
            prefix = title_category[0]
        else:
            prefix = ''
        content = prefix + '\n'.join(temp_content)  #.replace('。 ', '。\n')
        res_dict['news'] = html.unescape(content)
        return res_dict
    else:
        content_parser_1.logger.error(
            'Yahoo url: {} did not process properly'.format(url))
        content_parser_1.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return


start = time.time()
content_parser_1 = ContentParser('Yahoo Source 1')
unprocessed_data_1 = content_parser_1.content_query()
content_parser_1.content_processor(unprocessed_data_1, yahoo_content_processor)
if content_parser_1.errors:
    content_parser_1.sent_error_email()
content_parser_1.encoding_cursor.close()
content_parser_1.mydb.close()
content_parser_1.logger.info(
    "Processed Yahoo Source 1 {} examples in {} seconds".format(
        len(unprocessed_data_1),
        time.time() - start))

start = time.time()
content_parser_2 = ContentParser('Yahoo奇摩新聞')
unprocessed_data_2 = content_parser_2.content_query()
content_parser_2.content_processor(unprocessed_data_2, yahoo_content_processor)
Example #3
        content = article_body_tag.text.strip()
        if content:
            res_dict['news'] = html.unescape(content)
    elif article_body_tag_2:
        content = article_body_tag_2.text.strip()
        if content:
            res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'PTS url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return

    return res_dict


content_parser = ContentParser('公視新聞網')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, pts_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed PTS {} examples in {} seconds".format(
    len(unprocessed_data),
    time.time() - start))
Example #4
            for a in a_tags:
                if len(a):
                    if a['href'] == '#':
                        continue
                    if a.get_text().strip() and 'www' in a['href']:
                        links.append(a['href'])
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs
            
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('RTI url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append([rss_id, url])
        return
    return res_dict



content_parser = ContentParser('Rti 中央廣播電臺')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, rti_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed RTI {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
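
# The same related-link loop appears in several of these processors; a hypothetical
# shared helper (extract_related_links is not part of the original code) could look like:
import html

def extract_related_links(a_tags):
    """Collect (links, descriptions) from <a> tags, mirroring the loop used above."""
    links, links_descs = [], []
    for a in a_tags:
        if not len(a):                     # skip empty tags, as the originals do
            continue
        href = a.get('href', '')
        text = a.get_text().strip()
        if href == '#' or not text or 'www' not in href:
            continue
        links.append(href)
        links_descs.append(html.unescape(text))
    return links, links_descs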
Example #5
                    if a.get_text().strip() and 'www' in a['href']:
                        links.append(a['href'])
                        links_descs.append(a.get_text().strip())
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('Epoch url: {} did not process properly'.format(url))
        content_parser.errors['process_error_(rss_id)'].append([rss_id, url])
        return

    return res_dict




content_parser = ContentParser('大紀元')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, epoch_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed Epoch {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
Example #6
    # Crawl pages from the URL list
    while urlManager.has_new_url():
        try:
            new_url = urlManager.get_new_url()
            html_cont = downloader.download(new_url)
            new_data = parser.parse(new_url, html_cont)
            printer.collect_data(new_data)
        except Exception as e:
            print("crawl failed:", e)
    printer.output_sql()


# Program entry point
if __name__ == "__main__":
    print("Welcome to EMM-Mall-ArknightDataSpider.")

    # Initialize (instantiate) each module
    urlManager = UrlManager()
    downloader = Downloader()
    parser = ContentParser()
    printer = ResultPrinter()

    # Instantiate the main entry point and start crawling
    spider_main = SpiderMain()
    change_working_dir()
    # craw(input("Enter Root Url : "))
    craw("http://prts.wiki/w/Lancet-2")

    print("Everything is done. Result is in result.sql")
Example #7
            for a in a_tags:
                if len(a):
                    if a['href'] == '#':
                        continue
                    if a.get_text().strip() and 'www' in a['href']:
                        links.append(a['href'])
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('NewsTalk url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append([rss_id, url])
        return
        
    return res_dict

content_parser = ContentParser('新頭殼要聞')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, newstalk_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed NewsTalk {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
Example #8
                date_res = d2.strftime(db_date_format)
                res_dict['published_date'] = date_res
            except Exception as e2:
                print(e2)
                content_parser.logger.info('PChome date error {}, URL: {}'.format(e2, url))

    article_body_tag = soup.find('div', attrs={'class': 'article_text'})
    if article_body_tag:
        content = article_body_tag.text.strip()
        a_tags = article_body_tag.find_all('a')
        if content:
            content = re.sub('(\n)+', '\n', html.unescape(content))
            content = re.sub(r'(相關新聞[\s\S]+)', '', content)
            res_dict['news'] = html.unescape(content)
            
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('PChome url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append([rss_id, url])
        return
        
    return res_dict
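
# Hypothetical sketch of the date normalization hinted at above; db_date_format and the
# PChome source formats are assumptions, the original values are not shown in this fragment.
from datetime import datetime

db_date_format = '%Y-%m-%d %H:%M:%S'                    # assumed storage format

def normalize_pchome_date(raw):
    """Try a couple of plausible date layouts and return the DB-formatted string, or None."""
    for fmt in ('%Y-%m-%d %H:%M', '%Y/%m/%d %H:%M'):    # placeholder source formats
        try:
            return datetime.strptime(raw.strip(), fmt).strftime(db_date_format)
        except ValueError:
            continue
    return None
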
content_parser = ContentParser('PCHOME')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, pchome_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed PChome {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))
Example #9
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs

    if len(temp_content):
        content = '\n'.join(temp_content)
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'MSN url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        #print('MSN url: {} did not process properly'.format(url))
        return
    return res_dict


content_parser = ContentParser('MSN', 100)
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, msn_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed MSN {} examples in {} seconds".format(
    len(unprocessed_data),
    time.time() - start))
Example #10
                    if a['href'] == '#':
                        continue
                    if a.get_text().strip() and 'www' in a['href']:
                        links.append(a['href'])
                        links_descs.append(html.unescape(a.get_text().strip()))
            res_dict['news_related_url'] = links
            res_dict['news_related_url_desc'] = links_descs      
    content = '\n'.join(content_temp).strip()
    if content:
        res_dict['news'] = html.unescape(content)

    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error('udn url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append([rss_id, url])
        return

    return res_dict


content_parser = ContentParser('經濟日報')
# Query the data with source name
unprocessed_data = content_parser.content_query()

content_parser.content_processor(unprocessed_data, udn_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed UDN {} examples in {} seconds".format(len(unprocessed_data), time.time() - start))