Example #1
def expand():
    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()
    download_crawler_itjuzi = download.DownloadCrawler(max_crawl=200,
                                                       timeout=10)
    download_crawler_kr36 = download.DownloadCrawler(use_proxy=False)
    download_crawler_lagou = download.DownloadCrawler(use_proxy=True)
    download_crawler = download.DownloadCrawler()
    while True:
        # gevent -> list of source_companies

        if len(COMPANIES) == 0:
            return
        sc = COMPANIES.pop(0)
        source = sc["source"]
        sourceId = sc["sourceId"]

        # company_info_expand_mongo.expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler)

        if source == 13030:  # itjuzi
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler_itjuzi)
        elif source == 13020:  # 36kr
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler_kr36)
        elif source == 13050:  # lagou
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler_lagou)
        else:
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler)
        logger.info("Source: %s, sourceId: %s, Diff: %s", source, sourceId,
                    diff_sourceCompanyId)
        #Set processStatus in mysql and mongo
        mongo = db.connect_mongo()
        collection_source_company = mongo.source.company
        collection_source_company.update_one(
            {
                "source": source,
                "sourceId": sourceId
            }, {'$set': {
                "processStatus": 1
            }})
        mongo.close()
        if diff_sourceCompanyId is not None:
            # #Set recommendIds
            # # insert audit_source_company
            # parser_mysql_util.insert_audit_source_company(diff_sourceCompanyId)
            # parser_mysql_util.update_db_processStatus(source, sourceId, 1)
            pass
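The three-way dispatch on source codes above could be collapsed into a lookup table; a minimal sketch, reusing the crawlers and the check_diff call from this example (the meaning of codes 13030/13020/13050 is inferred from the crawler variable names):

    # Map known source codes to their dedicated crawlers; anything else
    # falls back to the generic download_crawler.
    crawler_by_source = {
        13030: download_crawler_itjuzi,
        13020: download_crawler_kr36,
        13050: download_crawler_lagou,
    }
    chosen_crawler = crawler_by_source.get(source, download_crawler)
    diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, chosen_crawler)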
Example #2
def process_news(column, d_map, content, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)  # rebinds the download_crawler argument passed in
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    category = None
    categoryNames = []
    newsurl = d_map['link']

    key = re.search('.*?(\d+)/.*', newsurl).group(1)
    type = TYPE

    title = d('h1.title').text().strip()
    brief = d('div.field-item> p').text().strip()
    publish_time = d(
        'div.pfcng-row-01> div.pfcng-col-1> div.pos-2> div> div.node-published'
    ).attr('content')
    # newspost = d('div.pane-node-field-images> div> div> div> div> img').html()
    # logger.info('%s | %s | %s  '%(title,brief,publish_time))
    # (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    # if posturl is not None:
    #     post = str(posturl)
    # else:
    post = None

    tags = []
    try:
        newstime = publish_time.split('+')[0].replace('T', ' ')
        news_time = datetime.datetime.strptime(newstime, "%Y-%m-%d %H:%M:%S")
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()
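The parse-or-fall-back-to-now pattern in the try/except above recurs in most of these examples; a small helper capturing it, as a sketch (Python 2 style, assuming the same module-level logger and datetime import):

    def parse_time_or_now(value, fmt="%Y-%m-%d %H:%M:%S"):
        # Parse a timestamp string; fall back to the current time on any failure.
        try:
            return datetime.datetime.strptime(value, fmt)
        except Exception, e:
            logger.info(e)
            return datetime.datetime.now()

Here it would be called as parse_time_or_now(publish_time.split('+')[0].replace('T', ' ')).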
Example #3
def start_run(concurrent_num, flag):
    global DATE
    global CURRENT_PAGE
    while True:
        listcrawler = ListCrawler()
        newscrawler = ListCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # download_crawler = None

        logger.info("%s news %s start...", NEWSSOURCE, flag)
        # #Re download news of 24 hours
        # dt = datetime.date.today()
        # if DATE != dt:
        #     logger.info("Date changed!!! Back to yesterday")
        #     today = datetime.datetime(dt.year, dt.month, dt.day)
        #     yesterday = datetime.datetime(dt.year, dt.month, dt.day) - datetime.timedelta(days=1)
        #     mongo = db.connect_mongo()
        #     collection_news = mongo.article.news
        #     for nn in list(collection_news.find({"source": SOURCE, "createTime": {"$gt":yesterday, "$lt": today}})):
        #         link = nn["link"]
        #         logger.info("Redownload %s", link)
        #         crawler_news(column={}, crawler=newscrawler, newsurl=link, newspost=None, download_crawler=download_crawler)
        #     DATE = dt

        for column in columns:
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num,
                download_crawler)

        logger.info("%s news %s end.", NEWSSOURCE, flag)

        if flag == "incr":
            time.sleep(60 * 8)  # 8 minutes
        else:
            return
Example #4
def start_run(concurrent_num, codes, flag):
    download_crawler = download.DownloadCrawler(use_proxy=1)

    if len(codes) == 0:
        codesMongo = list(collectionUser.find())
        codes = [i['code'] for i in codesMongo]

    while True:
        logger.info("%s  start...", SOURCENAME)

        zhihucrawler = Zhihucrawler()
        # download_crawler = download.DownloadCrawler(use_proxy=False)

        run(zhihucrawler, concurrent_num, codes, flag, download_crawler)

        logger.info("%s end.", SOURCENAME)

        # return

        if flag == "incr":
            logger.info('sleeping')
            gevent.sleep(60 * 60)  # 60 minutes
        else:
            return
Example #5
def process_news(column, content, msg, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    title = msg['title']
    newsurl = msg['link']
    brief = msg['brief']
    newspost = msg['post']
    post_time = msg['newsDate']

    category = None
    categoryNames = []

    key = re.search('https://vulcanpost.com/(\d+)/.*', newsurl).group(1)
    type = TYPE

    (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None

    tags = []

    try:
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()
Example #6
def download_captchas():
    # Renamed from download(): a module-level def download() would shadow
    # the imported download module used on the next line.
    crawler = download.DownloadCrawler()

    url = "http://www.sgs.gov.cn/notice/captcha"
    i = 100
    while i < 1000:
        i += 1
        image = crawler.get_image(url, max_retry=2)
        if image is not None:
            with open("logs/%s.jpg" % i, 'wb') as f:
                f.write(image)
Example #7
def process(content, wechatcrawler, wechatprocess):
    j = json.loads(content)
    infos = j["value"]["datas"]
    logger.info("Got %s news", len(infos))
    cnt = 0
    download_crawler = download.DownloadCrawler(use_proxy=False)
    if len(infos) == 0:
        return cnt
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    for info in infos:
        wexinlink = info["url"]
        readNum = int(info["clicks_count"])
        likeNum = int(info["like_count"])
        title = info["title"]
        try:
            publicTime = datetime.datetime.strptime(info["public_time"], "%Y-%m-%d %H:%M:%S.0") - datetime.timedelta(hours=8)
        except:
            publicTime = datetime.datetime.now() - datetime.timedelta(hours=8)

        logger.info("link: %s", wexinlink)
        logger.info("article : %s, read: %s, like: %s", title, readNum, likeNum)

        item = collection_news.find_one({"link": wexinlink})
        # item2 = collection_news.find_one({"title": title})

        if item is None:
            dnews = wechatprocess.crawler_news(wechatcrawler, wexinlink, download_crawler, wechatId="微信公众号")
            # for a in dnews:
            #     logger.info("%s _> %s", a, dnews[a])
            dnews["date"] = publicTime
            dnews["clicksCount"] = readNum
            dnews["likeCount"] = likeNum
            # dnews["wechatId"] = wechatId
            # dnews["wechatName"] = wechatName
            dnews["processStatus"] = 0
            dnews["imgChecked"] = True
            # dnews["sectors"] = [20]

            if dnews["result"] == 'SUCCESS' and len(dnews["contents"]) >= 1:
                dnews.pop('result')
                try:
                    collection_news.insert(dnews)
                    cnt += 1
                except Exception, e:
                    logger.info(e)
                    pass
        else:
            if item["source"] == 13841:
                logger.info("Update click/update: %s/%s", readNum, likeNum)
                collection_news.update_one({"_id": item["_id"]}, {"$set": {"clicksCount": readNum, "likeCount": likeNum}})

    # close the shared connection and report how many articles were inserted
    mongo.close()
    return cnt
Example #8
def start_run():
    download_crawler = download.DownloadCrawler(use_proxy=False)

    while True:
        logger.info("Begin...")
        items = list(collection.find({"source": SOURCE, "parsed": {"$ne": True}}).limit(100))
        for item in items:
            parse(item, download_crawler)
            # break
        logger.info("End.")
        # break
        if len(items) == 0:
            time.sleep(60)
Example #9
def run_xiniu(crawler=Zhihucrawler()):
    url = 'https://www.zhihu.com/api/v4/members/xi-niu-shu-ju/articles?include=data%5B*%5D.comment_count%2Ccan_comment%2Ccomment_permission%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20&sort_by=created'
    data = {'authorization': "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"}

    while True:
        result = crawler.crawl(url, headers=data, agent=True)
        if result['get'] == 'success':
            process_xiniu(
                result,
                code='xi-niu-shu-ju',
                flag='incr',
                download_crawler=download.DownloadCrawler(use_proxy=1))
            break
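The hand-encoded include parameter in the URL above could be assembled with the standard library instead; a sketch (Python 2 urllib; the escaping differs cosmetically, e.g. * is also percent-encoded, but it decodes to the same query):

    import urllib

    include = ('data[*].comment_count,can_comment,comment_permission,content,'
               'voteup_count,created,updated,upvoted_followees,voting,review_info;'
               'data[*].author.badge[?(type=best_answerer)].topics')
    params = urllib.urlencode({'include': include, 'offset': 0,
                               'limit': 20, 'sort_by': 'created'})
    url = 'https://www.zhihu.com/api/v4/members/xi-niu-shu-ju/articles?' + params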
Example #10
def start_run(flag):
    while True:
        logger.info("%s news %s start...", NEWSSOURCE, flag)
        listcrawler = ListCrawler()
        newscrawler = NewsCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)

        run(flag, listcrawler, newscrawler, download_crawler)

        logger.info("%s news %s end.", NEWSSOURCE, flag)

        if flag == "incr":
            time.sleep(60 * 8)
        else:
            return
Example #11
def start_run(concurrent_num, flag):
    global CURRENT_PAGE
    while True:
        logger.info("%s news %s start...", NEWSSOURCE, flag)
        listcrawler = ListCrawler()
        newscrawler = NewsCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        for column in columns:
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num, download_crawler)

        logger.info("%s news %s end.", NEWSSOURCE, flag)

        if flag == "incr":
            gevent.sleep(60 * 8)  # 8 minutes
        else:
            return
Example #12
def start_run(concurrent_num, flag):
    global CURRENT_PAGE
    while True:
        logger.info("%s news %s start...", NEWSSOURCE, flag)
        listcrawler = ListCrawler()
        newscrawler = ListCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # download_crawler = None
        forums = get_columns(listcrawler)
        for forumlink in forums:
            column = {"column": forumlink, "max": 1}
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num,
                download_crawler)

        logger.info("%s news %s end.", NEWSSOURCE, flag)

        if flag == "incr":
            time.sleep(60 * 50)  # 50 minutes
        else:
            return
Example #13
def start_run(concurrent_num, codes, flag):
    download_crawler = download.DownloadCrawler(use_proxy=1)

    while True:
        logger.info("%s  start...", SOURCENAME)

        zhihucrawler = Zhihucrawler()
        # download_crawler = download.DownloadCrawler(use_proxy=False)

        run(zhihucrawler, concurrent_num, codes, flag, download_crawler)

        logger.info("%s end.", SOURCENAME)

        # return

        if flag == "incr":
            logger.info('sleeping')
            gevent.sleep(60 * 60)  # 60 minutes
        else:
            return
Example #14
def process_news(content, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)

    category = None
    categoryNames = []

    key = content['id']
    type = TYPE
    title = content['title']

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return
    newspost = content.get('featured_image').get('source')
    (posturl, width,
     height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler,
                                                 SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    # logger.info(post)

    tags = []
    for tag in content['tags']:
        tags.append(tag['name'])
    brief = content['seo']['description']
    try:
        post_time = content['modified_gmt']
        news_time = None
        if post_time.find('T') >= 0:
            post_time = post_time.replace('T', ' ')
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()
Example #15
def crawlerNews(link, pdate = None):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    download_crawler_n = None

    if link.find("pencilnews.cn") >= 0:
        pencil_news_v2.crawler_news({}, pencil_news_v2.NewsCrawler(), link, None, download_crawler)
    elif link.find("lieyunwang.com") >= 0:
        lieyun_news.run_news(lieyun_news.LieyunNewsCrawler(), link)
    elif link.find("iyiou.com") >= 0:
        iyiou_news.crawler_news({}, iyiou_news.NewsCrawler(), link)
    elif link.find("huxiu.com") >= 0:
        huxiu_news.crawler_news({}, huxiu_news.NewsCrawler(), link, None, download_crawler)
    elif link.find("leiphone.com") >= 0:
        leiphone_news.process(leiphone_news.Contentcrawler(), link)
    elif link.find("36kr.com") >= 0 :
        kr36_news.run_news(kr36_news.kr36NewsCrawler(), link)
    elif link.find("mp.weixin.qq.com") >= 0:
        wechatcrawler = Wechatcrawler.WechatCrawler()
        wechatprocess = Wechatcrawler.NewsDownloader()
        dnews = wechatprocess.crawler_news(wechatcrawler, link, download_crawler, wechatId="微信公众号")
        # dnews["wechatId"] = "微信公众号"
        # dnews["wechatName"] = "微信公众号"
        # try:
        #     dnews["date"] = datetime.datetime.strptime(pdate,"%Y-%m-%d %H:%M:%S") - datetime.timedelta(hours=8)
        # except:
        #     dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if dnews["result"] == 'SUCCESS' and dnews.has_key("contents") is True and len(dnews["contents"]) >= 1:
            dnews.pop('result')
            try:
                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                id = collection_news.insert(dnews)
                mongo.close()
                logger.info("Done %s", id)
                # collection_news.insert(dnews)
            except Exception, e:
                logger.info(e)
                pass
Example #16
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-2].replace(".html", "")

        type = TYPE

        title = d('h1.single-title').text().strip()

        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        # try:
        #    post_time = topic
        #
        #    logger.info(post_time)
        #    news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        #    logger.info("news-time: %s", news_time)
        # except Exception, e:
        #     logger.info(e)
        news_time = datetime.datetime.now()

        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
            pass
    return
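The loop that turns extracted contents into dcontents (text blocks pass through; image blocks are downloaded and re-keyed, or skipped on failure) repeats almost verbatim across the examples below; a helper that captures the shared logic, as a sketch assuming the parser_mysql_util.get_logo_id_new API used throughout:

    def build_dcontents(contents, download_crawler, source, key):
        # Convert extracted blocks into ranked news contents; images are
        # stored by id when a crawler is available, by source URL otherwise.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"],
                      "image": "", "image_src": ""}
            elif download_crawler is None:
                dc = {"rank": rank, "content": "",
                      "image": "", "image_src": c["data"]}
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, source, key, "news")
                if imgurl is None:
                    continue  # drop images that could not be downloaded
                dc = {"rank": rank, "content": "", "image": str(imgurl),
                      "image_src": "", "height": int(height), "width": int(width)}
            dcontents.append(dc)
            rank += 1
        return dcontents

Source-specific filters (such as the fromgeek.com skip above) would stay at the call site.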
Example #17
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))

        key = newsurl.split("/")[-1].replace(".htm", "")

        type = TYPE

        category = None
        categoryNames = []

        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        post = None
        brief = d("meta[name='description']").attr("content")

        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #18
        for column in columns:
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num,
                download_crawler)

        logger.info("%s news %s end.", NEWSSOURCE, flag)

        if flag == "incr":
            time.sleep(60 * 38)  # 38 minutes
        else:
            return
            #gevent.sleep(86400*3)   #3 days


if __name__ == "__main__":
    if len(sys.argv) > 1:
        param = sys.argv[1]
        if param == "incr":
            start_run(1, "incr")
        elif param == "all":
            start_run(1, "all")
        else:
            link = param
            download_crawler = download.DownloadCrawler(use_proxy=False)
            # download_crawler = None
            crawler_news({
                "column": "new",
                "max": 1
            }, NewsCrawler(), link, "", download_crawler, "投融资")
    else:
        start_run(1, "incr")
Example #19
def process(content, wechatcrawler, wechatprocess):
    # j = json.loads(content)
    # infos = j["value"]
    # logger.info("Got %s news", len(infos))
    cnt = 0
    d = pq(html.fromstring(content.decode("utf-8")))
    title = d('head> title').text().strip()
    logger.info("title: %s", title)

    download_crawler = download.DownloadCrawler(use_proxy=False)

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    for li in d('div.news-box> ul.news-list>li'):
        try:
            title = d(li)('h3> a').text()
            title = "".join(title.split(" "))
            wexinlink = d(li)('h3> a').attr("href").strip()
            post_time = d('div.s-p').attr("t")
            logger.info(post_time)
            try:
                post_time = time.localtime(int(post_time))
                news_time = datetime.datetime(
                    post_time.tm_year, post_time.tm_mon, post_time.tm_mday,
                    post_time.tm_hour, post_time.tm_min, post_time.tm_sec)
            except:
                news_time = datetime.datetime.now()
            logger.info("link: %s", wexinlink)
            logger.info("article : %s,%s", title, news_time)

            item = collection_news.find_one({"link": wexinlink})
            item2 = collection_news.find_one({"title": title})
            # # item2 = collection_news.find_one({"title": title})
            # logger.info(item)
            # logger.info(item2)
            if item is None and item2 is None:
                logger.info("here crawler")
                dnews = wechatprocess.crawler_news(wechatcrawler,
                                                   wexinlink,
                                                   download_crawler,
                                                   wechatId="微信公众号")

                # dnews["wechatId"] = wechatId
                # dnews["wechatName"] = wechatName
                dnews["title"] = title
                dnews["date"] = news_time - datetime.timedelta(hours=8)
                dnews["processStatus"] = 0
                dnews["imgChecked"] = True
                dnews["category"] = None

                if dnews["result"] == 'SUCCESS' and len(
                        dnews["contents"]) >= 1:
                    dnews.pop('result')
                    try:
                        id = collection_news.insert(dnews)
                        logger.info("**************: %s", id)
                        cnt += 1
                    except Exception, e:
                        logger.info(e)
                        pass
        except:
            pass

    mongo.close()
    return cnt
Example #20
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        # type = TYPE

        category = None

        title = d('.article_title p').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return

        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com"+ post

        brief = None
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()

        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        # dt = datetime.date.today()
        # today = datetime.datetime.now()
        # if news_time is None or news_time > today:
        #     news_time = datetime.datetime.now()

        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief

        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})

            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")

    return
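The commented-out update branch above (look up the old _id, delete, reinsert) could be a single call; a sketch, assuming pymongo 3.x:

    # replace_one matches on link and keeps the existing _id,
    # so no delete-and-reinsert dance is needed.
    collection_news.replace_one({"link": newsurl}, dnews)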
Example #21
import os
import sys
import pymongo
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../support'))
import loghelper
import util, name_helper, url_helper, download, db

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import parser_db_util

#logger
loghelper.init_logger("card_d3", stream=True)
logger = loghelper.get_logger("card_d3")

download_crawler = download.DownloadCrawler(use_proxy=True)
SOURCE = 13121
# Parse data from qimingpian directly; bamy called it step 1 to check out companies.


def parse_company(item):
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    #company basic info
    c = item["data"]["basic"]

    tags = c["tags"]

    tags_str = tags.replace("|", ",")
Example #22
def process_news(content, news_key, url):
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon, post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time, news_time, temp,  category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        # # logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})

        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})

        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief,2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post

        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #23
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]

        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key

        type = TYPE

        category = None
        categoryNames = []
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")

        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103

        tags = []
        if content.has_key("keywordList") is True and len(
                content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag[
                        "keyword"] is not None and tag["keyword"].strip(
                        ) != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])

        title = content["news"]["title"].replace("&quot;", "\"")

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return

        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["news"]["introduction"]

        post_time = content["news"]["updateAt"]

        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # return
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     logger.info("***************************News existed!!!***********************")
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return
Example #24
def process_news(column, j_content, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = j_content['id']
        type = TYPE
        title = j_content['title']

        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']

        try:
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Example #25
def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()

        if column is not None:
            tags = column.split()
        else:
            tags = []

        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None

        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())

        postraw = d("link[rel='image_src']").attr("href")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        # if collection_news.find_one({"link": url}) is not None:
        #     return
        #     # collection_news.delete_one({"link": url})
        #
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []

        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"].replace("?imageView2/2/w/750/q/90", ""),
                         download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

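        # "post" keeps the raw poster URL when no crawler is available; "postId" stores the mirrored image id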
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

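        # guard against future-dated articles by falling back to now (-8h, assuming the feed reports Beijing time)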
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Exemple #26
0
def process(crawler):
    while True:
        if len(URLS) == 0: return
        linkDict = URLS.pop(0)

        retry = 0

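        # retry each article up to 6 times before moving on to the next URL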
        while True:
            retry += 1
            if retry > 6: break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8", "ignore")))

                title = linkDict['title']
                key = url.split('/')[-1]

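                # map the breadcrumb category to an internal TYPE/category pair, defaulting to 60001 (generic news)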
                category = d('.al-crumbs a:nth-child(2)').text()

                if category in categoryDict:
                    TYPE = categoryDict[category]['type']
                    category = categoryDict[category]['category']
                else:
                    TYPE = 60001
                    category = None

                brief = linkDict['brief']
                postraw = linkDict['post']

                tags = []
                # for tag in d('.tags').text().split():
                #     if tag.strip() not in tags: tags.append(tag)

                news_time = d('.article__published').eq(0).text()
                # news_time = datetime.datetime.strptime(' '.join(news_time.split(' ')[:2]), '%Y年%m月%d日 %H:%M')
                # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %p %I:%M')
                news_time = datetime.datetime.strptime(news_time,
                                                       '%Y/%m/%d %H:%M')

                flag, domain = url_helper.get_domain(url)
                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": None,
                    "type": TYPE,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }

                article = d('.article__content').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        if download_crawler is None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": "",
                                "image_src": c["data"],
                            }
                        else:
                            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                                c["data"], download_crawler, SOURCE, key, "news")
                            if imgurl is not None:
                                dc = {
                                    "rank": rank,
                                    "content": "",
                                    "image": str(imgurl),
                                    "image_src": "",
                                    "height": int(height),
                                    "width": int(width)
                                }
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents

                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)

                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)

                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post

                # brief=brief[:100]
                dnews["brief"] = brief

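                # insert only when the link is new and more than one content block was parsed; known links are merely logged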
                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})

                if len(dcontents) > 1:
                    if item is None:
                        # collection_news.insert(dnews)
                        nid = parser_mongo_util.save_mongo_news(dnews)
                        logger.info("Done: %s", nid)
                    else:
                        logger.info("update %s", url)
                        #     oldId = collection_news.find_one({"link": url})['_id']
                        #     collection_news.delete_one({"link": url})
                        #     dnews['_id'] = oldId
                        #     collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title,
                            news_time, category, " ".join(tags), brief, post)
                logger.info("*************DONE*************")
                break
Exemple #27
0
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
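    # NOTE: the has_news_content guard is bypassed here, so every fetched page is parsed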
    if 1:
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".htm", "")

        title = d('h1.title').text().strip()

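        # dedupe by title before doing any heavy parsing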
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        brief = None

        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')

        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()

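        # news_classify.get_class(..., 13866) == 1 flags funding news (TYPE 60001); anything else is 60010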
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)

        dnews['type'] = TYPE

        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
    return
Exemple #28
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        # d = pq(html.fromstring(content.decode("gbk","ignore")))
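        # pick the decoder by sniffing the declared charset: utf-8 unless the page announces gb2312, then gbk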
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".shtml", "")

        type = TYPE

        post = None

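        # the utf-8 and gbk page templates use different title markup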
        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
            # logger.info("title: %s", title)
            if title is None or title.strip() == "":
                title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        # try:
        #     brief = d('div.daodu> p').text().strip().replace("【数据猿导读】","")
        # except:
        #     brief = None
        brief = None

        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Exemple #29
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

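        # a "融资" (funding) keyword in the title reclassifies the article as funding news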
        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101

        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d("span.article-time").eq(0).text().strip()

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
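        # skip boilerplate chunks: wallet-address lines, copyright notices, and site decoration images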
        for c in contents:
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue

            if c["data"].find("8btctest1/custom/images") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Exemple #30
0
def process_news(content, news_key, url, news_posttime):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8', 'ignore')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)

        key = news_key
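        # og:image supplies the cover; TechCrunch's default placeholder counts as no image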
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png")>=0:
                postraw = None
        except:
            postraw = None
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        divtags = d('div.tags> div.tag-item')
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags if pq(divtag)('a.tag').text().strip() is not None]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time, news_time, ":".join(tags),category)

        article = d('div.article-entry.text').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

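        # dedupe: skip keys already crawled from this source and titles already stored under other sources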
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "" or post.find("techcrunch.opengraph.default.png")>=0:
        #     post = util.get_poster_from_news(dcontents)
        #
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # collection_news.insert(dnews)
            # mongo.close()
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)

        logger.info("Done")