Example 1
def process_news(item, url, content, category_ori):
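    """Parse a single article page: pick the title from whichever header
    class the template uses, recover the publish time from the
    og:updated_time meta tag (falling back to the on-page date string),
    dedupe against MongoDB by link and by title, then build the dnews
    document and persist it via parser_mongo_util.save_mongo_news."""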
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        if content.find("c-single-normal__title") >= 0:
            title = d('h1.c-single-normal__title').text().strip()
        elif content.find("c-article-header__title") >= 0:
            title = d('h1.c-article-header__title').text().strip()
        else:
            return
        try:
            # "d" already wraps the decoded document, so reuse it here.
            post_time = d("meta[property='og:updated_time']").attr(
                "content").split("+")[0]
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%dT%H:%M:%S")
        except Exception:
            datecontent = d(
                'div.c-article-header-meta> span.c-article-header-meta__time'
            ).text().strip()
            logger.info("Date********%s", datecontent)
            result = util.re_get_result(r'(\d{4}-)', datecontent)
            if result:
                news_time = datetime.datetime.strptime(datecontent,
                                                       "%Y-%m-%d %H:%M")
            else:
                # The scraped date lacks a year; prefix the current one.
                today = datetime.date.today()
                post_time = str(today.year) + '-' + datecontent
                news_time = datetime.datetime.strptime(post_time,
                                                       "%Y-%m-%d %H:%M")

        key = item["key"]
        column = d(
            'div.c-article-header-meta> span.c-article-header-meta__category'
        ).text().strip()
        brief = d("meta[name='description']").attr("content")[:100]

        if column is not None:
            tags = column.split()
        else:
            tags = []

        categoryNames = []
        category = categoryDict[category_ori]
        if category == 60105:
            categoryNames.append("大公司")

        keywords = d('div#article-content> div.c-article-tags').text()
        if keywords is not None:
            for keyword in keywords.split():
                if keyword is not None and keyword.strip() not in tags:
                    tags.append(keyword.strip())
        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags), brief)
        article = d('article.s-single-article').html()
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"link": url}) is not None:
            return
        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        postraw = d("meta[property='og:image']").attr("content")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "" or (
                post.find("http://") == -1 and post.find("https://") == -1):
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief[:100]

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example 2
def process_news(column, newsurl, content, newspost, download_crawler):
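    """Parse an article whose body is embedded in an inline JS object:
    the content and time fields are pulled out of the raw page with
    regexes, HTML entities are unescaped, and the document is saved after
    a title-based dedupe check against MongoDB."""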
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8","ignore")))

        key = newsurl.split("/")[-1].replace("i","")

        type = TYPE

        category = None
        title = d('head> title').text().strip()

        r = "content: '(.*?)',.*groupId"

        result = util.re_get_result(r, content)
        (b,) = result
        logger.info(b)

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        post = None

        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"

            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        bb = b.replace('&lt;', "<").replace("&gt;",">").replace("&quot;","\"").replace("&#x3D;","=")
        logger.info(bb)

        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
Example 3
def process_news(content, url):
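    """Parse a PingWest-style article: the publish time comes from the
    "ptime" unix-timestamp attribute, the category is derived from the
    column label, and the document is deduped by link and title before
    being saved."""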
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)

        if collection_news.find_one({"link": url}) is not None:
            return

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except (TypeError, ValueError):
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()

        if column is not None:
            tags = column.split()
        else:
            tags = []

        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None

        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())

        postraw = d("link[rel='image_src']").attr("href")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        contents = extract.extractContents(url, article)

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []

        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"].replace("?imageView2/2/w/750/q/90", ""),
                         download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example 4
def process_news(column, newsurl, content, newspost, topic, download_crawler):
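    """Parse a page that may be served as gb2312 or utf-8: "utfflag"
    selects the decode codec and the matching title/time/body selectors,
    and the resulting document is deduped by title before being saved."""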
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".shtml", "")

        type = TYPE

        post = None

        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
            # logger.info("title: %s", title)
            if title is None or title.strip() == "":
                title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        brief = None

        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
Example 5
def process_news(column, newsurl, content, newspost, topic, download_crawler):
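    """Parse an article page and flag paywalled stories: if a content
    block contains the DealStreetAsia subscription notice, processStatus
    is set to -5 before the document is saved."""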
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-2].replace(".html", "")

        type = TYPE

        title = d('h1.single-title').text().strip()

        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # attr() returns None when the tag is missing, so no try/except
        # is needed here.
        brief = d("meta[name='description']").attr("content")

        # No reliable on-page timestamp here; fall back to crawl time.
        news_time = datetime.datetime.now()

        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
    return
Example 6
def process_news(column, newsurl, content, newspost, download_crawler):
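    """Parse an article page: collect tags from the keywords meta tag,
    stop extracting body blocks at the "◆END◆" marker, skip promotional
    lines, and dedupe by title before saving."""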
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None

        title = d('h1.entry-title').text().strip()

        if title is None or title == "":
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # Normalise fullwidth commas before splitting.
            for tag in articletags.replace("，", ",").split(","):
                tag = tag.strip()
                if tag != "" and tag not in tags and tag != title:
                    tags.append(tag)
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # post = d("meta[property='og:image']").attr("content")
        # attr() returns None when the tag is missing, so no try/except
        # is needed here.
        brief = d("meta[name='description']").attr("content")
        try:
            post_time = d('time.entry-date').attr("datetime").split("+")[0]
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%dT%H:%M:%S")
        except Exception:
            news_time = datetime.datetime.now()
        article = d('div.entry-content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("◆END◆") >= 0 or c["data"].find("…………………") >= 0:
                break

            if c["data"].find("ACG 领域最具影响力的产业新媒体") >= 0 or c["data"].find(
                    "访问三文娱网站3wyu.com查看产业必读文章") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
Example 7
def process_news(column, newsurl, content, newsposttime, download_crawler):
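    """Parse an article page: prefer the publish time passed in from the
    list page (newsposttime), falling back to the on-page date; articles
    tagged "投资并购" are categorised as funding news, and the sector is
    hard-coded to [10]."""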
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip()

        type = TYPE

        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                tag = tag.strip()
                if tag != "" and tag not in tags and tag != title:
                    tags.append(tag)

        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")

        post = None

        brief = d("meta[name='description']").attr("content")

        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")

        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief

        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example 8
def process_news(column, newsurl, content, newspost, download_crawler):
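    """Parse a gbk-encoded article page, then run news_classify over the
    extracted content blocks to decide whether the story is funding news
    (type 60001) or not (type 60010) before saving."""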
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('gbk', 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")
        try:
            key_int = int(key)
        except ValueError:
            key_int = None

        title = d('div.article> h1').text().strip()

        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        brief = None

        try:
            news_time = d('div.info> p> em').eq(0).text()
            news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        except Exception:
            news_time = datetime.datetime.now()

        article = d('div.article-t').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            # "type" is set below, once the article has been classified.
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("var currentPage") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                break

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            news_type = 60001
        else:
            news_type = 60010
            logger.info('%s is not fundingNews', title)

        dnews['type'] = news_type

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
    return
Example 9
def process_news(item, url, content):
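    """Parse an article page: the topic cover image is prepended as the
    first content block, the category/type pair is derived from the
    column tags, and the document is deduped by (source, key_int) and by
    title before being saved."""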
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        title = d('div.main-wrap> header> h1').text().strip()
        post_time = d('div.topic-info> span.release-date> span').attr(
            "data-time")
        # "data-time" is a unix timestamp; convert it to a local datetime.
        news_time = datetime.datetime.fromtimestamp(int(post_time))

        key = item["key"]
        column = d('div.main-wrap> div.label').text().strip()
        brief = d("meta[name='description']").attr("content")
        if brief is not None:
            brief = brief.strip()

        if column is not None:
            tags = column.split()
        else:
            tags = []

        category = None
        categoryNames = []
        if "深度报道" in tags:
            type = 60003
            category = 60107
        else:
            type = 60001

            if "极客早知道" in tags:
                category = 60105
                categoryNames.append("大公司")

        # "d" already wraps the decoded document, so reuse it here.
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags:
                    tags.append(keyword.strip())

        logger.info("%s, %s, %s, %s, %s, %s -> %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category)
        article = d(
            'section.main-content> article> div.article-content').html()
        contents = extract.extractContents(url, article)

        if collection_news.find_one({
                "source": SOURCE,
                "key_int": int(key)
        }) is not None:
            return

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        article_img = d(
            'section.main-content> article> div.topic-cover> img').attr("src")
        if article_img is not None:
            dc = {
                "rank": 1,
                "content": "",
                "image": "",
                "image_src": article_img,
            }
            dcontents.append(dc)

        rank = len(dcontents) + 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example 10
def process_news(column, newsurl, content):
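    """Parse a flash-news item: the body is just two blocks, a fixed
    "亿欧快讯" label plus the post description, saved without any image
    handling."""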
    if has_news_content(content):
        d = pq(content)
        title = d('div#post_title').text()
        url = newsurl
        key = url.split('/')[-1]
        post_time = d('div#post_date').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        logger.info("title:%s, date:%s", title, news_time)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": [],
            "processStatus": 0,
            # "companyId":companyId,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }

        dcontents = []
        description = d('div#post_description').text()
        if description is not None:
            dc = {
                "rank": 1,
                "content": "亿欧快讯",
                "image": "",
                "image_src": "",
            }

            dcontents.append(dc)
            dc = {
                "rank": 2,
                "content": description.replace("【消息来源】", ""),
                "image": "",
                "image_src": "",
            }
            dcontents.append(dc)

            logger.info(description)

        dnews["contents"] = dcontents

        brief = util.get_brief_from_news(dcontents)

        post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example 11
def process(crawler, outlink=None):
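    """Worker loop: drain the module-level URLS queue (or crawl a single
    outlink), retrying each URL up to 10 times; already-crawled links are
    detected by their "link" field in MongoDB and skipped."""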
    while True:
        if outlink is None:
            if len(URLS) == 0: return
            linkDict = URLS.pop(0)
        else:
            linkDict = {
                "href": outlink,
                "post": None,
                "title": None
            }

        retries = 0
        while True:
            if retries > 10: break
            retries += 1
            download_crawler = download.DownloadCrawler(use_proxy=False)

            url = linkDict['href']
            result = crawler.crawl(url, agent=True)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8", 'ignore')))

                title = linkDict['title']
                if title is None:
                    title = d('h1.headTit').text().strip()
                key = url.split('/')[-1].split('.')[0]


                brief = d('.article-lead').text().replace('导语:', '')
                postraw = linkDict['post']

                tags = []
                for tag in d('.related-link a'):
                    t = tag.text.strip()
                    if t not in tags: tags.append(t)

                news_time = d('.inner .time').text()
                news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')

                flag, domain = url_helper.get_domain(url)

                try:
                    key_int = int(key)
                except ValueError:
                    key_int = None
                category = None

                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": key_int,
                    "type": 60001,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }

                article = d('.lph-article-comView').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        if c["data"].find("未经授权禁止转载。详情见转载须知。") > 0: continue
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        if download_crawler is None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": "",
                                "image_src": c["data"],
                            }
                        else:
                            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler,
                                                                                        SOURCE, key, "news")
                            if imgurl is not None:
                                dc = {
                                    "rank": rank,
                                    "content": "",
                                    "image": str(imgurl),
                                    "image_src": "",
                                    "height": int(height),
                                    "width": int(width)
                                }
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents

                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)

                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key,
                                                                             "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)

                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post
                dnews["brief"] = brief

                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})
                if item is None:
                    nid = parser_mongo_util.save_mongo_news(dnews)
                    logger.info("Done: %s", nid)
                else:
                    logger.info("update %s", url)
                    # oldId = collection_news.find_one({"link": url})['_id']
                    # collection_news.delete_one({"link": url})
                    # dnews['_id'] = oldId
                    # collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time, category, " ".join(tags), brief, post)
                # logger.info("*************DONE*************")
                break
        if outlink is not None:
            break
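
Esempio n. 11 wraps each fetch in a bounded retry loop: up to 10 attempts per URL, breaking out early only on a successful crawl. A stripped-down sketch of that control flow (the crawler object and its result dict shape are assumptions carried over from the example):

def crawl_with_retries(crawler, url, max_retries=10):
    # returns raw page content, or None once the retry budget is spent
    retries = 0
    while retries < max_retries:
        retries += 1
        result = crawler.crawl(url, agent=True)
        if result['get'] == 'success':
            return result['content']
    return None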
Esempio n. 12
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):

        d = pq(html.fromstring(content.decode('utf-8', "ignore")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None

        title = d('div#content> article> header> h1').text().strip()
        [author, cleanTitle] = clean_title(title)

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")
        news_time = None
        post_time = d('li.post-time> time').text()
        logger.info(post_time)

        if post_time.find("月") >= 0:
            dt = datetime.date.today()
            today = datetime.datetime(dt.year, dt.month, dt.day)
            if datetime.datetime.strptime(post_time, "%Y年%m月%d日") == today:
                news_time = datetime.datetime.now()

        if news_time is None:
            if post_time is not None:
                news_time = datetime.datetime.strptime(post_time, "%Y年%m月%d日")
            else:
                news_time = datetime.datetime.now()

        article = d('div#content> article> div.entry-content').html()
        # logger.info(article)
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "author": author,
            "cleanTitle": cleanTitle,
            "categoryNames": []
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("报告下载")>=0 and c["data"].find("回复关键词")>=0 or c["data"].find("原创编译")>=0 or \
                            c["data"].find("199IT感谢您的支持!") >= 0:
                continue
            if c["data"].find("其它年份报告,请点击下载")>=0 or c["data"].find("var wum")>=0 or \
                    c["data"].find("原创编译自") >=0 or c["data"].find("更多阅读")>=0:
                break

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # mid = collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************%s",mid)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
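
Esempio n. 12 parses dates written as YYYY年MM月DD日 and, when the article is from today, substitutes the crawl time so same-day posts keep an hour and minute. A sketch of that logic under the same format assumption:

import datetime

def parse_cn_post_time(post_time):
    # 'YYYY年MM月DD日' -> datetime; today's date or a missing value falls back to now()
    if not post_time:
        return datetime.datetime.now()
    parsed = datetime.datetime.strptime(post_time, "%Y年%m月%d日")
    dt = datetime.date.today()
    if parsed == datetime.datetime(dt.year, dt.month, dt.day):
        return datetime.datetime.now()
    return parsed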
Esempio n. 13
def process(crawler):
    while True:
        if len(URLS) == 0: return
        linkDict = URLS.pop(0)
        retries = 0

        while True:
            if retries > 6: break
            retries += 1

            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(result['content'])

                title = linkDict['title']
                key = url.split('=')[-1]

                if linkDict['category'] in categoryDict:
                    type = categoryDict[linkDict['category']]['type']
                    category = categoryDict[linkDict['category']]['category']
                else:
                    type = 60001
                    category = None

                brief = linkDict['brief']
                postraw = linkDict['post']

                tags = []
                for tag in d('.txt span em').text().split():
                    if tag.strip() not in tags: tags.append(tag)
                for tag in d('.pag span').text().split():
                    if tag.strip() not in tags: tags.append(tag)

                news_time = linkDict['date']

                flag, domain = url_helper.get_domain(url)
                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": int(key),
                    "type": type,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }

                article = d('.article .top').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        if download_crawler is None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": "",
                                "image_src": c["data"],
                            }
                        else:
                            (imgurl, width,
                             height) = parser_mysql_util.get_logo_id_new(
                                 c["data"], download_crawler, SOURCE, key,
                                 "news")

                            if imgurl is not None:
                                dc = {
                                    "rank": rank,
                                    "content": "",
                                    "image": str(imgurl),
                                    "image_src": "",
                                    "height": int(height),
                                    "width": int(width)
                                }
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents

                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)

                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)

                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post
                dnews["brief"] = brief

                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})
                if item is None:
                    nid = parser_mongo_util.save_mongo_news(dnews)
                    logger.info("Done: %s", nid)
                else:
                    logger.info("update %s", url)
                    # oldId = collection_news.find_one({"link": url})['_id']
                    # collection_news.delete_one({"link": url})
                    # dnews['_id'] = oldId
                    # collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title,
                            news_time, category, " ".join(tags), brief, post)
                # logger.info("*************DONE*************")
                break
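
Esempio n. 13, like n. 11, deduplicates on the article link before saving: when a document with the same link already exists it only logs an update and moves on. A sketch of that guard using plain pymongo (the examples route the actual insert through parser_mongo_util.save_mongo_news instead):

def save_if_new(collection_news, dnews):
    # insert only when the link has not been crawled before; the dedup key is the URL
    if collection_news.find_one({"link": dnews["link"]}) is None:
        return collection_news.insert_one(dnews).inserted_id
    return None  # caller logs "update <url>" and skips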
Esempio n. 14
def process_news(item):
    # if has_news_content(item):
    if 1:
        d = pq(item)
        title = d('.item-title').text()
        url = d('.item-title').attr('href')
        key = url.split('/')[-1].split('.')[0]
        date = d('.news-time').attr('data-time')
        news_time = datetime.datetime.fromtimestamp(float(date))

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": [],
            "processStatus": 0,
            # "companyId":companyId,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }

        dcontents = []
        description = d('.item-desc').text()
        if description is not None:
            dc = {
                "rank": 1,
                "content": "创业邦快讯",
                "image": "",
                "image_src": "",
            }

            dcontents.append(dc)
            dc = {
                "rank": 2,
                "content": description,
                "image": "",
                "image_src": "",
            }
            dcontents.append(dc)

            logger.info(description)

        dnews["contents"] = dcontents

        brief = util.get_brief_from_news(dcontents)

        post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Esempio n. 15
def process(crawler):
    while True:
        if len(URLS) == 0: return
        linkDict = URLS.pop(0)

        retry = 0

        while True:
            retry += 1
            if retry > 6: break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8")))

                title = linkDict['title']
                key = url.split('/')[-1]

                category = d('.al-crumbs a:nth-child(2)').text()

                if category in categoryDict:
                    TYPE = categoryDict[category]['type']
                    category = categoryDict[category]['category']
                else:
                    TYPE = 60001
                    category = None

                brief = linkDict['brief']
                postraw = linkDict['post']

                tags = []
                # for tag in d('.tags').text().split():
                #     if tag.strip() not in tags: tags.append(tag)

                news_time = d('.article__published').eq(0).text()
                # news_time = datetime.datetime.strptime(' '.join(news_time.split(' ')[:2]), '%Y年%m月%d日 %H:%M')
                # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %p %I:%M')
                news_time = datetime.datetime.strptime(news_time,
                                                       '%Y/%m/%d %H:%M')

                flag, domain = url_helper.get_domain(url)
                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": None,
                    "type": TYPE,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }

                article = d('.article__content').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        if download_crawler is None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": "",
                                "image_src": c["data"],
                            }
                        else:
                            (imgurl, width,
                             height) = parser_mysql_util.get_logo_id_new(
                                 c["data"], download_crawler, SOURCE, key,
                                 "news")
                            if imgurl is not None:
                                dc = {
                                    "rank": rank,
                                    "content": "",
                                    "image": str(imgurl),
                                    "image_src": "",
                                    "height": int(height),
                                    "width": int(width)
                                }
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents

                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)

                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)

                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post

                # brief=brief[:100]
                dnews["brief"] = brief

                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})

                if len(dcontents) > 1:
                    if item is None:
                        # collection_news.insert(dnews)
                        nid = parser_mongo_util.save_mongo_news(dnews)
                        logger.info("Done: %s", nid)
                    else:
                        logger.info("update %s", url)
                        #     oldId = collection_news.find_one({"link": url})['_id']
                        #     collection_news.delete_one({"link": url})
                        #     dnews['_id'] = oldId
                        #     collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title,
                            news_time, category, " ".join(tags), brief, post)
                logger.info("*************DONE*************")
                break
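
Esempio n. 15 resolves the article type and category from categoryDict, keyed by the breadcrumb label, defaulting to type 60001 and no category when the label is unknown. The same lookup as a small helper (the dict layout is inferred from its usage above):

DEFAULT_TYPE = 60001

def resolve_category(category_dict, crumb):
    # returns (type, category), with the parsers' defaults for unknown labels
    entry = category_dict.get(crumb)
    if entry is not None:
        return entry['type'], entry['category']
    return DEFAULT_TYPE, None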
Esempio n. 16
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):

        d = pq(html.fromstring(content.decode('gbk', "ignore")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None

        title = d('div#postlist> table> tr> td.plc.ptm.pbn> h1').text().strip()
        [author, cleanTitle] = clean_title(title)

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")
        news_time = None
        post_time = d(
            'div#postlist> div> table.plhin> tr> td> div> div> div.authi> em'
        ).eq(0).text()
        logger.info(post_time)

        # if post_time.find("月") >=0:
        #     dt = datetime.date.today()
        #     today = datetime.datetime(dt.year, dt.month, dt.day)
        #     if post_time is None or datetime.datetime.strptime(post_time, "%Y年%m月%d日") == today:
        #         news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.strptime(
                post_time.replace("发布时间: ", ""), "%Y-%m-%d %H:%M")

        article = d('div.t_fsz> table> tr> td.t_f').eq(0).html()
        # logger.info(article)
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "author": author,
            "cleanTitle": cleanTitle,
            "categoryNames": []
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # logger.info(c["data"])
            if c["data"].find("image/common/none.gif") >= 0 or c["data"].find(
                    "下载本地保存到信息图册") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
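
Esempio n. 16 builds its tag list from the keywords meta element: split on commas, drop empties and duplicates, and never keep a tag equal to the title. A condensed sketch of that filter:

def extract_tags(articletags, title):
    # comma-separated keywords -> deduplicated tag list, title excluded
    tags = []
    if articletags is not None:
        for tag in articletags.split(","):
            tag = tag.strip()
            if tag != "" and tag not in tags and tag != title:
                tags.append(tag)
    return tags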
Esempio n. 17
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None

        title = d('article> h1').text().strip()

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        if "English" in tags or "english" in tags:
            logger.info("Englis not needed, get out!")
            return

        if "商业价值杂志" in tags:
            type = 60003
            category = 60107
        # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        postraw = d("meta[property='og:image']").attr("content")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("article> p.post-abstract").text().strip().replace(
            '摘要: ', "")

        post_time = d('article> div.post-info> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article> div.inner').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg":
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            # collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    return
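
Esempio n. 17 routes articles by tag before saving: English-tagged posts are skipped outright, and posts tagged 商业价值杂志 are reclassified. A sketch of that routing (the numeric codes are copied from the example):

def route_by_tags(tags, default_type):
    # returns (type, category), or (None, None) to signal the article should be skipped
    if "English" in tags or "english" in tags:
        return None, None
    if "商业价值杂志" in tags:
        return 60003, 60107
    return default_type, None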
Esempio n. 18
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('processing %s', newsurl)

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('div.post-inner> h1').text().strip()

        if title is None or title == "":
            return

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")

        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d("p.post-byline> time.published").text().strip()

            logger.info('post time: %s' % post_time)

            p = re.compile(u'(年|月)')
            post_time = p.sub('-', post_time).replace('日', '')

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.entry-inner').html()

        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c['data'].find('文章相关引用及参考') >= 0 or c['data'].find(
                    '读者QQ群') >= 0:
                continue

            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
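
Esempio n. 18 normalizes dates such as 2018年3月19日 into 2018-3-19 before handing them to the generic extract.extracttime parser. The same regex substitution in isolation:

import re

_CN_DATE = re.compile(u'(年|月)')

def normalize_cn_date(post_time):
    # 'YYYY年MM月DD日' -> 'YYYY-MM-DD' so a generic date parser can read it
    return _CN_DATE.sub('-', post_time).replace(u'日', '')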
Esempio n. 19
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".shtml", "")

        type = TYPE

        category = None

        title = d('div.subject> h1').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []

        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
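
The contents-to-dcontents loop is the one block every example repeats: text items become rows with content filled in, image items are downloaded and stored by id, and rank advances only for rows that are kept. A condensed sketch of that shared shape, with the downloader abstracted behind a callable (parser_mysql_util.get_logo_id_new in the examples):

def build_dcontents(contents, download_image):
    # download_image(src) -> (imgurl, width, height), with imgurl None on failure
    dcontents = []
    rank = 1
    for c in contents:
        if c["type"] == "text":
            dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
        else:
            imgurl, width, height = download_image(c["data"])
            if imgurl is None:
                continue  # dropped images do not consume a rank
            dc = {"rank": rank, "content": "", "image": str(imgurl),
                  "image_src": "", "height": int(height), "width": int(width)}
        dcontents.append(dc)
        rank += 1
    return dcontents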
Esempio n. 20
def process_news(content, url, key, col):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        title = d('div.cj_content> div.cj_top> div.cj_tit> h2').text().strip().replace("&quo;", "\"")
        datecontent = d('div.cj_content> div.cj_top> div.cj_tit> p.fa').text()
        result = util.re_get_result('(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrcet post time")
            return

        try:
            key_int = int(key)
        except:
            key_int = None

        brief = d("meta[name='description']").attr("content").strip()

        if col["column"] == "view":
            type = 60003
        else:
            type = TYPE

        categoryNames = []
        category = col["category"]
        if category == 60105: categoryNames.append("大公司")
        if category == 60101: categoryNames.append("融资")

        tags = []
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags:
                    tags.append(keyword.strip())
        postraw = d('div.cj_content> div.cj_top> img.gg').attr("src")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time, news_time, brief, ":".join(tags), category, post)

        article = d('div.para_ycont> div.col-xs-12').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})

        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []


        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"].replace("?imageView2/2/w/750/q/90",""), download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Esempio n. 21
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('processing %s', newsurl)

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        if d.text().find('embed') >= 0:  # skip video articles
            logger.info('not an article: %s' % newsurl)
            return

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('h1').text().strip()

        if title is None or title == "":
            return

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:

            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find(
                'span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find(
                'p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2

            if re.match('\d{2}-\d{2}', post_time):  # match month-day dates such as 03-19
                post_time = str(time.localtime()[0]) + '-' + post_time

            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article.wyt-post-content').html()

        contents = extract.extractContents(newsurl, article, document=True)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1

        if contents and contents[0]['type'] == 'img':
            del contents[0]  # drop the leading header image; guard against empty contents

        for c in contents:
            # logger.info("%s-%s",c["type"],c["data"])
            if c['type'] == 'text':

                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue

                # if c['data'].find('译者') >= 0:
                #     c['data'] = c['data'].split(' ')[0]
                #
                # if c['data'].find('来源') >= 0:
                #     c['data'] = c['data'].split('|')[0]

                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')

                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
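
Esempio n. 21 completes month-day dates such as 03-19 with the current year before parsing. The same normalization in isolation (the end anchor on the pattern is a small tightening of the example's re.match):

import re
import time

def complete_year(post_time):
    # '03-19' -> 'YYYY-03-19' with the current year; full dates pass through unchanged
    if re.match(r'\d{2}-\d{2}$', post_time):
        return str(time.localtime()[0]) + '-' + post_time
    return post_time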
Esempio n. 22
def process_news(column, d_map, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []
        type = TYPE

        key = d('div#sb-site> article').attr('data-id')
        title = d('div#sb-site> article> section#article-header> h2> strong> a'
                  ).text().strip()
        newspost = d(
            'div#sb-site> article> section#article-image> div> figure> img'
        ).attr('src')
        logger.info('newspost:%s' % newspost)
        newsurl = d_map['link']

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        brief = d("meta[name='description']").attr('content')
        post_time = d('div#sb-site> article> section#article-meta> span> em'
                      ).text().strip()
        news_time = None
        is_re = re.search('(\d{2}-\d{2}-\d{4})', post_time)
        is_re2 = re.search('(\d+) hours ago', post_time)
        if is_re:
            news_time = datetime.datetime.strptime(is_re.group(1), "%d-%m-%Y")
        elif is_re2:
            news_time = datetime.datetime.now() - datetime.timedelta(
                hours=int(is_re2.group(1)))
        elif post_time.find('a moment') >= 0:
            news_time = datetime.datetime.now()
        if news_time is None:
            # unrecognised timestamp format: fall back to the crawl time
            news_time = datetime.datetime.now()

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        article = d(
            'div#sb-site> article> section#article-content> div.post-content> div.row'
        ).html()
        is_re3 = re.search("(<strong>DailySocial\.id.*?</p>)", article, re.S)
        if is_re3:
            article = article.replace(is_re3.group(1), '')
        contents = extract.extractContents(newsurl, article, document=False)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("Also Read") >= 0 or c['data'].find(
                    'function()') >= 0:
                continue
            # if c['data'].find('caption') >= 0:
            #     c['data'] = c['data'].replace('caption','')

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        return
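
The date handling in the example above mixes absolute dates ("dd-mm-yyyy") with relative phrases ("N hours ago", "a moment ago"). A sketch of that parsing as one helper; note the original single-digit pattern (\d) would miss "12 hours ago", so this sketch uses \d+ instead:

import re
import datetime


def parse_post_time(post_time):
    # Absolute date, e.g. "21-05-2018".
    m = re.search(r'(\d{2}-\d{2}-\d{4})', post_time)
    if m:
        return datetime.datetime.strptime(m.group(1), "%d-%m-%Y")
    # Relative hours, e.g. "3 hours ago" (the original regex only
    # matched a single digit).
    m = re.search(r'(\d+) hours? ago', post_time)
    if m:
        return datetime.datetime.now() - datetime.timedelta(hours=int(m.group(1)))
    if 'a moment' in post_time:
        return datetime.datetime.now()
    return None
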
Esempio n. 23
0
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].replace('.html', '')

        # type = TYPE

        category = None

        title = d('.single-post-title').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return

        tags = []
        articletags = d(".mb-2 a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        postraw = None
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://luxe.com"+ post

        # brief = d(".intr").text()
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()
        brief = None

        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M:%S')

        news_time = d('.post-meta').text().split()[-1]
        news_time = extract.extracttime(news_time)

        # dt = datetime.date.today()
        today = datetime.datetime.now()
        if news_time is None or news_time > today:
            news_time = datetime.datetime.now()

        article = d('.post-body').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)

        flag, domain = url_helper.get_domain(newsurl)

        typeNames = d('.breadcrumb-item a').text()
        # u'金融' (finance) / u'融资' (funding) in the breadcrumb, or u'融资' /
        # u'投资' (investment) in the title, mark the article as a funding story
        TYPE = 60001 if typeNames.find(u'金融') >= 0 or typeNames.find(
            u'融资') >= 0 else 60005
        if title.find(u'融资') >= 0 or title.find(u'投资') >= 0:
            TYPE = 60001
            category = 60101
        else:
            category = None

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief

        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})

            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")

    return
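
The duplicate check above relies on an exact title match in the article.news collection. A self-contained sketch of the same check with pymongo, assuming the collection layout used throughout these examples; the connection URI is a placeholder:

from pymongo import MongoClient


def is_duplicate_title(title, mongo_uri="mongodb://localhost:27017"):
    # Mirrors collection_news.find_one({"title": title}) from the
    # examples; db.connect_mongo() in the original presumably wraps a
    # client like this one.
    client = MongoClient(mongo_uri)
    try:
        return client.article.news.find_one({"title": title}) is not None
    finally:
        client.close()
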
Esempio n. 24
0
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # d = pq(html.fromstring(content.decode("utf-8","ignore")))
        if content.find("charset=GBK") == -1:
            d = pq(html.fromstring(content.decode("utf-8","ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
            utfflag = False

        key = newsurl.split("?")[0].split("/")[-1].replace(".shtml","")

        type = TYPE

        category = None
        categoryNames = []

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        if utfflag is True:
            title = d('article> div> h1').text().strip()
        else:
            title = d('div.titleH> h1').text().strip()
        logger.info("title: %s",title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return


        # post = d('div#post_thumbnail> img').attr("src")
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        if utfflag is True:
            post_time = d('p.source> span.f-right').eq(0).text()
        else:
            post_time = d('div.titleH> p.zsp> span').eq(2).text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        # article = d('div.contdiv').html()
        if utfflag is True:
            article = d('div.post-text').html()
        else:
            article = d('div.contdiv').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, post)
        # exit()

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("电商资讯第一入口") != -1:
                break
            if c["data"] in Nocontents:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
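
The example above sniffs the raw bytes for a charset hint before decoding, because the site serves both GBK and UTF-8 templates with different markup. A minimal sketch of that dispatch, returning the parsed document plus a flag for the caller's selector choice:

from lxml import html
from pyquery import PyQuery as pq


def parse_with_charset_sniff(content):
    # "charset=GBK" in the raw bytes selects the legacy template; the
    # boolean tells the caller which set of selectors applies.
    if content.find(b"charset=GBK") == -1:
        return pq(html.fromstring(content.decode("utf-8", "ignore"))), True
    return pq(html.fromstring(content.decode("gbk", "ignore"))), False
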
Esempio n. 25
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('processing %s', newsurl)
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None
        categoryNames = []

        title = d('div.article-section> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        post_time = d('li.date>span').text()
        logger.info(post_time)

        if post_time == str(datetime.date.today()):
            news_time = datetime.datetime.now()
        else:
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")

        article = d('div.article-section> div> article').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
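
The list pages behind the example above only show a bare date, so when that date is today the parser substitutes the crawl time to keep intraday ordering. The same logic as a tiny helper:

import datetime


def parse_list_date(post_time):
    # str(datetime.date.today()) renders as "YYYY-MM-DD", the same
    # format the page uses for older posts.
    if post_time == str(datetime.date.today()):
        return datetime.datetime.now()
    return datetime.datetime.strptime(post_time, "%Y-%m-%d")
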
Esempio n. 26
0
def process_news(column, newsurl, content, newspost, download_crawler, force):
    if has_news_content(content):
        main = pq(content)('div.article_content')
        d = pq(main)

        key = newsurl.split("/")[-1].replace(".html", "")

        title = pq(content)('head> title').text().strip()
        logger.info("title: %s", title)
        # title = d('h1#article_title').text()

        brief = pq(content)("meta[name='description']").attr("content")
        # post_time =pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)",
                                    content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrcet post time")
            logger.info(content)
            # exit()
            return

        categoryNames = []
        contents = extract.extractContents(newsurl, d.html())
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
            categoryNames.append("融资")
        else:
            category = None
        tags = []

        articletags = pq(content)("meta[name='keywords']").attr(
            "content").replace(";", ",")
        if articletags is None:
            logger.info(content)

        else:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags), brief)

        if force is True:
            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            collection_news.delete_many({
                "source": SOURCE,
                "key_int": int(key)
            })
            collection_news.delete_many({"title": title})
            mongo.close()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }

        #pjtables: flatten the project/funding table rows and append them to the article body
        pjcontents = []
        trs = pq(content)('div.proj_table> table> tr')
        logger.info("*****len of trs %s", len(trs))
        for tr in trs:
            logger.info(tr)
            co = pq(tr).text()
            logger.info(co)
            if co is not None and co.strip() != "":
                pjcontents.append(co.replace(" ", ":"))

        dcontents = []
        rank = 1
        for c in contents:
            if c["data"] == "/The End/":
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1

        for pjc in pjcontents:
            dc = {
                "rank": rank,
                "content": pjc,
                "image": "",
                "image_src": "",
            }
            dcontents.append(dc)
            logger.info(pjc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post

        # if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # id =collection_news.insert(dnews)
        # logger.info("***********id: %s", id)
        # logger.info("*************DONE**************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
    return
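
The example above flattens the funding table (div.proj_table) into plain-text content blocks so the rows survive alongside the article body. A sketch of that flattening in isolation:

from pyquery import PyQuery as pq


def table_rows_to_lines(content):
    # Join each row's cells with ":" the way the parser above does,
    # skipping empty rows.
    lines = []
    for tr in pq(content)('div.proj_table> table> tr'):
        text = pq(tr).text()
        if text is not None and text.strip() != "":
            lines.append(text.replace(" ", ":"))
    return lines
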
Esempio n. 27
0
def process_news(content, news_key, url, news_posttime):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)

        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png")>=0:
                postraw = None
        except:
            postraw = None
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        divtags = d('div.tags> div.tag-item')
        # .strip() never returns None, so the original "is not None" filter
        # was a no-op; filter out empty strings instead
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags
                if pq(divtag)('a.tag').text().strip() != ""]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time, news_time, ":".join(tags),category)

        article = d('div.article-entry.text').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "" or post.find("techcrunch.opengraph.default.png")>=0:
        #     post = util.get_poster_from_news(dcontents)
        #
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # collection_news.insert(dnews)
            # mongo.close()
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)

        logger.info("Done")
Esempio n. 28
0
    def process_news(self, newsurl, content, download_crawler):
        if self.has_news_content(content):
            try:
                d = pq(html.fromstring(content.decode("utf-8")))
            except:
                d = pq(html.fromstring(content))

            key = newsurl.split("/")[-1].replace(".shtml","").replace(".html","")
            try:
                key_int = int(key)
            except:
                key_int = None

            news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
            if news_time is None:
                news_time = datetime.datetime.now()

            title = extract.extractTitle(content)

            contents = extract.extractContents(newsurl, content)

            tags = []
            try:
                articletags = d("meta[name='keywords']").attr("content")
                if articletags is not None:
                    for tag in articletags.split():
                        if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                            tags.append(tag)
            except:
                pass

            logger.info("News: %s, %s, %s", key, title, news_time)

            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # if collection_news.find_one({"link": newsurl}) is not None:
            #     mongo.close()
            #     return

            flag, domain = url_helper.get_domain(newsurl)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": newsurl,
                "createTime": datetime.datetime.now(),
                "source": self.SOURCE,
                "key": key,
                "key_int": key_int,
                "type": self.TYPE,
                "original_tags": tags,
                "processStatus": 0,
                # "companyId": None,
                "companyIds": [],
                "category": self.CATEGORY,
                "domain": domain,
                "categoryNames": []
            }
            dcontents = []
            rank = 1
            for c in contents:

                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    if download_crawler is None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": "",
                            "image_src": c["data"],
                        }
                    else:
                        # imgurl = parser_mysql_util.get_logo_id(c["data"], download_crawler, self.SOURCE, key, "news")
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler,
                                                                                    self.SOURCE,
                                                                                    key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue

                logger.info(c["data"])
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents

            brief = util.get_brief_from_news(dcontents)
            post = util.get_poster_from_news(dcontents)
            if download_crawler is None:
                dnews["post"] = post
            else:
                dnews["postId"] = post
            dnews["brief"] = brief

            # if news_time > datetime.datetime.now() or news_time < datetime.datetime.now() - datetime.timedelta(days=30):
            #     logger.info("Time: %s is not correct with current time", news_time)
            #     dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

            if news_time > datetime.datetime.now():
                logger.info("Time: %s is not correct with current time", news_time)
                dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
            if len(dnews["contents"])> 2:
                # mongo = db.connect_mongo()
                # collection_news = mongo.article.news
                # collection_news.insert(dnews)
                # mongo.close()
                nid = parser_mongo_util.save_mongo_news(dnews)
                logger.info("Done: %s", nid)
            logger.info("*************DONE*************")
Esempio n. 29
0
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)

    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
    # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

    if title is not None and len(contents) > 0:
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        pass
    return


def crawler_news(column, crawler, d_map, download_crawler):
    global Proxies
    retry = 0
    while True:
        if Proxies is None:
            Proxies = get_proxy(http_type='https')
        logger.info('---->retry:%d<----' % retry)
        try:
            newsurl = d_map['link']
            logger.info('crawl url:%s' % newsurl)
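
crawler_news above is truncated in the source listing, but its visible shape is a retry loop that refreshes the proxy on each failure. A hedged reconstruction of that shape only; requests and the get_proxy callback stand in for the original download helpers:

import requests


def fetch_with_retry(url, get_proxy, max_retry=3):
    proxies = None
    for retry in range(max_retry):
        if proxies is None:
            proxies = get_proxy(http_type='https')
        try:
            return requests.get(url, proxies=proxies, timeout=30).content
        except requests.RequestException:
            proxies = None  # force a fresh proxy on the next attempt
    return None
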
Esempio n. 30
0
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        key = newsurl.split("/")[-1]

        type = TYPE

        category = None

        title = d('div.post> div.post-title> h1.title').text().strip()

        # logger.info("title: %s", title)
        if title is None or title == "":
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # normalize full-width commas before splitting on ASCII commas
            for tag in articletags.replace(u"，", ",").split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # post = d("meta[property='og:image']").attr("content")

        brief = d("meta[name='description']").attr("content")

        news_time = None
        try:
            post_time = d(
                'div.post> div.post-title> div> span.postclock').text()
            logger.info("post-time: %s", post_time)
            # for span in spans:
            #     if d(span).text() is not None and d(span).text().find("日期") >= 0:
            #         post_time = d(span).text().replace("日期:","").strip()
            #         logger.info(post_time)
            #         try:
            #             news_time = datetime.datetime.strptime(post_time,"%Y年%m月%d日 %H:%M")
            #         except Exception, e:
            #             logger.info(e)
            #             pass
            #         break
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.post> div.post-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("转载请联系原出处") >= 0 or c["data"].find(
                    "网页转载须在文首") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mid = None
        if title is not None and len(contents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE*************%s",mid)
    return
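
When a page has no usable description or poster, the parsers above fall back to deriving both from the contents list (util.get_brief_from_news / util.get_posterId_from_news). A sketch of the brief fallback under the assumption that it simply truncates the first text block, as the 100-character brief slicing elsewhere in these examples suggests; the real helper may differ:

def brief_from_contents(dcontents, limit=100):
    for dc in dcontents:
        text = dc.get("content", "").strip()
        if text:
            # First non-empty text block, truncated like the
            # meta-description briefs above.
            return text[:limit]
    return None
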