Example #1
def crawler_rp(nurl, nctitle, ndate, sourceId, source):
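    # Crawl an announcement URL; if the page is (or links to) a PDF, store the
    # announcement in mongo.stock.announcement with dates shifted by -8h
    # (local time to UTC). Gives up after 8 failed crawl attempts.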
    retry = 0
    while True:
        result = espcralwer.crawl(nurl, agent=True)
        if result['get'] == 'success':
            if result["redirect_url"].find("pdf") >= 0:
                logger.info("we got pdf : %s ", result["redirect_url"])
                fileTime = extract.extracttime(ndate.split("+")[0])
                content = {
                    'stockExchangeId': 2 if source == 13401 else 3,
                    'source': source,
                    'stockSymbol': str(sourceId),
                    'title': nctitle,
                    'link': result["redirect_url"],
                    "date": fileTime - datetime.timedelta(hours=8),
                    'createTime': fileTime - datetime.timedelta(hours=8),
                }

                mongo = db.connect_mongo()
                collection = mongo.stock.announcement
                collection.insert(content)
                mongo.close()
            else:
                try:
                    d = pq(
                        html.fromstring(result["content"].decode(
                            'gbk', 'ignore')))
                    filelink = d('div.detail-header> h1> span> a').attr("href")
                    if filelink is not None and filelink.find("pdf") >= 0:
                        fileTime = extract.extracttime(ndate.split("+")[0])

                        content = {
                            'stockExchangeId': 2 if source == 13401 else 3,
                            'source': source,
                            'stockSymbol': str(sourceId),
                            'title': nctitle,
                            'link': filelink,
                            "date": fileTime - datetime.timedelta(hours=8),
                            'createTime':
                            fileTime - datetime.timedelta(hours=8),
                        }

                        mongo = db.connect_mongo()
                        collection = mongo.stock.announcement
                        collection.insert(content)
                        mongo.close()
                        # exit()
                except Exception as ex:
                    logger.exception(ex)
            break

        retry += 1
        if retry > 8: break
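
A pattern shared by most of these crawlers is the bounded retry loop around crawl(). A minimal sketch of that pattern in isolation (fetch_with_retry, crawler and max_retry are illustrative names, not from the code above):

def fetch_with_retry(crawler, url, max_retry=8):
    retry = 0
    while True:
        result = crawler.crawl(url, agent=True)
        if result['get'] == 'success':
            return result  # caller inspects result["content"] or result["redirect_url"]
        retry += 1
        if retry > max_retry:
            return None  # give up, as the loop above does after 8 failed attempts
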
Example #2
def process(content, sourceId, source, key):
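    # Walk a JSON announcement list and re-crawl any item whose title (checked
    # under three bracket prefixes) is not yet in mongo.stock.announcement.
    # Items dated after 2017-01-11 are treated as already collected and skipped.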
    j = json.loads(content)
    infos = j["result"]
    mongo = db.connect_mongo()

    collection = mongo.stock.announcement
    for info in infos:
        ntitle = info["title"]
        ndate = info["date"].replace("T", " ")
        fileTime = extract.extracttime(ndate)

        nurl = info["attachUrl"]
        cleantitle = "[临时公告]" + ntitle
        cleantitle1 = "[定期报告]" + ntitle
        cleantitle2 = "[临时报告]" + ntitle
        logger.info("%s-%s-%s", ntitle, cleantitle, ndate)
        if fileTime > datetime.datetime(2017, 1, 11):
            logger.info("******we have")
            continue
        item = collection.find_one(
            {"title": {
                '$in': [cleantitle1, cleantitle2, cleantitle]
            }})
        item1 = collection.find_one({"title": ntitle})
        if item is not None or item1 is not None:
            logger.info("******already exists")
        else:
            logger.info("******missing, get it")
            crawler_rp(nurl, cleantitle, ndate, sourceId, source)
Example #3
def get_link(crawler, concurrent_num, contentcrawler):
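    # Walk the first 5 pages of the lanxiongsports.com news feed, queue links
    # not yet in mongo.article.news onto the global URLS list, and hand them
    # to concurrent gevent workers. Stops once a later page yields nothing fresh.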
    for page in xrange(5):
        url = 'http://lanxiongsports.com/mservice/?c=news&format=json&page=%s' % (
            page + 1)
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                j = json.loads(result['content'])

                for item in j['items']:
                    if 'ad_code' in item: continue
                    key = item['id']
                    title = item['title']
                    post = item['logo']
                    brief = item['summary']
                    category = item['_category']['name']
                    date = item['created_at']

                    if not isinstance(date, datetime.datetime):
                        logger.info('%s not datetime', date)
                        date = extract.extracttime(date)

                    href = 'http://lanxiongsports.com/?c=posts&a=view&id=%s' % key
                    linkDict = {
                        "href": href,
                        "title": title,
                        "post": post,
                        "brief": brief,
                        "category": category,
                        "date": date,
                    }

                    mongo = db.connect_mongo()
                    collection_news = mongo.article.news

                    item = collection_news.find_one({
                        "link": href,
                        'title': title
                    })
                    if item is None:
                        # logger.info( 'not exists %s ,%s '%(href,title))
                        URLS.append(linkDict)
                    else:
                        logger.info('already exists %s , %s', href, title)
                    mongo.close()

                break

        if len(URLS) == 0 and page > 0:
            logger.info('page %s got no fresh news, quitting............',
                        page + 1)
            break

        threads = [
            gevent.spawn(process, contentcrawler)
            for i in xrange(concurrent_num)
        ]
        gevent.joinall(threads)
Example #4
def process(content):
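    # Parse a Shenzhen Stock Exchange (disclosure.szse.cn) announcement JSONP
    # payload via eval (the source is trusted here), insert records whose link
    # is not already stored, and return the number of new inserts.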
    cnt = 0
    res = content.replace('var szzbAffiches=', '')[:-2]
    # logger.info(res)
    infos = eval(res.decode("gbk").strip())
    # logger.info(contentnew)
    cnt = 0
    if len(infos) == 0:
        return 0
    for info in infos:
        # logger.info(info)
        # logger.info(type(info))
        if len(info) < 4:
            continue
        try:
            stockid = info[0]
            filelink = "http://disclosure.szse.cn/" + info[1]
            filetitle = info[2]
            fileTime = extract.extracttime(info[-1])

            content = {
                'stockExchangeId': 3,
                'source': 13402,
                'stockSymbol': str(stockid),
                'title': filetitle,
                'link': filelink,
                "date": fileTime - datetime.timedelta(hours=8),
                'createTime': datetime.datetime.now(),
            }

            # check mongo data if link is existed
            mongo = db.connect_mongo()

            collection = mongo.stock.announcement
            item = collection.find_one({"link": filelink})
            if item is None:
                collection.insert(content)
                cnt += 1
            else:
                logger.info("already exists file: %s", filelink)

            mongo.close()

            logger.info("Stock: %s, file: %s|%s|%s", stockid, filetitle,
                        fileTime, filelink)
        except Exception as e:
            logger.info(e)
            logger.info("cannot get info")
    return cnt
Example #5
def process(content, flag, type):
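    # Parse a news search-result page: "&nbsp;" is rewritten to the marker
    # "bamy" so the date can be split out of each result block. Links/titles
    # absent from mongo.article.news and news_more (or all of them when
    # flag == "all") are queued onto the global URLS list.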
    if content.find("result") >= 0:
        # logger.info(content)
        d = pq(html.fromstring(content.replace("&nbsp;", "bamy").decode("utf-8")))
        for a in d('div> div.result'):
            try:
                link = d(a)('h3> a').attr("href")
                title = "".join(d(a)('h3> a').text().split())
                # logger.info(link)
                if title is not None and title.strip() != "":
                    # logger.info("Link: %s is right news link %s", link, title)
                    # title = d(a)('h3> a').text()
                    if type == 'title':
                        ndate = d(a)('div.c-title-author').text().split("bamybamy")[1].replace("查看更多相关新闻>>", "").strip()
                    else:
                        ndate = d(a)('.c-author').text().split("bamybamy")[1].replace("查看更多相关新闻>>", "").strip()
                    newsdate = extract.extracttime(ndate)
                    newsdate = newsdate - datetime.timedelta(hours=8) if newsdate is not None else newsdate
                    # newsdate = datetime.datetime.strptime(ndate, "%Y年%m月%d日 %H:%M") - datetime.timedelta(hours=8)
                    # ndate = d(a)('div.c-title-author').text()
                    logger.info("Link: %s is right news link %s|%s|%s", link, title, ndate, type)
                    mongo = db.connect_mongo()
                    collection_news = mongo.article.news
                    item = collection_news.find_one({'$or': [{"link": link}, {'title': title}]})
                    collection_news_more = mongo.article.news_more
                    item2 = collection_news_more.find_one({'$or': [{"link": link}, {'title': title}]})
                    mongo.close()

                    if ((item is None and item2 is None) or flag == "all") and link not in URLS:
                        linkmap = {
                            "link": link,
                            "title": title,
                            "newsdate": newsdate
                        }
                        URLS.append(linkmap)
                    else:
                        logger.info('already exists %s', title)
                    #     if item is not None:
                    #         add_companyIds(item["link"], companyId)
                    #         add_newsdate(item["link"], newsdate)
                    #     elif item2 is not None:
                    #         add_companyIds(item2["link"], companyId)
                    #         add_newsdate(item2["link"], newsdate)
                else:
                    pass
            except Exception as e:
                logger.info(e)
                logger.info("cannot get link")
Example #6
def process(content):
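    # Parse a neeq.com.cn announcement JSONP payload, insert records whose
    # link is not already stored, and return the number of new inserts.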
    res = content.replace('null([', '')[:-2]
    j = json.loads(res)
    # logger.info(j)
    infos = j["listInfo"]["content"]
    cnt = 0
    if len(infos) == 0:
        return cnt
    mongo = db.connect_mongo()
    # collection = mongo.stock.neeq_announcement
    collection = mongo.stock.announcement
    for info in infos:
        try:
            stockid = info["companyCd"]
            stockName = info["companyName"]
            filelink = "http://www.neeq.com.cn" + info["destFilePath"]
            filetitle = info["disclosureTitle"]
            fileTime = extract.extracttime(str(info["upDate"]["time"]))
            logger.info("Stock: %s|%s, file: %s|%s|%s", stockid, stockName,
                        filetitle, filelink, fileTime)
            item = collection.find_one({"link": filelink})
            if item is None:
                item = {
                    # "source": SOURCE,
                    # "sourceId": int(stockid),
                    # "title": filetitle,
                    # "link": filelink,
                    # "date": fileTime- datetime.timedelta(hours=8),
                    # "createTime": datetime.datetime.now()
                    'stockExchangeId': 1,
                    'source': 13400,
                    'stockSymbol': str(stockid),
                    'title': filetitle,
                    'link': filelink,
                    "date": fileTime - datetime.timedelta(hours=8),
                    'createTime': datetime.datetime.now(),
                }
                collection.insert(item)
                cnt += 1

            logger.info("Stock: %s|%s, file: %s|%s|%s", stockid, stockName,
                        filetitle, filelink, fileTime)
        except Exception, e:
            logger.info(e)
            logger.info("cannot get info")
Example #7
def crawler_rp(nurl, nctitle, ndate, sourceId, source):
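    # Store a single PDF announcement link directly in mongo.stock.announcement,
    # shifting the parsed date by -8h (local time to UTC).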
    filelink = nurl
    if filelink is not None and filelink.find("pdf") >= 0:
        fileTime = extract.extracttime(ndate)

        content = {
            'stockExchangeId': 1,
            'source': source,
            'stockSymbol': str(sourceId),
            'title': nctitle,
            'link': filelink,
            "date": fileTime - datetime.timedelta(hours=8),
            'createTime': fileTime - datetime.timedelta(hours=8),
        }
        logger.info(
            json.dumps(content, ensure_ascii=False, cls=util.CJsonEncoder))
        mongo = db.connect_mongo()
        collection = mongo.stock.announcement
        collection.insert(content)
        mongo.close()
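
The recurring fileTime - datetime.timedelta(hours=8) shifts a naive local timestamp to UTC before storage, which assumes the parsed times are UTC+8 (e.g. Beijing time). Expressed on its own:

import datetime

LOCAL_UTC_OFFSET = datetime.timedelta(hours=8)  # assumed source timezone: UTC+8

def to_utc(naive_local_dt):
    # The examples store dates in UTC; parsed times are naive local UTC+8.
    return naive_local_dt - LOCAL_UTC_OFFSET
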
Example #8
def process_news(content, download_crawler):
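    # Extract key, title, poster image, tags, brief and publish time from a
    # WordPress-style JSON post, skipping titles already in mongo.article.news.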
    download_crawler = download.DownloadCrawler(use_proxy=False)

    category = None
    categoryNames = []

    key = content['id']
    type = TYPE
    title = content['title']

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return
    newspost = content.get('featured_image').get('source')
    (posturl, width,
     height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler,
                                                 SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    # logger.info(post)

    tags = []
    for tag in content['tags']:
        tags.append(tag['name'])
    brief = content['seo']['description']
    try:
        post_time = content['modified_gmt']
        news_time = None
        if post_time.find('T') >= 0:
            post_time = post_time.replace('T', ' ')
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
    except Exception as e:
        logger.info(e)
        news_time = datetime.datetime.now()
Example #9
def process_news(column, newsurl, content, newspost):
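    # Parse a chinaventure.com.cn news detail payload into a dnews document
    # (title, tags, category, contents with downloaded images) and save it via
    # parser_mongo_util.save_mongo_news, skipping titles already stored.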
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]

        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key

        type = TYPE

        category = None
        categoryNames = []
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")

        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103

        tags = []
        if content.has_key("keywordList") is True and len(
                content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag[
                        "keyword"] is not None and tag["keyword"].strip(
                        ) != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])

        title = content["news"]["title"].replace("&quot;", "\"")

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return

        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["news"]["introduction"]

        post_time = content["news"]["updateAt"]

        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # return
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     logger.info("***************************News existed!!!***********************")
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return
Example #10
def process_news(column, newsurl, content, newspost, download_crawler):
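    # Extract an article whose body and publish time are embedded in inline
    # JS: both are pulled out with regexes, HTML entities are unescaped, and
    # the result is saved via parser_mongo_util.save_mongo_news.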
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8","ignore")))

        key = newsurl.split("/")[-1].replace("i","")

        type = TYPE

        category = None
        title = d('head> title').text().strip()

        r = "content: '(.*?)',.*groupId"

        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)

        # exit()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)



        post = None

        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"

            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()
        # exit()
        # article = d('div.post> div.post-content').html()
        # contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        bb = b.replace('&lt;', "<").replace("&gt;",">").replace("&quot;","\"").replace("&#x3D;","=")
        logger.info(bb)

        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        # for c in b.replace("&lt;div&gt;&lt;p&gt;",'').replace("&lt;/p&gt;&lt;/div&gt;","").split('&lt;/p&gt;&lt;p&gt;'):
        #     logger.info(c)
        #     if c.find("转载务必署名来源")>=0 or c.find("&lt;/p&gt;&lt;/div&gt;")>=0 or c.find("&lt;div&gt;&lt;p&gt; ")>=0:
        #         continue
        #     if c.find("img") >= 0:
        #         c = re.sub(r'&lt;(.*)?img.*&quot;0&quot;&gt;',"",c)
        #         dc = {
        #             "rank": rank,
        #             "content": c,
        #             "image": "",
        #             "image_src": "",
        #         }
        #     else:
        #         dc = {
        #             "rank": rank,
        #             "content": c,
        #             "image": "",
        #             "image_src": "",
        #         }
        #     # else:
        #     #     if download_crawler is None:
        #     #         dc = {
        #     #             "rank": rank,
        #     #             "content": "",
        #     #             "image": "",
        #     #             "image_src": c,
        #     #         }
        #     #     else:
        #     #         (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c, download_crawler, SOURCE, key, "news")
        #     #         if imgurl is not None:
        #     #             dc = {
        #     #                 "rank": rank,
        #     #                 "content": "",
        #     #                 "image": str(imgurl),
        #     #                 "image_src": "",
        #     #                 "height": int(height),
        #     #                 "width": int(width)
        #     #             }
        #     #         else:
        #     #             continue
        #
        #     logger.info(c)
        #     dcontents.append(dc)
        #     rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mid = None
        if title is not None and len(dcontents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE*************%s",mid)
    return
Example #11
def process_news(column, newsurl, content, newspost, topic, download_crawler):
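    # Parse a JSON news payload (j["obj"]["current"]); titles containing
    # "融资" are treated as funding news (type 60001, category 60101) and the
    # publish time comes from the topic argument.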
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        j = json.loads(content)
        d = j["obj"]["current"]

        category = None
        categoryNames = []

        key = newsurl.split("?")[0].split("/")[-1]

        type = TYPE

        title = d["title"].strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d["tags"]
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        brief = d["synopsis"]

        try:
            post_time = topic

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        article = d["content"]
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": str(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("追究法律责任") >= 0 or \
                    c["data"].find("details") >= 0 or \
                    c["data"].find("转载") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Example #12
def process_news(column, newsurl, content, newspost, download_crawler):
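    # Parse a static news page (title, keywords/description meta tags, time
    # span) into a dnews document and save it, skipping existing titles.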
    if has_news_content(content):
        # logger.info(content)
        # logger.info(content.decode('ISO-8859-1').encode('utf-8'))
        d = pq(html.fromstring(content.decode('utf-8', "ignore")))
        # d = pq(html.fromstring(content))
        key = newsurl.split("/")[-1].replace(".shtml", "")

        type = TYPE

        if column["category"] == 60003:
            type = 60003
            category = 60107
        else:
            category = column["category"]

        title = d('div.content> div.main_c> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        post_time = d(
            'div.content> div.main_c> div.article_info> div.infos> span.time'
        ).text().replace(".", "-")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None: news_time = datetime.datetime.now()
        # else: news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")

        article = d('div.content> div.main_c> div#content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            # "sectors": [20]
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #13
def process_news(column, newsurl, content, newspost, download_crawler):
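    # Parse a single post page; type and category are inferred from breadcrumb
    # and title keywords (金融/融资/投资). Existing links are only logged,
    # not updated.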
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].replace('.html', '')

        # type = TYPE

        category = None

        title = d('.single-post-title').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title: %s already exists', title)
            return

        tags = []
        articletags = d(".mb-2 a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        postraw = None
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://luxe.com"+ post

        # brief = d(".intr").text()
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()
        brief = None

        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M:%S')

        news_time = d('.post-meta').text().split()[-1]
        news_time = extract.extracttime(news_time)

        # dt = datetime.date.today()
        today = datetime.datetime.now()
        if news_time is None or news_time > today:
            news_time = datetime.datetime.now()

        article = d('.post-body').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return

        flag, domain = url_helper.get_domain(newsurl)

        typeNames = d('.breadcrumb-item a').text()
        TYPE = 60001 if typeNames.find(u'金融') >= 0 or typeNames.find(
            u'融资') >= 0 else 60005
        if title.find(u'融资') >= 0 or title.find(u'投资') >= 0:
            TYPE = 60001
            category = 60101
        else:
            category = None

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief

        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})

            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")

    return
Example #14
def process_news(item, url, content):
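    # Parse an article page plus optional feed metadata (item); category and
    # type are derived from the column tags. Deduplicates by (source, key_int)
    # and by title across other sources before saving.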
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        titleraw = d('head> title').text().strip()
        temp = titleraw.split("|")
        title = temp[0].strip()
        # title = d('h1.article-title').text().strip()
        if item is None:
            news_time = extractArticlePublishedDate.extractArticlePublishedDate(
                url, content)
            if news_time is None:
                news_time = datetime.datetime.now()
            key = url.split("/")[-1].replace(".html", "")
        else:
            news_time = extract.extracttime(item["post_date"])
            if news_time is None:
                news_time = datetime.datetime.now()

            key = item["key"]

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({
                "source": SOURCE,
                "key_int": int(key)
        }) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            mongo.close()
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()

        #column = d('div.main-text> div.addd-fl-tag').text()
        # column = d('div.article-source> span.article-tag-top').text()
        if isinstance(item, dict):
            column = item["columns"]
        else:
            column = None
        brief = d('div.article-digest').text()
        if column is not None and column.strip() != "":
            tags = column.split()
        else:
            tags = []

        categoryNames = []
        if "课堂" in tags or "专栏" in tags:
            TYPE = 60003
            category = 60107
        else:
            TYPE = 60001
            if "融资汇" in tags:
                category = 60101
                categoryNames.append("融资")
            elif "早期项目" in tags:
                categoryNames.append("早期项目")
                if title.find("融资") >= 0:
                    category = 60101
                    categoryNames.append("融资")
                else:
                    category = 60102
            elif "A轮后" in tags and title.find("融资") >= 0:
                category = 60101
                categoryNames.append("融资")
            elif "大公司" in tags:
                category = 60105
                categoryNames.append("大公司")
            elif "投行" in tags:
                category = 60104
                categoryNames.append("投资人观点")
            else:
                category = None

        # tagsmore = d('div.article-tag> ul').text().split()
        # for a in tagsmore:
        #     if a not in tags:
        #         tags.append(a)
        if isinstance(item, dict):
            postraw = item["post"]
            # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                postraw, download_crawler, SOURCE, key, "news")
        else:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        #article_img = d('div.article> div.main-text> p> img').attr('src')

        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, TYPE,
                    category, ":".join(tags))
        article = d('div.article-main > div.main-text').html()
        #logger.info(article)
        contents = extract.extractContents(url, article, document=False)

        # dcontents = []
        # if article_img is not None:
        #     dc = {
        #         "rank": 1,
        #         "content": "",
        #         "image": "",
        #         "image_src": article_img,
        #     }
        #     dcontents.append(dc)
        #     logger.info(article_img)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
        #     mongo.close()
        #     return
        #     # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        #
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     mongo.close()
        #     return
        #     # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        # mongo.close()
        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if column is not None and c["data"].strip() == column.strip():
                continue
            if c["data"].find("default/images/theme/company_code.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(
                brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #15
def process_news(newsurl, content, newspost, download_crawler):
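    # Parse a WordPress-style article page; the publish date is rebuilt from a
    # "YYYY年M月D日" pattern, and the dnews document is logged as JSON before
    # being saved when a title and contents are present.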
    if has_news_content(content):
        logger.info('here.')

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None

        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        key = d('article').attr('id').strip().split('-')[-1]

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            post_time = d("header> div> span> time").text().strip()
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)

        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)

        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": Type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }

        dcontents = []
        rank = 1

        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
Example #16
def process_news(column, newsurl, content):
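    # Parse a post page (div#post_* selectors); column names map to type and
    # category, and content extraction stops at the reprint disclaimer line.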
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        category = None
        categoryNames = []
        if column.has_key("column") and column["column"] in [
                "guandian", "guancha"
        ]:
            type = 60003
            category = 60107
        else:
            type = TYPE

        if column.has_key("column") and column["column"] in ["touzirongzi"]:
            category = 60101
            categoryNames.append("融资")
        # else:
        #     category = None

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        title = d('div#post_content> div> div#post_title').text().strip()
        if title is None or title.strip() == "":
            title = d('div#post_content> div> h1#post_title').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        postraw = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        post_time = d('div#post_info> div> div#post_date').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div#post_description').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("不代表亿欧对观点赞同或支持") != -1 or c["data"] == "5元":
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Exemple #17
0
def process_news(column, newsurl, content, newspost, download_crawler):
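    # TMTPost-style variant: drops English-tagged articles, reclassifies the
    # "商业价值杂志" tag as type 60003 / category 60107, and takes the poster
    # image from the og:image meta tag.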
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None

        title = d('article> h1').text().strip()

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        if "English" in tags or "english" in tags:
            logger.info("Englis not needed, get out!")
            return

        if "商业价值杂志" in tags:
            type = 60003
            category = 60107
        # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        postraw = d("meta[property='og:image']").attr("content")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("article> p.post-abstract").text().strip().replace(
            '摘要: ', "")

        post_time = d('article> div.post-info> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article> div.inner').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg":
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
Exemple #18
0
def process_news(column, newsurl, content, newspost, download_crawler, sort):
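    # Variant keyed off a channel name: articles from a "投融资" (funding)
    # sort are stored as type 60001 and pinned to sectors [20]; tag, poster,
    # and brief extraction are disabled here (left empty / derived later).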
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        type = TYPE
        if sort.find("投融资") >= 0:
            type = 60001
        category = None

        title = d('div.m-sd-post-box> h1').text().strip()

        if title is None or title == "":
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.replace(",", ",").split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # #

        post = None
        brief = None

        try:
            post_time = d('span.time').text()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = None
        if news_time is None:
            # extracttime can return None; fall back to the current time
            news_time = datetime.datetime.now()

        article = d('div.g-post-content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), type, category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # if c["data"].find("◆END◆")>=0 or c["data"].find("…………………")>=0:
            #     break
            #
            # if c["data"].find("ACG 领域最具影响力的产业新媒体") >= 0 or c["data"].find("访问三文娱网站3wyu.com查看产业必读文章") >=0:
            #     continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
Exemple #19
0
def process_news(column, newsurl, content, newspost, download_crawler):
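    # Variant that also strips reprint/licensing notices ("转载请联系原出处",
    # "网页转载须在文首") from the extracted body before ranking blocks.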
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        key = newsurl.split("/")[-1]

        type = TYPE

        category = None

        title = d('div.post> div.post-title> h1.title').text()
        if title is None or title.strip() == "":
            return
        title = title.strip()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # post = d("meta[property='og:image']").attr("content")

        brief = d("meta[name='description']").attr("content")

        news_time = None
        try:
            post_time = d(
                'div.post> div.post-title> div> span.postclock').text()
            logger.info("post-time: %s", post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.post> div.post-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("转载请联系原出处") >= 0 or c["data"].find(
                    "网页转载须在文首") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is in the future; falling back to current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
Exemple #20
0
def process_news(column, newsurl, content, newspost, download_crawler):
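    # JSON-fed variant: content is already a parsed dict (id, title, keywords,
    # description, pubdate, content) rather than raw HTML, so the body is
    # re-parsed from the embedded HTML fragment with pyquery.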
    if has_news_content(content):
        key = content["id"]

        type = TYPE

        category = None

        title = content["title"]

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        try:
            tags = content["keywords"].split(",")
        except Exception:
            tags = []

        postraw = newspost
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["description"]

        post_time = content["pubdate"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["content"]).html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is in the future; falling back to current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Exemple #21
0
def process_news(column, newsurl, content, newspost):
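    # Charset-sniffing variant: pages served as GBK use different selectors
    # than UTF-8 ones (utfflag tracks which), and boilerplate lines listed in
    # Nocontents are dropped from the body.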
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # d = pq(html.fromstring(content.decode("utf-8","ignore")))
        if content.find("charset=GBK") == -1:
            d = pq(html.fromstring(content.decode("utf-8","ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
            utfflag = False

        key = newsurl.split("?")[0].split("/")[-1].replace(".shtml","")

        type = TYPE

        category = None
        categoryNames = []

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        if utfflag is True:
            title = d('article> div> h1').text().strip()
        else:
            title = d('div.titleH> h1').text().strip()
        logger.info("title: %s",title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        postraw = newspost
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        if utfflag is True:
            post_time = d('p.source> span.f-right').eq(0).text()
        else:
            post_time = d('div.titleH> p.zsp> span').eq(2).text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        if utfflag is True:
            article = d('div.post-text').html()
        else:
            article = d('div.contdiv').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("电商资讯第一入口") != -1:
                break
            if c["data"] in Nocontents:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is in the future; falling back to current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Exemple #22
0
def process_news(column, newsurl, content, newspost, download_crawler):
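    # Variant for pages with Chinese-formatted dates: 年/月 are rewritten to
    # dashes and 日 dropped before extract.extracttime parses the string.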
    if has_news_content(content):
        logger.info('processing %s', newsurl)

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('div.post-inner> h1').text()
        if title is None or title.strip() == "":
            return
        title = title.strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")

        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d("p.post-byline> time.published").text().strip()

            logger.info('time: %s', post_time)

            # normalize Chinese dates: 年/月 -> "-", drop 日
            p = re.compile(u'(年|月)')
            post_time = p.sub('-', post_time).replace('日', '')

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.entry-inner').html()

        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c['data'].find('文章相关引用及参考') >= 0 or c['data'].find(
                    '读者QQ群') >= 0:
                continue

            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
Exemple #23
0
def process_news(column, newsurl, content, newspost, download_crawler):
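    # Older variant: images go through parser_mysql_util.get_logo_id (no
    # width/height) and the document is inserted directly with
    # collection_news.insert instead of parser_mongo_util.save_mongo_news.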
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".shtml", "")

        type = TYPE

        category = None

        title = d('div.subject> h1').text().strip()

        tags = []

        post = newspost

        brief = d("meta[name='description']").attr("content")

        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
Exemple #24
0
def process_news(column, newsurl, content, newsposttime, download_crawler):
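    # Variant fed a pre-scraped post time (newsposttime) from the list page;
    # articles tagged "投资并购" are categorized as funding (60101) and every
    # story is pinned to sectors [10] with confidence 1.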
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip()

        type = TYPE

        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")

        # post = d('div#post_thumbnail> img').attr("src")
        post = None

        brief = d("meta[name='description']").attr("content")

        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")

        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief

        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is in the future; falling back to current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Exemple #25
0
def process_news(column, newsurl, content, newspost, download_crawler):
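    # iheima-style cleanup: avatar URLs, the title/brief echoed into the body,
    # the post time, and base64 inline images are skipped; copyright footers
    # ("未经授权,转载必究") terminate the body early.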
    if has_news_content(content):
        try:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        except Exception:
            d = pq(html.fromstring(content))

        key = newsurl.split("/")[-1].replace(".shtml","")

        type = TYPE

        category = None

        title = d('div.main-content> div.title').text().strip()

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        post = None

        brief = d("meta[name='description']").attr("content")

        post_time = d('div.author> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        try:
            article = d('div.main-content').html()
        except:
            article = content
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("http://app.iheima.com/?app=member&controller=avatar") != -1 or \
                            c["data"] ==title or c["data"] == brief or c["data"].find(post_time) != -1 or \
                            c["data"].find("data:image/png;base64") != -1:
                continue

            if c["data"].find("未经授权,转载必究") != -1 or c["data"].find("赞(...)") != -1:
                break

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is in the future; falling back to current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Exemple #26
0
def process_news(column, newsurl, content, newspost, download_crawler):
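    # Weiyangx-style variant: pages embedding video are skipped, bare MM-DD
    # dates are prefixed with the current year, promo/boilerplate lines are
    # filtered, and the site suffix "| 未央网" is stripped from paragraphs.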
    if has_news_content(content):
        logger.info('processing %s', newsurl)

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        if d.text().find('embed') >= 0:  # skip pages that embed video
            logger.info('not an article: %s', newsurl)
            return

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('h1').text()
        if title is None or title.strip() == "":
            return
        title = title.strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except Exception:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None

        try:
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find(
                'span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find(
                'p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2

            if re.match('\d{2}-\d{2}', post_time):  # bare MM-DD like "03-19"
                post_time = str(time.localtime()[0]) + '-' + post_time

            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article.wyt-post-content').html()

        contents = extract.extractContents(newsurl, article, document=True)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1

        if contents and contents[0]['type'] == 'img':  # drop a leading image block
            del contents[0]

        for c in contents:
            # logger.info("%s-%s",c["type"],c["data"])
            if c['type'] == 'text':

                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue

                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')

                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
Exemple #27
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
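    # QQ/Tencent-style (GBK) pages: the body lives under
    # div.Cnt-Main-Article-QQ, and fromgeek.com award banners are filtered
    # out of the content blocks.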
    if has_news_content(content):
        logger.info('processing %s', newsurl)
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("gbk", "ignore")))

        key = newsurl.split("/")[-1].replace(".htm", "")

        type = TYPE

        category = None
        categoryNames = []

        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        post = None
        brief = d("meta[name='description']").attr("content")

        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
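
Each of these parsers deduplicates on the exact article title before doing any further work: a single find_one against the article.news collection and an early return on a hit. A standalone sketch of that check with pymongo (the connection URI and the already_collected name are assumptions):

import pymongo

def already_collected(title, mongo_uri="mongodb://localhost:27017"):
    # True when an article with this exact title is already stored.
    client = pymongo.MongoClient(mongo_uri)
    try:
        return client.article.news.find_one({"title": title}) is not None
    finally:
        client.close()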
Exemple #28
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info("Processing news: %s", newsurl)
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        # d = pq(html.fromstring(content.decode("gbk","ignore")))
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".shtml", "")

        type = TYPE

        post = None

        if utfflag:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
            # logger.info("title: %s", title)
            if title is None or title.strip() == "":
                title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        # Keyword and brief extraction are disabled for this source; the
        # brief is derived from the article contents further below.
        tags = []
        brief = None

        try:
            if utfflag:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        if utfflag:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
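
Exemple #28 sniffs the page charset by searching the raw response for "gb2312", decoding with gbk (a superset of gb2312) when it is found and utf-8 otherwise, and keeps a flag because the two templates use different selectors. The same fallback isolated as a helper (decode_page is a hypothetical name):

from lxml import html
from pyquery import PyQuery as pq

def decode_page(content):
    # Returns (document, is_utf8); "ignore" drops undecodable bytes,
    # matching the tolerant decode used by the crawler.
    if content.find("gb2312") == -1:
        return pq(html.fromstring(content.decode("utf-8", "ignore"))), True
    return pq(html.fromstring(content.decode("gbk", "ignore"))), False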
Exemple #29
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info("Processing news: %s", newsurl)
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if (tag is not None and tag.strip() != ""
                        and tag not in tags and tag != title):
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None

        try:
            post_time = d("span.article-time").eq(0).text().strip()

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue

            if c["data"].find("8btctest1/custom/images") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
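
Exemple #29 reclassifies a story as funding news when its title contains "融资" (fundraising), switching type to 60001 and category to 60101; every other title keeps the source default and no category. That rule in isolation (the classify name and default_type parameter are assumptions):

def classify(title, default_type):
    # Funding stories get the dedicated type/category codes used above.
    if title.find(u"融资") >= 0:
        return 60001, 60101
    return default_type, None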
Exemple #30
0
def process_news(column, newsurl, content):
    if has_news_content(content):
        d = pq(content)
        title = d('div#post_title').text()
        url = newsurl
        key = url.split('/')[-1]
        post_time = d('div#post_date').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        logger.info("title:%s, date:%s", title, news_time)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": [],
            "processStatus": 0,
            # "companyId":companyId,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }

        dcontents = []
        description = d('div#post_description').text()
        if description:
            dc = {
                "rank": 1,
                "content": "亿欧快讯",
                "image": "",
                "image_src": "",
            }

            dcontents.append(dc)
            dc = {
                "rank": 2,
                "content": description.replace("【消息来源】", ""),
                "image": "",
                "image_src": "",
            }
            dcontents.append(dc)

            logger.info(description)

        dnews["contents"] = dcontents

        brief = util.get_brief_from_news(dcontents)

        post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # collection_news.insert(dnews)
    return
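
All of these parsers store the article body as a rank-ordered list of blocks, each either a text paragraph or an image reference, and the brief and poster fall back to values derived from that list. A minimal builder for the structure when no download crawler is involved (build_contents is a hypothetical name; blocks follow the shape produced by extract.extractContents above):

def build_contents(blocks):
    # blocks: iterable of {"type": ..., "data": ...} dicts; anything whose
    # type is not "text" is treated as an image URL, as in the parsers above.
    dcontents = []
    rank = 1
    for block in blocks:
        if block["type"] == "text":
            dc = {"rank": rank, "content": block["data"],
                  "image": "", "image_src": ""}
        else:
            dc = {"rank": rank, "content": "",
                  "image": "", "image_src": block["data"]}
        dcontents.append(dc)
        rank += 1
    return dcontents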