Code Example #1
File: vulcanpost_news.py Project: yujiye/Codes
def process_news(column, content, msg, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    title = msg['title']
    newsurl = msg['link']
    brief = msg['brief']
    newspost = msg['post']
    post_time = msg['newsDate']

    category = None
    categoryNames = []

    key = re.search(r'https://vulcanpost.com/(\d+)/.*', newsurl).group(1)
    type = TYPE

    (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None

    tags = []

    try:
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()
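
Nearly every example in this listing passes an image URL through parser_mysql_util.get_logo_id_new(url, download_crawler, SOURCE, key, kind), receives a (logo_id, width, height) tuple, and keeps the id as a string when it is not None. Below is a minimal sketch of that recurring pattern, not code from the project: the helper name fetch_poster_id is hypothetical, and the return layout is assumed from the calls shown in these examples.

# Hypothetical wrapper around the recurring get_logo_id_new call; assumes the
# project module parser_mysql_util is importable and that get_logo_id_new
# returns (logo_id, width, height) with logo_id set to None on failure.
import parser_mysql_util


def fetch_poster_id(url, download_crawler, source, key, kind="news"):
    if url is None or url.strip() == "":
        return None
    (logo_id, width, height) = parser_mysql_util.get_logo_id_new(
        url, download_crawler, source, key, kind)
    if logo_id is not None:
        return str(logo_id)
    return None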
Code Example #2
File: parser_db_util.py Project: yujiye/Codes
def save_member(r, SOURCE, download_crawler):
    member_key, name, weibo, introduction, education, work, location, role, pictureUrl, company_key, position = r
    conn = db.connect_torndb()
    source_member = conn.get(
        "select * from source_member where source=%s and sourceId=%s order by id limit 1",
        SOURCE, member_key)
    logo_id = None
    if source_member is None or source_member[
            "photo"] is None or source_member["photo"] == "":
        if pictureUrl is not None and pictureUrl != "":
            # image_value = download_crawler.get_image(pictureUrl)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='member_%s_%s.jpg' % (SOURCE, member_key))
            # logger.info("gridfs logo_id=%s" % logo_id)
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(pictureUrl,
                                                    download_crawler, SOURCE,
                                                    member_key, "member")
    else:
        logo_id = source_member["photo"]

    if source_member is None:
        sql = "insert source_member(name,photo,weibo,location,role,description,\
        education,work,source,sourceId,createTime,modifyTime,processStatus) \
        values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),0)"

        source_member_id = conn.insert(sql, name, logo_id, weibo, location,
                                       role, introduction, education, work,
                                       SOURCE, member_key)
    else:
        source_member_id = source_member["id"]
        sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
        education=%s,work=%s,modifyTime=now(),processStatus=0 where id=%s"

        conn.update(sql, name, logo_id, weibo, location, role, introduction,
                    education, work, source_member_id)

    if company_key is not None:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
        if source_company is not None:
            source_company_id = source_company["id"]
            source_company_member_rel = conn.get(
                "select * from source_company_member_rel where \
                    sourceCompanyId=%s and sourceMemberId=%s",
                source_company_id, source_member_id)
            if source_company_member_rel is None:
                type = name_helper.position_check(position)
                logger.info("position %s, type %s", position, type)
                conn.insert(
                    "insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                            position,type,createTime,modifyTime) \
                            values(%s,%s,%s,%s, now(),now())",
                    source_company_id, source_member_id, position, type)
    conn.close()
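
Most of the save_* helpers in parser_db_util.py (examples #2 through #7 and #10) follow the same select-then-insert-or-update flow: look the row up by (source, sourceId), insert a new row with createTime, modifyTime and processStatus when it is missing, otherwise update it in place and reset processStatus. The sketch below distills that flow under stated assumptions; upsert_by_source is a hypothetical name, the column handling is simplified, and the real helpers keep their hand-written SQL.

# Illustrative upsert flow shared by save_member, save_company_standard,
# save_investor_standard, etc.  conn is assumed to be the torndb-style
# connection returned by db.connect_torndb(); fields maps column -> value.
def upsert_by_source(conn, table, source, source_id, fields):
    row = conn.get(
        "select * from " + table + " where source=%s and sourceId=%s order by id limit 1",
        source, source_id)
    values = list(fields.values())
    if row is None:
        cols = ",".join(fields.keys())
        marks = ",".join(["%s"] * len(fields))
        sql = ("insert " + table + "(" + cols +
               ",source,sourceId,createTime,modifyTime,processStatus) "
               "values(" + marks + ",%s,%s,now(),now(),0)")
        return conn.insert(sql, *(values + [source, source_id]))
    sets = ",".join([k + "=%s" for k in fields.keys()])
    sql = ("update " + table + " set " + sets +
           ",modifyTime=now(),processStatus=0 where id=%s")
    conn.update(sql, *(values + [row["id"]]))
    return row["id"]

A caller in the style of these examples might write upsert_by_source(conn, "source_investor", source, sourceId, {"name": name, "logo": logo_id}); the concrete helpers above stay explicit instead, so each table keeps its full column list.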
Code Example #3
File: parser_db_util.py Project: yujiye/Codes
def save_company_standard(source_company, download_crawler):
    conn = db.connect_torndb()
    s = source_company
    result = conn.get(
        "select * from source_company where source=%s and sourceId=%s order by id limit 1",
        s["source"], str(s["sourceId"]))

    logo_id = None
    if result is None or result["logo"] is None or result["logo"] == "":
        log_url = s["logo"]
        if log_url is not None and len(log_url.strip()) > 0:
            # logger.info(log_url)
            # image_value = download_crawler.get_image(log_url)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (s["source"], s["sourceId"]))
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(log_url, download_crawler,
                                                    s["source"], s["sourceId"],
                                                    "company")
    else:
        logo_id = result["logo"]
    logger.info("gridfs logo_id=%s" % logo_id)

    s["logo"] = logo_id

    if result is not None:
        source_company_id = result["id"]
        s["id"] = source_company_id
        update_source_company(s)
    else:
        sql = "insert source_company(name,fullName,description,brief,round, \
              productDesc, modelDesc, operationDesc, teamDesc, marketDesc, compititorDesc, advantageDesc, planDesc, \
              roundDesc,companyStatus,fundingType,locationId, address, \
              phone, establishDate, logo,source,sourceId, \
              createTime,modifyTime, \
              field,subField,tags, headCountMin, headCountMax,processStatus) \
              values \
              (%s,%s,%s,%s,%s, \
              %s,%s,%s,%s,%s,%s,%s,%s, \
              %s,%s,%s,%s,%s, \
              %s,%s,%s, %s, %s, \
              now(),now(), \
              %s,%s,%s, %s, %s,0)"

        source_company_id = conn.insert(
            sql, s["name"],
            s["fullName"], s["description"], s["brief"], s["round"],
            s.get("productDesc"), s.get("modelDesc"), s.get("operationDesc"),
            s.get("teamDesc"), s.get("marketDesc"), s.get("compititorDesc"),
            s.get("advantageDesc"), s.get("planDesc"), s["roundDesc"],
            s["companyStatus"], s["fundingType"], s["locationId"],
            s["address"], s["phone"], s["establishDate"], s["logo"],
            s["source"], s["sourceId"], s["field"], s["subField"], s["tags"],
            s["headCountMin"], s["headCountMax"])
    conn.close()
    return source_company_id
Code Example #4
File: parser_db_util.py Project: yujiye/Codes
def save_blockchain_standard_feixiaohao(source_feixiaohao, download_crawler):
    s = source_feixiaohao

    logo_url = s["logo"]
    conn = db.connect_torndb()

    source = None
    if s["name"] is not None and s["name"].strip() != "":
        source = conn.get(
            "select * from digital_token where symbol=%s and name=%s",
            s["symbol"], s["name"])

    elif s["enname"] is not None and s["enname"].strip() != "":
        source = conn.get(
            "select * from digital_token where symbol=%s and enname=%s",
            s["symbol"], s["enname"])

    else:
        source = conn.get("select * from digital_token where symbol=%s",
                          s["symbol"])

    logo_id = None
    if source is None or source["logo"] is None or source["logo"] == "":
        if logo_url is not None and len(logo_url.strip()) > 0:
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(logo_url, download_crawler,
                                                    13511, s["symbol"],
                                                    "blockchain")
    else:
        logo_id = source["logo"]

    if source is None:
        sql = "insert digital_token(" \
              "companyId,symbol,name,enname,publishDate," \
              "websites,browsers,description,whitepaper,logo,createTime," \
              "modifyTime)" \
              " values" \
              "(%s,%s,%s,%s,%s," \
              "%s,%s,%s,%s,%s, now()," \
              "now())"
        source_d_id = conn.insert(sql, s["companyId"], s["symbol"], s["name"],
                                  s["enname"], s["publishDate"], s["websites"],
                                  s["browsers"], s["description"],
                                  s["whitepaper"], logo_id)

    else:
        source_d_id = source["id"]
        sql = "update digital_token set name=%s,enname=%s,publishDate=%s," \
              "websites=%s,browsers=%s,description=%s,whitepaper=%s,logo=%s,modifyTime=now() where id=%s"
        conn.update(sql, s["name"], s["enname"], s["publishDate"],
                    s["websites"], s["browsers"], s["description"],
                    s["whitepaper"], logo_id, source_d_id)

    conn.close()
    return source_d_id
Code Example #5
File: parser_db_util.py Project: yujiye/Codes
def save_company(r, SOURCE, download_crawler):
    company_key = r["sourceId"]
    conn = db.connect_torndb()

    logo_id = None
    source_company = conn.get(
        "select * from source_company where source=%s and sourceId=%s", SOURCE,
        str(company_key))
    if source_company is None or source_company[
            "logo"] is None or source_company["logo"] == "":
        log_url = r["logo"]
        if log_url is not None and len(log_url.strip()) > 0:
            logger.info(log_url)
            # image_value = download_crawler.get_image(log_url)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (SOURCE, company_key))
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(log_url, download_crawler,
                                                    SOURCE, company_key,
                                                    "company")
    else:
        logo_id = source_company["logo"]
    logger.info("gridfs logo_id=%s" % logo_id)

    if source_company is None:
        source_company_id = conn.insert(
            "insert source_company(name,fullName,description,brief,\
                    round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\
                    source,sourceId,createTime,modifyTime,\
                    field,subField,tags,type,processStatus) \
                    values(%s,%s,%s,%s,\
                    %s,%s,%s,%s,%s,%s,%s,\
                    %s,%s,now(),now(),\
                    %s,%s,%s,%s,0)", r["productName"], r["fullName"],
            r["description"], r["brief"], r["round"], r["roundDesc"],
            r["companyStatus"], r["fundingType"], r["locationId"],
            r["establishDate"], logo_id, SOURCE, company_key, r["field"],
            r["subField"], r["tags"], r["type"])
    else:
        source_company_id = source_company["id"]
        conn.update(
            "update source_company set \
                    name=%s,fullName=%s,description=%s, brief=%s, \
                    round=%s,roundDesc=%s,companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \
                    field=%s,subField=%s,tags=%s,type=%s, \
                    modifyTime=now(),processStatus=0,active=null \
                    where id=%s", r["productName"], r["fullName"],
            r["description"], r["brief"], r["round"], r["roundDesc"],
            r["companyStatus"], r["fundingType"], r["locationId"],
            r["establishDate"], logo_id, r["field"], r["subField"], r["tags"],
            r["type"], source_company_id)
    conn.close()

    return source_company_id
Code Example #6
File: parser_db_util.py Project: yujiye/Codes
def save_investor_standard_new(source_investor, download_crawler):
    s = source_investor
    source = s["source"]
    sourceId = s["sourceId"]
    logo_url = s["logo"]
    conn = db.connect_torndb()
    source_investor = conn.get(
        "select * from source_investor where source=%s and sourceId=%s",
        source, sourceId)

    logo_id = None
    if source_investor is None or source_investor[
            "logo"] is None or source_investor["logo"] == "":
        if logo_url is not None and len(logo_url.strip()) > 0:
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(logo_url, download_crawler,
                                                    source, sourceId,
                                                    "investor")
    else:
        logo_id = source_investor["logo"]

    if source_investor is None:
        sql = "insert source_investor(" \
              "name,website,description,logo,stage," \
              "field,type, source,sourceId,createTime," \
              "modifyTime,processStatus,wechatId,weibo,enName,fullName,enFullName,establishDate)" \
              " values" \
              "(%s,%s,%s,%s,%s," \
              "%s,%s,%s,%s,now()," \
              "now(),0,%s,%s,%s,%s,%s,%s)"
        source_investor_id = conn.insert(sql, s["name"], s["website"],
                                         s["description"], logo_id,
                                         s.get("stage"), s.get("field"),
                                         s.get("type"), source, sourceId,
                                         s.get("wechatId"), s.get("weibo"),
                                         s.get("enName"), s.get("fullName"),
                                         s.get("enFullName"),
                                         s.get("establishDate"))
    else:
        source_investor_id = source_investor["id"]
        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s," \
              "field=%s,type=%s,wechatId=%s,weibo=%s,enName=%s,fullName=%s," \
              "enFullName=%s,establishDate=%s, modifyTime=now(),processStatus=0 where id=%s"
        conn.update(sql, s["name"], s["website"], s["description"], logo_id,
                    s.get("stage"), s.get("field"), s.get("type"),
                    s.get("wechatId"), s.get("weibo"), s.get("enName"),
                    s.get("fullName"), s.get("enFullName"),
                    s.get("establishDate"), source_investor_id)

    conn.close()
    return source_investor_id
Code Example #7
File: parser_db_util.py Project: yujiye/Codes
def save_investor_standard(source_investor, download_crawler):
    s = source_investor
    source = s["source"]
    sourceId = s["sourceId"]
    logo_url = s["logo_url"]
    conn = db.connect_torndb()
    source_investor = conn.get(
        "select * from source_investor where source=%s and sourceId=%s",
        source, str(sourceId))

    logo_id = None
    if source_investor is None or source_investor[
            "logo"] is None or source_investor["logo"] == "":
        if logo_url is not None and len(logo_url.strip()) > 0:
            # image_value = download_crawler.get_image(logo_url)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='investor_%s_%s.jpg' % (source, sourceId))
            #     logger.info("gridfs logo_id=%s" % logo_id)
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(logo_url, download_crawler,
                                                    source, sourceId,
                                                    "investor")
    else:
        logo_id = source_investor["logo"]
        #logger.info("logo_id=%s" % logo_id)

    if source_investor is None:
        sql = "insert source_investor(" \
              "name,website,description,logo,stage," \
              "field,type, source,sourceId,createTime," \
              "modifyTime,processStatus)" \
              " values" \
              "(%s,%s,%s,%s,%s," \
              "%s,%s,%s,%s,now()," \
              "now(),0)"
        source_investor_id = conn.insert(sql, s["name"], s["website"],
                                         s["description"], logo_id, s["stage"],
                                         s["field"], s["type"], source,
                                         sourceId)
    else:
        source_investor_id = source_investor["id"]
        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
        field=%s,type=%s,modifyTime=now(),processStatus=0 where id=%s"

        conn.update(sql, s["name"], s["website"], s["description"], logo_id,
                    s["stage"], s["field"], s["type"], source_investor_id)

    conn.close()
    return source_investor_id
Code Example #8
File: parser_db_util.py Project: yujiye/Codes
def save_investor_member_standard(source_investor_id, members,
                                  download_crawler):
    conn = db.connect_torndb()
    conn.execute(
        "delete from source_investor_member where sourceInvestorId=%s",
        source_investor_id)
    for s in members:
        (logo_id, w,
         h) = parser_mysql_util.get_logo_id_new(s["logo"], download_crawler,
                                                s["source"], s["sourceId"],
                                                "member")
        sql = "insert source_investor_member(sourceInvestorId, investorMemberId,source,sourceId,name,logo, position, description, createTime,modifyTime) \
                      values(%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"

        conn.insert(sql, source_investor_id, None, s["source"], s["sourceId"],
                    s["name"], logo_id, s["position"], s["description"])
    conn.close()
Code Example #9
File: techinasia_news.py Project: yujiye/Codes
def process_news(content, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)

    category = None
    categoryNames = []

    key = content['id']
    type = TYPE
    title = content['title']

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return
    newspost = content.get('featured_image').get('source')
    (posturl, width,
     height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler,
                                                 SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    # logger.info(post)

    tags = []
    for tag in content['tags']:
        tags.append(tag['name'])
    brief = content['seo']['description']
    try:
        post_time = content['modified_gmt']
        news_time = None
        if post_time.find('T') >= 0:
            post_time = post_time.replace('T', ' ')
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()
Code Example #10
File: parser_db_util.py Project: yujiye/Codes
def save_investfirm(r, SOURCE, download_crawler):
    investor_key, investor_name, logo, website, stageStr, fieldsStr, desc = r
    conn = db.connect_torndb()
    source_investor = conn.get(
        "select * from source_investor where source=%s and sourceId=%s",
        SOURCE, str(investor_key))
    #logger.info(source_investor["logo"])
    logo_id = None
    if source_investor == None or source_investor[
            "logo"] is None or source_investor["logo"] == "":
        if logo is not None and logo != "":
            # image_value = download_crawler.get_image(logo)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='investor_%s_%s.jpg' % (SOURCE, investor_key))
            #     logger.info("gridfs logo_id=%s" % logo_id)
            (logo_id, w,
             h) = parser_mysql_util.get_logo_id_new(logo, download_crawler,
                                                    SOURCE, investor_key,
                                                    "investor")
    else:
        logo_id = source_investor["logo"]

    if source_investor is None:
        sql = "insert source_investor(name,website,description,logo,stage,field,type, \
        source,sourceId,createTime,modifyTime,processStatus) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),0)"

        source_investor_id = conn.insert(sql, investor_name, website, desc,
                                         logo_id, stageStr, fieldsStr, 10020,
                                         SOURCE, investor_key)
    else:
        source_investor_id = source_investor["id"]
        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
        field=%s,type=%s,modifyTime=now(),processStatus=0 where id=%s"

        conn.update(sql, investor_name, website, desc, logo_id, stageStr,
                    fieldsStr, 10020, source_investor_id)

    conn.close()
Code Example #11
File: scmp_news.py Project: yujiye/Codes
        dc = {
            "rank": rank,
            "content": c["data"],
            "image": "",
            "image_src": "",
        }
    else:
        if download_crawler is None:
            dc = {
                "rank": rank,
                "content": "",
                "image": "",
                "image_src": c["data"],
            }
        else:
            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                c["data"], download_crawler, SOURCE, key, "news")
            if imgurl is not None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
            else:
                continue
    # logger.info(c["data"])
    dcontents.append(dc)
    rank += 1
dnews["contents"] = dcontents
Code Example #12
def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()

        if column is not None:
            tags = column.split()
        else:
            tags = []

        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None

        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())

        postraw = d("link[rel='image_src']").attr("href")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        # if collection_news.find_one({"link": url}) is not None:
        #     return
        #     # collection_news.delete_one({"link": url})
        #
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []

        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"].replace("?imageView2/2/w/750/q/90", ""),
                         download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Code Example #13
File: cnstock_news.py Project: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
    if 1:
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".htm", "")

        title = d('h1.title').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        brief = None

        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')

        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()

        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)

        dnews['type'] = TYPE

        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
    return
Code Example #14
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))

        key = newsurl.split("/")[-1].replace(".htm", "")

        type = TYPE

        category = None
        categoryNames = []

        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        post = None
        brief = d("meta[name='description']").attr("content")

        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Code Example #15
def process_news(item, url, content):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))

        title = d(
            'div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            news_time = None

        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        if column is not None:
            tags = column.split()
        else:
            tags = []

        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time, news_time,
                    brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        #logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})
        #
        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_posterId_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Code Example #16
File: ne_yimin_news.py Project: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".html", "").replace(
            'detail_', '')

        type = TYPE

        category = None

        title = d('div.left.zb-n> h1').text().strip()

        tags = []

        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # brief = d("meta[name='description']").attr("content").replace(u'一鸣网——让发生的发声|智慧共享新媒体平台|上海TMT媒体开创者、一鸣网ymtmt.com','')
        brief = d('div.left.zb-n> p.gy').text().strip()
        news_time = datetime.datetime.now()

        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        start = False
        for c in contents:
            if start is False and c["data"].find(
                    brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue

            if c["data"].find("-END-") >= 0:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    else:
        logger.info('has no news content %s', newsurl)
    return
Code Example #17
File: chuangyebang_news.py Project: yujiye/Codes
def process_news(content, news_key, url):
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon, post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time, news_time, temp,  category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        # # logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})

        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})

        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post

        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Code Example #18
File: forbes_news.py Project: yujiye/Codes
def process_news(column, j_content, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = j_content['id']
        type = TYPE
        title = j_content['title']

        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']

        try:
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
コード例 #19
0
ファイル: chinav_news.py プロジェクト: yujiye/Codes
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]

        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key

        type = TYPE

        category = None
        categoryNames = []
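        # map ChinaVenture channel/tag metadata to internal categories: channel 52 -> 60101 (融资/funding), tag 人物 -> 60103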
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")

        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103

        tags = []
        if content.has_key("keywordList") is True and len(
                content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag[
                        "keyword"] is not None and tag["keyword"].strip(
                        ) != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])

        title = content["news"]["title"].replace(""", "\"")
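        # skip articles whose title already exists in MongoDB to avoid re-inserting duplicates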

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return

        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["news"]["introduction"]

        post_time = content["news"]["updateAt"]

        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # return
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     logger.info("***************************News existed!!!***********************")
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
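            # skip blocks that reference img.mp.itc.cn (presumably unwanted hotlinked third-party images)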
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return
コード例 #20
0
def process_news(column, newsurl, content, newsposttime, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip()

        type = TYPE

        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")

        # post = d('div#post_thumbnail> img').attr("src")
        post = None

        brief = d("meta[name='description']").attr("content")

        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
                # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")

        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief

        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
コード例 #21
0
ファイル: welian_activity.py プロジェクト: yujiye/Codes
def process(content, citykey, crawler):
    cnt = 0
    if has_content(content):
        DT = datetime.date.today()
        TODAY = datetime.datetime(DT.year, DT.month, DT.day)
        #logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8")))
        lis =  d('div.wrap> div> div> ul.ativities> li.item')
        for li in lis:
            c = pq(li)
            # pull the activity link and key first so they are available when downloading the poster image;
            # download_crawler is assumed to be a module-level DownloadCrawler instance in the original file
            title = c('h3.title> a').text()
            link = c('h3.title> a').attr("href")
            if link.find("http") == -1:
                continue
            key = link.split("/")[-1]
            key_int = int(key)
            poster = None
            img = c('a> img').attr("src")
            if img is not None:
                img = img.strip().replace("|130w", "")
                # posturl = parser_mysql_util.get_logo_id(img, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(img, download_crawler, SOURCE, key,
                                                                             "news")
                if posturl is not None:
                    poster = str(posturl)
            location = c('div.intro> div.address').text()
            sponors = c('div.intro> div.sponors> span').text().replace(","," ").replace(","," ").split()
            spans = c('div.intro> div.time> span')
            if len(spans) == 3:
                date = c('div.intro> div.time> span').eq(0).text()
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date+" "+times[0]
                endTime = date+" "+times[1]
            elif len(spans) == 5:
                date = c('div.intro> div.time> span').eq(0).text()
                year = date.split("-")[0]
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date+" "+times[0]
                endTime = year+"-"+times[1]+" "+c('div.intro> div.time> span').eq(4).text()
            else:
                continue
            try:
                beginDate = datetime.datetime.strptime(beginTime, "%Y-%m-%d %H:%M")
                endDate = datetime.datetime.strptime(endTime, "%Y-%m-%d %H:%M")
            except:
                beginDate = None

            if beginDate is None or beginDate < TODAY or key_int is None:
                # Not save active activity
                continue

            result = crawler.crawl(link)
            while True:
                if result['get'] == 'success':
                    break
                else:
                    result = crawler.crawl(link)
            if has_content(result['content']):
                contents = extract.extractContents(link, result['content'])
                flag, domain = url_helper.get_domain(link)
                dact = {
                    "beginDate": beginDate - datetime.timedelta(hours=8),
                    "endDate": endDate - datetime.timedelta(hours=8),
                    "date": beginDate - datetime.timedelta(hours=8),
                    "title": title,
                    "link": link,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": key_int,
                    "type": TYPE,
                    "original_tags": [],
                    "processStatus": 0,
                    "companyIds": [],
                    "location": location,
                    "city": citymap[citykey],
                    "sponors": sponors,
                    "post": poster,
                    "domain": domain,
                    "categoryNames": []
                }
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        if c["data"].find("我要报名") >= 0:
                            logger.info("************************over")
                            break
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        # dc = {
                        #     "rank": rank,
                        #     "content": "",
                        #     "image": "",
                        #     "image_src": c["data"],
                        # }
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"],download_crawler, SOURCE,
                                                                                    key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue
                    dcontents.append(dc)
                    rank += 1
                dact["contents"] = dcontents
                value = activity_simhash.get_simhash_value(dcontents)
                dact["simhashValue"] = value

                record = collection_news.find_one({"source": SOURCE, "key_int": key_int})
                if record is not None:
                    city = record["city"]
                    if record["beginDate"] == dact["beginDate"] and record["endDate"] == dact["endDate"] and record["title"] == dact["title"] and record["city"] == citymap[citykey] and record["location"] == dact["location"]:
                        logger.info("%s activity already existed", title)
                        cnt += 1
                        continue
                    else:
                        collection_news.delete_one({"source": SOURCE, "key_int": key_int})
                        if city != citymap[citykey]:
                            logger.info("%s has two city : %s and %s with location %s, something is wrong", title, city, citymap[citykey], location)
                            cnt += 1
                            continue

                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title, beginDate, endDate, ":".join(sponors),location, link, img)
                else:
                    if activity_simhash.check_same_act(dact) is True:
                        pass
                    else:
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title, beginDate, endDate, ":".join(sponors), location, link, img)
                cnt += 1
                logger.info("************Done***************")
    logger.info("*******%s activities has been checked or recorded", cnt)
    return cnt
コード例 #22
0
ファイル: weiyang_news.py プロジェクト: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('here')

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        if d.text().find('embed') >= 0:  # skip video-only articles
            logger.info('not article:%s' % newsurl)
            return

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('h1').text().strip()

        if title is None or title == "":
            return

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:

            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find(
                'span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find(
                'p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2

            if re.match('\d{2}-\d{2}', post_time):  # dates like "03-19" omit the year; prepend the current year
                post_time = str(time.localtime()[0]) + '-' + post_time

            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article.wyt-post-content').html()

        contents = extract.extractContents(newsurl, article, document=True)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1

        if contents and contents[0]['type'] == 'img':
            # drop a leading image block (likely the cover already captured as the poster)
            del contents[0]

        for c in contents:
            # logger.info("%s-%s",c["type"],c["data"])
            if c['type'] == 'text':
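                # drop boilerplate paragraphs (bare numbers, favorite prompts, reading-time notes, copyright warnings, WeChat plugs)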

                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue

                # if c['data'].find('译者') >= 0:
                #     c['data'] = c['data'].split(' ')[0]
                #
                # if c['data'].find('来源') >= 0:
                #     c['data'] = c['data'].split('|')[0]

                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')

                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
コード例 #23
0
ファイル: vcbeat_news.py プロジェクト: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
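    # the has_news_content guard is commented out in this example, so every fetched page is processed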
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        # type = TYPE

        category = None

        title = d('.article_title p').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return

        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com"+ post

        brief = None
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()

        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        # dt = datetime.date.today()
        # today = datetime.datetime.now()
        # if news_time is None or news_time > today:
        #     news_time = datetime.datetime.now()

        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief

        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})

            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")

    return
コード例 #24
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        # d = pq(html.fromstring(content.decode("gbk","ignore")))
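        # the site serves pages either gb2312- or utf-8-encoded; check the charset declaration to pick the decoder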
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".shtml", "")

        type = TYPE

        post = None

        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
            # logger.info("title: %s", title)
            if title is None or title.strip() == "":
                title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        # try:
        #     brief = d('div.daodu> p').text().strip().replace("【数据猿导读】","")
        # except:
        #     brief = None
        brief = None

        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
コード例 #25
0
ファイル: baijing_news.py プロジェクト: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        type = TYPE
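        # articles from the 投融资 (investment & financing) section are given the funding article type (60001)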
        if sort.find("投融资") >= 0:
            type = 60001
        category = None

        title = d('div.mod-head> h1').text().strip()

        if title is None or title == "":
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.replace(",", ",").split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # #

        # newspost1 = d('div.article-main> div> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # post = d("meta[property='og:image']").attr("content")
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            # if post_time == datetime.date.strftime(datetime.date.today(),'%Y-%m-%d'):
            #     news_time = datetime.datetime.now()
            # else:
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except:
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), type, category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # if c["data"].find("◆END◆")>=0 or c["data"].find("…………………")>=0:
            #     break
            #
            # if c["data"].find("ACG 领域最具影响力的产业新媒体") >= 0 or c["data"].find("访问三文娱网站3wyu.com查看产业必读文章") >=0:
            #     continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        if title is not None and len(contents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE************* %s", mid)
    return
コード例 #26
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-2].replace(".html", "")

        type = TYPE

        title = d('h1.single-title').text().strip()

        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        # try:
        #    post_time = topic
        #
        #    logger.info(post_time)
        #    news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        #    logger.info("news-time: %s", news_time)
        # except Exception, e:
        #     logger.info(e)
        news_time = datetime.datetime.now()

        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
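                # subscription-only DealStreetAsia articles are flagged with a negative processStatus, presumably so they are not published downstream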
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
            pass
    return
コード例 #27
0
ファイル: wuxiapptec_news.py プロジェクト: yujiye/Codes
def process_news(newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('here.')

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None

        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        key = d('article').attr('id').strip().split('-')[-1]

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
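            # dates appear as e.g. 2018年3月5日; normalize to zero-padded YYYY-MM-DD before parsing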
            post_time = d("header> div> span> time").text().strip()
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)

        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)

        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": Type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }

        dcontents = []
        rank = 1

        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
コード例 #28
0
ファイル: toutiao4_news.py プロジェクト: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8","ignore")))

        key = newsurl.split("/")[-1].replace("i","")

        type = TYPE

        category = None
        title = d('head> title').text().strip()

        r = "content: '(.*?)',.*groupId"

        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)

        # exit()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)



        post = None

        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"

            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()
        # exit()
        # article = d('div.post> div.post-content').html()
        # contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        bb = b.replace('&lt;', "<").replace("&gt;",">").replace("&quot;","\"").replace("&#x3D;","=")
        logger.info(bb)

        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
コード例 #29
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info("processing %s", newsurl)
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('div.da-title> h2').text().strip()
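        # Titles containing "融资" ("fundraising") are presumably funding news: type 60001, category 60101.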
        if title.find("融资") >= 0:
            type = 60001
            category = 60101

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d("span.article-time").eq(0).text().strip()

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
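        # Dates are stored shifted back 8 hours, presumably converting local UTC+8 time to UTC.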
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
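            # Skip boilerplate blocks ("btm地址" address lines, "版权声明" copyright notices) and site template images.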
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue

            if c["data"].find("8btctest1/custom/images") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mongo.close()
        if title is not None and len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
コード例 #30
0
ファイル: techcrunch_news.py プロジェクト: yujiye/Codes
def process_news(content, news_key, url, news_posttime):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
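        # sailthru.date is assumed to follow "%Y-%m-%d %H:%M:%S"; the +15 hour shift presumably
        # converts the source timezone to local (UTC+8) time.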
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)

        key = news_key
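        # Use og:image as the poster, ignoring TechCrunch's default placeholder image.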
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png")>=0:
                postraw = None
        except:
            postraw = None
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        divtags = d('div.tags> div.tag-item')
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags if pq(divtag)('a.tag').text().strip() != ""]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time, news_time, ":".join(tags),category)

        article = d('div.article-entry.text').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
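        # De-duplicate: skip if this key was already crawled from SOURCE, or if the same
        # title already exists from another source.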
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "" or post.find("techcrunch.opengraph.default.png")>=0:
        #     post = util.get_poster_from_news(dcontents)
        #
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)

        logger.info("Done")