Example #1
def run():
    global raw_urls
    while True:
        if len(raw_urls) == 0:
            return
        url = raw_urls.pop(0)
        item = collection.find_one({"url": url})
        if item is not None:
            continue

        flag, domain = url_helper.get_domain(url)

        result = website.get_meta_info(url)
        logger.info(url)
        logger.info(
            json.dumps(result, ensure_ascii=False, cls=util.CJsonEncoder))
        if result is None:
            result = {"url": url, "httpcode": 404}
        else:
            if result["url"] != result["redirect_url"]:
                new_url = url_helper.url_normalize(result["redirect_url"])
                flag1, domain1 = url_helper.get_domain(new_url)
                if domain != domain1:
                    raw_urls.append(new_url)
        result["createTime"] = datetime.datetime.now()
        result["modifyTime"] = result["createTime"]
        try:
            collection.insert(result)
        except:
            pass
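A note on the shared helper: every example in this list unpacks a (flag, domain) tuple from url_helper.get_domain, whose source is not shown here. The sketch below is only an assumption about that contract (flag is True when a usable domain could be extracted, otherwise False with domain left as None); the name get_domain_sketch is hypothetical.

from urlparse import urlparse  # Python 2 stdlib; urllib.parse on Python 3


def get_domain_sketch(url):
    # Hypothetical stand-in for url_helper.get_domain: returns (flag, domain).
    if url is None or url.strip() == "":
        return False, None
    netloc = urlparse(url.strip()).netloc.split(":")[0].lower()
    if netloc == "":
        return False, None
    if netloc.startswith("www."):
        netloc = netloc[4:]  # treat www.example.com and example.com alike
    return True, netloc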
Example #2
def save(collection_market, appmarket, item):
    item["website"] = url_helper.url_normalize(item["website"])
    flag, domain = url_helper.get_domain(item["website"])
    if flag:
        item["website_domain"] = domain
    else:
        item["website_domain"] = None

    temp = "http://" + ".".join(item["apkname"].split(".")[::-1])
    flag, domain = url_helper.get_domain(temp)
    item["apkname_domain"] = domain

    record = collection_market.find_one(
        {
            "appmarket": appmarket,
            "apkname": item["apkname"]
        },
        projection={'histories': False})
    if record:
        _id = record.pop("_id")
        record.pop("key")
        record.pop("key_int")
        #logger.info(json.dumps(record, ensure_ascii=False, cls=util.CJsonEncoder))
        if item["version"] is not None and item["version"].strip() != "":
            if record["version"] is not None and record["version"].strip(
            ) != "" and LooseVersion(item["version"]) > LooseVersion(
                    record["version"]):
                item["createTime"] = record["createTime"]
                item["modifyTime"] = datetime.datetime.now()
                if item["updateDate"] is None:
                    item["updateDate"] = datetime.datetime.now()
                collection_market.update_one({"_id": _id}, {
                    '$set': item,
                    '$addToSet': {
                        "histories": record
                    }
                })
            elif record["version"] is None or record["version"].strip(
            ) == "" or LooseVersion(item["version"]) == LooseVersion(
                    record["version"]):
                item["modifyTime"] = datetime.datetime.now()
                collection_market.update_one({"_id": _id}, {'$set': item})
    else:
        item["createTime"] = datetime.datetime.now()
        item["modifyTime"] = item["createTime"]
        if item["updateDate"] is None:
            item["updateDate"] = datetime.datetime.now()
        try:
            collection_market.insert(item)
        except Exception as e:
            logger.info(e)
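The version check above leans on distutils.version.LooseVersion, which splits a version string into numeric and alphabetic chunks and compares them piecewise, so "2.10" sorts above "2.9" where a plain string comparison would not. A quick illustration (not part of the original code):

from distutils.version import LooseVersion

assert LooseVersion("2.10.1") > LooseVersion("2.9.3")   # numeric, piecewise comparison
assert LooseVersion("1.0") == LooseVersion("1.0")
assert "2.10.1" < "2.9.3"                               # plain string comparison gets it wrong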
Example #3
def count_domains(apps, item_of_url):
    domains = {}
    for app in apps:
        url = app.get(item_of_url)
        flag, domain = url_helper.get_domain(url)
        if flag is not None and domain is not None:
            domains[domain] = 1
    return len(domains)
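For illustration, three records whose URLs span two sites would count as 2, since the domains dict is keyed by the extracted domain (the sample data below is made up):

apps = [
    {"website": "http://example.com/a"},
    {"website": "http://example.com/b"},
    {"website": "http://other.org"},
]
# count_domains(apps, "website") -> 2: both example.com URLs collapse to one
# key in the domains dict and other.org contributes the second.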
Example #4
def get_meta_info(url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
    headers = {
        'User-Agent': user_agent,
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip'
    }
    try:
        request = urllib2.Request(url, None, headers)
    except:
        return None
    opener = urllib2.build_opener()
    retries = 0
    while True:
        try:
            r = opener.open(request, timeout=17)
            if r.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(r.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = r.read()
            content = util.html_encode(data)
            redirect_url = url_helper.url_normalize(r.geturl())
            #logger.info(redirect_url)
            #logger.info(content)
            d = pq(html.fromstring(content))
            title = d("title").text()
            #logger.info(title)
            keywords = d("meta[name='keywords']").attr("content")
            if keywords is None:
                keywords = d("meta[name='Keywords']").attr("content")
            #logger.info(keywords)
            description = d("meta[name='description']").attr("content")
            if description is None:
                description = d("meta[name='Description']").attr("content")
            #logger.info(description)

            flag, domain = url_helper.get_domain(url)
            if flag is not True:
                domain = None
            return {
                "url": url,
                "redirect_url": redirect_url,
                "domain": domain,
                "title": title,
                "tags": keywords,
                "description": description,
                "httpcode": 200
            }
        except:
            retries += 1
        if retries >= 3:
            return None
    return None
Example #5
def find_link(link, source, sourceId):
    if link is None:
        return True
    if link.strip() == "":
        return True

    artifact = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_artifact": {"$elemMatch": {"type": 4010, "link": link}}})
    if artifact is None:
        flag, domain = url_helper.get_domain(link)
        if domain is not None:
            artifact = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_artifact": {"$elemMatch": {"type": 4010, "domain": domain}}})

    if artifact is None:
        return False
    else:
        return True
Example #6
def check_dup(websites, pattern):
    global Md
    global Ma
    global Mc
    linksmap = {}
    for website in websites:
        if website["link"] is not None and website["link"].strip() != "":
            linksmap.setdefault(website["link"].strip(), []).append(website)
            if website["domain"] is None or website["domain"].strip() == "":
                flag, domain = url_helper.get_domain(website["link"])
                website["domain"] = domain
                logger.info("Website Missing domain for :%s , %s",
                            website["id"], website["companyId"])
                fullFill(website)
                Md += 1
        else:
            logger.info("Website Missing link for :%s , %s", website["id"],
                        website["companyId"])
            remove_dup([website["id"]])
            Ma += 1

    dups = []
    for link in linksmap:
        if len(linksmap[link]) < 2: continue
        maxscore = 0
        remainId = None
        allIds = []
        Mc += 1
        for web in linksmap[link]:
            allIds.append(web["id"])
            logger.info("DUP: %s: %s /%s /%s", web["id"], web["link"],
                        web["createTime"], web["companyId"])
            score = len([
                column for column in scores
                if web[column] is not None and str(web[column]).strip() != ""
                and str(web[column]).strip() != "0"
            ])
            if remainId is None:
                remainId = web["id"]
                maxscore = score
            elif score > maxscore:
                remainId = web["id"]
                maxscore = score
        logger.info("Remain: %s", remainId)
        dups.extend([id for id in allIds if id != remainId])
    if len(dups) > 0:
        logger.info("Remove: %s", dups)
        remove_dup(dups)
Example #7
def process(item):
    logger.info("process: %s, %s", item["id"], item["name"])
    deal_id = item["dealId"]
    if deal_id is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return

    conn = db.connect_torndb()
    deal = conn.get("select * from deal where id=%s", deal_id)
    conn.close()
    if deal is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return

    company_id = deal["companyId"]

    conn = db.connect_torndb()
    sc = conn.get(
        "select * from source_company where companyId=%s and source=13001 and sourceId=%s",
        company_id, str(deal_id))
    if sc is None:
        source_company_id = conn.insert(
            "insert source_company(companyId,source,sourceId,createTime,processStatus) "
            "values(%s,%s,%s,now(),%s)", company_id, 13001, str(deal_id), 2)
    else:
        source_company_id = sc["id"]

    if item["sourceArtifactId"] is None:
        link = item["link"]
        domain = None
        if item["type"] == 4010:
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is False:
                domain = None

        sourceArtifactId = conn.insert(
            "insert source_artifact(sourceCompanyId,name,description,link,domain,type,createTime) "
            "values(%s,%s,%s,%s,%s,%s,now())", source_company_id, item["name"],
            item["description"], link, domain, item["type"])
        conn.update(
            "update deal_artifact_new set sourceArtifactId=%s, proceed='Y' where id=%s",
            sourceArtifactId, item["id"])
        conn.update("update source_company set processStatus=0 where id=%s",
                    source_company_id)
    conn.close()
Example #8
def parser(item):
    if item is None:
        return None

    investor_key = item["key"]

    html = item["content"]
    #logger.info(html)
    d = pq(html)
    investor_name = d('div.picinfo> p> span.title').text()
    investor_name = name_helper.company_name_normalize(investor_name)
    logger.info("investor_name: " + investor_name)

    if investor_name is None:
        logger.info("No investor name!!!")
        return None

    logo = d('div.pic> img').attr("src")
    if logo is not None:
        logo = logo.strip()
    logger.info("Investor Logo: %s" % logo)

    website = d('span.links >a[target="_black"]').attr("href")
    if website is None or website.strip() == "暂无":
        website = None

    website = url_helper.url_normalize(website)
    flag, domain = url_helper.get_domain(website)
    if flag is None:
        website = None

    logger.info("Investor website: %s" % website)

    stageStr = d('div.pad.block> div.list-tags.yellow').text().replace(
        " ", ",").strip()
    logger.info("Investor rounds: %s" % stageStr)

    fieldsStr = d('div.pad.block> div.list-tags.darkblue').text().replace(
        " ", ",").strip()
    logger.info("Investor fields: %s" % fieldsStr)

    desc = d('div.des').text().strip()
    logger.info("Investor desc: %s" % desc)

    return investor_key, investor_name, logo, website, stageStr, fieldsStr, desc
Example #9
def save_androidWebsite_artifact(app, source, sourceId):
    url = app["website"]
    flag, domain = url_helper.get_domain(url)
    if flag is not True:
        return None
    if find_link(url, source, sourceId):
        return None

    try:
        andwebsadata = {
            "name": app["name"],
            "description": app["description"],
            "link": app["website"],
            "type": 4010,
            "domain": app["website_domain"],
            "extended": 'Y',
        }
        save_mongo_source_artifact(source, sourceId, andwebsadata)
        return 1
    except:
        return None
Example #10
def save_itunesSellerUrl_artifact(app, source, sourceId):
    url = app["sellerUrl"]
    flag, domain = url_helper.get_domain(url)
    if flag is not True:
        return None

    if find_link(app["sellerUrl"], source, sourceId):
        return None

    try:
        itunessellersadata = {
            "name": app["sellerName"],
            "description": app["description"],
            "link": app["sellerUrl"],
            "type": 4010,
            "domain": app["sellerDomain"],
            "extended": 'Y',
        }
        save_mongo_source_artifact(source, sourceId, itunessellersadata)
        return 1
    except:
        return None
Example #11
def process(g, crawler, url, key, content):
    if has_content(content):
        #logger.info(content)
        main = pq(content)('div.article_content')
        d = pq(main)
        title = d('h1#article_title').text()
        brief = pq(content)("meta[name='description']").attr("content")
        # post_time =pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)",
                                    content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrcet post time")
            logger.info(content)
            exit()
            return

        contents = extract.extractContents(url, content)
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
        else:
            category = None
        tags = []

        articletags = pq(content)("meta[name='keywords']").attr("content")
        if articletags is None:
            logger.info(content)
        else:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        logger.info("%s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags))
        #logger.info(news_time)
        #logger.info(contents)
        # for t in contents:
        #     logger.info(t["data"])

        #item = collection_news.find_one({"source": g.SOURCE, "key_int": int(key)})
        craw = True
        #2016-10-01 pencilnews website upgrade, news keys changed! Have to redownload article with new keys
        cnews = collection_news.find_one({
            "source": g.SOURCE,
            "key_int": int(key)
        })
        if cnews is not None:
            logger.info("%s, %s", url, cnews["link"])
            if url == cnews["link"]:
                craw = False
            else:
                collection_news.delete_many({
                    "source": g.SOURCE,
                    "key_int": int(key)
                })
                logger.info("different link!")

        if craw:
            newses = list(
                collection_news.find({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
                }))
            for news in newses:
                if news.has_key("type") and news["type"] > 0:
                    craw = False
                    break
        if craw:
            if collection_news.find_one({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
            }) is not None:
                collection_news.delete_many({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
                })
            flag, domain = url_helper.get_domain(url)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": g.SOURCE,
                "key": key,
                "key_int": int(key),
                "type": TYPE,
                "original_tags": tags,
                "processStatus": 0,
                "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain
            }

            dcontents = []
            rank = 1
            for c in contents:
                if c["data"] == "/The End/":
                    break
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            post = util.get_poster_from_news(dcontents)
            dnews["post"] = post
            dnews["brief"] = brief
            if news_time > datetime.datetime.now():
                logger.info("Time: %s is not correct with current time",
                            news_time)
                dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                    hours=8)
            collection_news.insert(dnews)
            logger.info("*************DONE**************")

        g.latestIncr()
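A pattern this and the later news parsers share: the parsed timestamp is stored as dnews["date"] = news_time - datetime.timedelta(hours=8). Presumably the source pages publish times in China Standard Time (UTC+8) and the subtraction normalizes them to UTC before the document is written to MongoDB, e.g.:

import datetime

cst = datetime.datetime(2016, 10, 1, 8, 0)   # 08:00 as printed on the page (UTC+8)
utc = cst - datetime.timedelta(hours=8)      # stored as 2016-10-01 00:00 UTC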
Example #12
def parse_investor(item):

    logger.info("*** investfirm ***")

    investor_key = item["key"]
    html = item["content"]
    logger.info(investor_key)
    d = pq(html)

    logo = d('.logo-block > img').attr('src')

    if logo == "http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png":
        logo = None
    basic_info = d('div.col-md-9> div> table> tr> td').eq(1)
    #logger.info(logo)
    name = pq(basic_info)('div.name').text().strip()
    if name is None:
        logger.info("No investor name!!!")
        return None
    desc = pq(basic_info)('div.desc').eq(0).text().strip()
    #logger.info(name+" "+desc)
    try:
        website = pq(basic_info)('div').eq(2)('a').text().strip()
    except:
        website = None

    if website is None or website.strip() == "暂无":
        website = None

    website = url_helper.url_normalize(website)
    flag, domain = url_helper.get_domain(website)
    if flag is None:
        website = None

    #logger.info(website)

    main_blocks = d('div.col-md-3> div.col-sm-12')
    #no js data
    #
    # for block in main_blocks:
    #     info = pq(block)
    #     h4 = info('h4.list_title').text().strip()
    #     logger.info(h4)
    #
    #     if h4 == "投资行业分布图":
    #         field = info('g.highcharts-axis-labels').text().strip()

    source_investor = {
        "name": name,
        "website": website,
        "description": desc,
        "logo_url": logo,
        "stage": None,
        "field": None,
        "type": 10020,
        "source": SOURCE,
        "sourceId": investor_key
    }
    logger.info(
        json.dumps(source_investor, ensure_ascii=False, cls=util.CJsonEncoder))

    return source_investor
Example #13
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".html", "").replace(
            'detail_', '')

        type = TYPE

        category = None

        title = d('div.left.zb-n> h1').text().strip()

        tags = []

        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # brief = d("meta[name='description']").attr("content").replace(u'一鸣网——让发生的发声|智慧共享新媒体平台|上海TMT媒体开创者、一鸣网ymtmt.com','')
        brief = d('div.left.zb-n> p.gy').text().strip()
        news_time = datetime.datetime.now()

        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        start = False
        for c in contents:
            if start is False and c["data"].find(
                    brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue

            if c["data"].find("-END-") >= 0:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    else:
        logger.info('has no news content %s', newsurl)
    return
Example #14
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
    if 1:
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".htm", "")

        title = d('h1.title').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        brief = None

        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')

        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()

        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)

        dnews['type'] = TYPE

        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
    return
Example #15
def process_news(newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('here.')

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None

        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        key = d('article').attr('id').strip().split('-')[-1]

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            post_time = d("header> div> span> time").text().strip()
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)

        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)

        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": Type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }

        dcontents = []
        rank = 1

        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
Example #16
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        type = TYPE
        if sort.find("投融资") >= 0:
            type = 60001
        category = None

        title = d('div.mod-head> h1').text().strip()

        if title is None or title == "":
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.replace(",", ",").split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # #

        # newspost1 = d('div.article-main> div> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # post = d("meta[property='og:image']").attr("content")
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            # if post_time == datetime.date.strftime(datetime.date.today(),'%Y-%m-%d'):
            #     news_time = datetime.datetime.now()
            # else:
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except:
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), type, category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # if c["data"].find("◆END◆")>=0 or c["data"].find("…………………")>=0:
            #     break
            #
            # if c["data"].find("ACG 领域最具影响力的产业新媒体") >= 0 or c["data"].find("访问三文娱网站3wyu.com查看产业必读文章") >=0:
            #     continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        if title is not None and len(contents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE************* %s", mid)
    return
Example #17
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".shtml", "")

        type = TYPE

        category = None

        title = d('div.subject> h1').text().strip()

        tags = []

        post = newspost

        brief = d("meta[name='description']").attr("content")

        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
Example #18
def save_itunes(response, data):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        # request(response.request.url, lambda r, data=data: save_itunes(r,data))
        # return
    else:
        try:
            html = response.body
            d = pq(html)
            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer

            supportUrl = None
            links = d('li.t-subbody>a.targeted-link.link.icon')
            for i in links:
                title = pq(i).text().strip()
                if title.endswith("支持"):
                    supportUrl = pq(i).attr('href').strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)

            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])

            relatedApps = []
            try:
                # divs = d('div.swoosh')
                # for div in divs:
                #     e = pq(div)
                #     if e('div.title').text().strip() == "Customers Also Bought" or e('div.title').text().strip() == "用户购买的还有":
                #         apps = e('div.content> div> div.application')
                #         for app in apps:
                #             app_id = pq(app).attr('adam-id')
                #             relatedApps.append(int(app_id))
                #logger.info("*********************%s", app_id)
                apps = d('div.l-row.l-row--peek> a')
                for app in apps:
                    appurl = pq(app).attr('href')
                    r = util.re_get_result('/id(\d*)', appurl)
                    if r is not None:

                        track_id, = r
                        try:
                            app_id = int(track_id)
                            relatedApps.append(int(app_id))
                        except:
                            pass
            except:
                pass
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps

            userComments = []
            cdivs = d('div.l-row.l-row--peek> div.ember-view')
            for cdiv in cdivs:
                c = pq(cdiv)
                try:
                    c_title = c(
                        'div.we-customer-review> div.we-customer-review__header> h3'
                    ).eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(
                        1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr(
                        "aria-label")

                    comment = {
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    }
                    userComments.append(comment)

                except:
                    pass

            logger.info(
                json.dumps(userComments,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            data["userComments"] = userComments

            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                if flag:
                    data["supportDomain"] = domain
                else:
                    data["supportDomain"] = None
            if data.has_key("sellerUrl") and data["sellerUrl"] is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                if flag:
                    data["sellerDomain"] = domain
                else:
                    data["sellerDomain"] = None

            short_name = name_helper.get_short_name(data["trackName"])
            data["trackShortName"] = short_name
            logger.info(
                json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

            record = collection_itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
                # elif LooseVersion(data["version"]) == LooseVersion(record["version"]):
                #     data["modifyTime"] = datetime.datetime.now()
                #     collection_itunes.update_one({"_id": _id}, {'$set': data})
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert(data)

        except:
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Example #19
def parse_company(item):
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    #company basic info
    c = item["data"]["basic"]

    tags = c["tags"]

    tags_str = tags.replace("|",",")

    logo=c["icon"]
    if logo.find("product_default.png") >= 0:
        logo = None

    establish_date = None
    if c.has_key("open_time"):
        try:
            establish_date = datetime.datetime.strptime(c["open_time"], "%Y-%m-%d")
        except:
            pass

    address1 = None
    address2 = None
    if c.has_key("city"):
        address2 = c["city"]
    if c.has_key("province"):
        address1 = c["province"]

    location_id = 0
    if address2 is not None and address2.strip() != "":
        location = parser_db_util.get_location(address2)
        if location is not None:
            location_id = location["locationId"]

    if location_id == 0 and address1 is not None and address1.strip() != "":
        location = parser_db_util.get_location(address1)
        if location is not None:
            location_id = location["locationId"]

    fullName = c["company"]
    if fullName is None or fullName.strip() == "":
        fullName = None
    else:
        fullName = fullName.replace("_","")
        idx = fullName.rfind(u"公司")
        if idx != -1:
            fullName = fullName[:(idx+len(u"公司"))]
        fullName = name_helper.company_name_normalize(fullName)

    name = c["product"]
    desc = ""
    brief = ""
    productDesc = None
    modelDesc = None
    operationDesc = None
    teamDesc = None
    marketDesc = None
    compititorDesc = None
    advantageDesc = None
    planDesc = None
    otherDesc = None


    if c.has_key("desc"):  # 其他
        # otherDesc = c["intro"].strip()
        desc = c["desc"].strip()

    if c.has_key("yewu"):  # 其他
        # otherDesc = c["intro"].strip()
        brief = c["yewu"].strip()

    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }

    artifacts = []
    websites = []
    if c.has_key("gw_link") is True and c["gw_link"].strip() !="" and c["gw_link"] not in websites:
        websites.append(c["gw_link"])
    if c.has_key("source_gw_link") is True and c["source_gw_link"].strip() != "" and c["source_gw_link"] not in websites:
        websites.append(c["source_gw_link"])
    if item["data"].has_key("productinfos") is True:
        for pi in item["data"]["productinfos"]:
            if pi.has_key("link") is True and pi["link"].strip() !="" and pi["link"] not in websites:
                websites.append(pi["link"])

    for website in websites:
        type, app_market, app_id = url_helper.get_market(website)
        if type == 4010:
            if item["url"] != website and website.find("qimingpian.com") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": brief,
                        "link": website,
                        "domain": domain
                    })
        elif type == 4020 or type == 4030:
            domain = None
            if domain is not None:
                artifacts.append({
                    "type": type,
                    "name": name,
                    "description": brief,
                    "link": website,
                    "domain": domain
                })
        elif type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": brief,
                    "link": website,
                    "domain": domain
                })
        elif type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_db_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": brief,
                    "link": website,
                    "domain": domain
                })

    return {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": modelDesc,
        "operationDesc": operationDesc,
        "teamDesc": teamDesc,
        "marketDesc": marketDesc,
        "compititorDesc": compititorDesc,
        "advantageDesc": advantageDesc,
        "planDesc": planDesc,
        "otherDesc": otherDesc,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,

    }
Example #20
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        # type = TYPE

        category = None

        title = d('.article_title p').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return

        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com"+ post

        brief = None
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()

        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        # dt = datetime.date.today()
        # today = datetime.datetime.now()
        # if news_time is None or news_time > today:
        #     news_time = datetime.datetime.now()

        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief

        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})

            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")

    return
Example #21
0
def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
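        # "ptime" is a unix timestamp; convert it to a datetime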
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()

        if column is not None:
            tags = column.split()
        else:
            tags = []

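        # map the column tags to internal category ids (人物 -> 60103, 公司 -> 60105 / "大公司")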
        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None

        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())

        postraw = d("link[rel='image_src']").attr("href")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        # if collection_news.find_one({"link": url}) is not None:
        #     return
        #     # collection_news.delete_one({"link": url})
        #
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []

        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"].replace("?imageView2/2/w/750/q/90", ""),
                         download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #22
0
def handle_lookup_result(response, app, date_num):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        logger.info("Last Total number of current patch: %s", total)
        request(response.request.url,
                lambda r, app=app, date_num=date_num: handle_lookup_result(
                    r, app, date_num))
        return
    else:
        logger.info("Getting result from url: %s", response.request.url)
        trackId = int(app["domain"])
        try:
            data = json.loads(response.body)
            if data["resultCount"] > 0:
                for result in data["results"]:
                    if result.get("trackId") == trackId:
                        score = result.get("averageUserRating")
                        comment = result.get("userRatingCount")
                        logger.info(
                            "companyId=%s, artifactId=%s, score=%s, comment=%s, date_num=%s"
                            % (app["companyId"], app["id"], score, comment,
                               date_num))

                        if score is not None or comment is not None:
                            save_comment(app["trackId"], score, comment)

                        logger.info("Last Total number of current patch: %s",
                                    total)

                        if result.has_key("sellerUrl") and result[
                                "sellerUrl"] is not None:
                            result["sellerUrl"] = url_helper.url_normalize(
                                result["sellerUrl"])
                            flag, domain = url_helper.get_domain(
                                result["sellerUrl"])
                            if flag:
                                result["sellerDomain"] = domain
                            else:
                                result["sellerDomain"] = None

                        short_name = name_helper.get_short_name(
                            result["trackName"])
                        result["trackShortName"] = short_name

                        record = collection_itunes.find_one(
                            {"trackId": result["trackId"]},
                            projection={'histories': False})
                        if record:
                            collection_itunes.update_one(
                                {"_id": record["_id"]}, {
                                    '$set': {
                                        "checkTime": datetime.datetime.now()
                                    }
                                })
                            if record.get("offline_itunes", None) == 'Y':
                                offrecord = {
                                    "offlineDetectTime":
                                    datetime.datetime.now(),
                                    "offline_itunes": 'N'
                                }
                                collection_itunes.update_one(
                                    {"_id": record["_id"]}, {
                                        '$set': {
                                            "offline_itunes":
                                            'N',
                                            "offlineitunesDetectTime":
                                            datetime.datetime.now()
                                        },
                                        '$addToSet': {
                                            "offline_itunes_histories":
                                            offrecord
                                        }
                                    })
                            _id = record.pop("_id")
                            if LooseVersion(result["version"]) > LooseVersion(
                                    record["version"]):
                                # if 1:
                                page_url = result.get("trackViewUrl").replace(
                                    "&uo=4", "")

                                if date_num == 6 and page_url is not None and page_url.strip(
                                ) != "":
                                    # only crawl the page data when date_num is 6
                                    logger.info(
                                        "Need to crawler page data: %s",
                                        page_url)
                                    total += 1
                                    request(page_url,
                                            lambda r, appdata=result:
                                            save_itunes(r, appdata))
                                else:
                                    logger.info(
                                        json.dumps(result,
                                                   ensure_ascii=False,
                                                   cls=util.CJsonEncoder))
                                    result["createTime"] = record["createTime"]
                                    result[
                                        "modifyTime"] = datetime.datetime.now(
                                        )
                                    collection_itunes.update_one(
                                        {"_id": _id}, {
                                            '$set': result,
                                            '$addToSet': {
                                                "histories": record
                                            }
                                        })
                        else:
                            result["createTime"] = datetime.datetime.now()
                            result["modifyTime"] = result["createTime"]
                            collection_itunes.insert(result)

                        break
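            # an empty result set from the lookup API is treated as the app being taken offline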
            elif data["resultCount"] == 0:
                record = collection_itunes.find_one(
                    {"trackId": trackId}, projection={'histories': False})
                logger.info("***********Offline************")
                if record:
                    if record.get("offline_itunes",
                                  None) is None or record.get(
                                      "offline_itunes", None) == 'N':
                        offrecord = {
                            "offlineDetectTime": datetime.datetime.now(),
                            "offline_itunes": 'Y'
                        }
                        collection_itunes.update_one({"_id": record["_id"]}, {
                            '$set': {
                                "offline_itunes": 'Y',
                                "offlineitunesDetectTime":
                                datetime.datetime.now(),
                                "checkTime": datetime.datetime.now()
                            },
                            '$addToSet': {
                                "offline_itunes_histories": offrecord
                            }
                        })
                    else:
                        collection_itunes.update_one(
                            {"_id": record["_id"]},
                            {'$set': {
                                "checkTime": datetime.datetime.now()
                            }})
        except:
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Example #23
0
    article = d('div.pfcng-row-02> div.pfcng-col-2> div.pos-0> div.pane-content').html()
    if article.find('<form') >= 0:
        form_str = re.search('<form(.*?)</form>', article).group(1)
        article = article.replace(form_str, '')
    # elif article.find('<iframe') >= 0:
    #     iframe_str = re.search('<iframe(.*?)</iframe>',article).group(1)
    #     article = article.replace(iframe_str, '')
    contents = extract.extractContents(newsurl, article, document=False)

    logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                ":".join(tags), category, brief)

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": None,
        "type": type,
        "original_tags": tags,
        "processStatus": 1,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
Example #24
0
def process_news(item, url, content):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))

        title = d(
            'div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
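        # the origin line ends with a "YYYY/MM/DD HH:MM:SS" timestamp; pull it out with a regex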
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            # fall back to the current time if the publish date cannot be parsed
            news_time = datetime.datetime.now()

        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        if column is not None:
            tags = column.split()
        else:
            tags = []

        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time, news_time,
                    brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        #logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})
        #
        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_posterId_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #25
0
def process_news(column, j_content, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = j_content['id']
        type = TYPE
        title = j_content['title']

        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']

        try:
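            # "date" is a millisecond unix timestamp; drop the last three digits before converting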
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article,document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
Example #26
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))

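        # use the final path segment of the URL (minus ".htm") as the article key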
        key = newsurl.split("/")[-1].replace(".htm", "")

        type = TYPE

        category = None
        categoryNames = []

        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        post = None
        brief = d("meta[name='description']").attr("content")

        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #27
0
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]

        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key

        type = TYPE

        category = None
        categoryNames = []
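        # newsChannelId 52 is funding news (融资); tagName "人物" maps to the people category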
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")

        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103

        tags = []
        if content.has_key("keywordList") is True and len(
                content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag[
                        "keyword"] is not None and tag["keyword"].strip(
                        ) != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])

        title = content["news"]["title"].replace("&quot;", "\"")

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return

        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["news"]["introduction"]

        post_time = content["news"]["updateAt"]

        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # return
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     logger.info("***************************News existed!!!***********************")
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return
Example #28
0
def test_get_domain(self):
    self.assertEqual('md.openapi.360.cn', uh.get_domain('http://md.openapi.360.cn/list/get'))
    self.assertEqual('short.weixin.qq.com', uh.get_domain('http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns'))
    self.assertEqual('inews.test.com.cn', uh.get_domain('http://inews.test.com.cn/redisTool?type=get&key=downloadNews_158008435%2CdownloadVideo_158008435'))
    self.assertEqual('127.0.0.1', uh.get_domain('http://127.0.0.1/redisTool?type=get&key=downloadNews_158008435%2CdownloadVideo_158008435'))
Example #29
0
def process_news(content, news_key, url):
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
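        # "data-time" holds a unix timestamp for the publish time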
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon, post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time, news_time, temp,  category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        # # logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})

        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})

        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief,2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post

        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #30
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d("span.article-time").eq(0).text().strip()

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue

            if c["data"].find("8btctest1/custom/images") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
Example #31
0
def process(content, citykey, crawler):
    cnt = 0
    if has_content(content):
        DT = datetime.date.today()
        TODAY = datetime.datetime(DT.year, DT.month, DT.day)
        #logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8")))
        lis = d('div.wrap> div> div> ul.ativities> li.item')
        for li in lis:
            c = pq(li)
            title = c('h3.title> a').text()
            link = c('h3.title> a').attr("href")
            if link.find("http") == -1:
                continue
            key = link.split("/")[-1]
            key_int = int(key)
            img = c('a> img').attr("src")
            poster = None
            if img is not None:
                img = img.strip().replace("|130w", "")
                # logger.info("poster: %s", poster)
                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(img, download_crawler, SOURCE, key,
                                                                             "news")
                if posturl is not None:
                    poster = str(posturl)
            location = c('div.intro> div.address').text()
            sponors = c('div.intro> div.sponors> span').text().replace(","," ").replace(","," ").split()
            spans = c('div.intro> div.time> span')
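            # 3 spans: single-day event (date + "start~end" time); 5 spans: event spanning multiple days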
            if len(spans) == 3:
                date = c('div.intro> div.time> span').eq(0).text()
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date+" "+times[0]
                endTime = date+" "+times[1]
            elif len(spans) == 5:
                date = c('div.intro> div.time> span').eq(0).text()
                year = date.split("-")[0]
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date+" "+times[0]
                endTime = year+"-"+times[1]+" "+c('div.intro> div.time> span').eq(4).text()
            else:
                continue
            try:
                beginDate = datetime.datetime.strptime(beginTime, "%Y-%m-%d %H:%M")
                endDate = datetime.datetime.strptime(endTime, "%Y-%m-%d %H:%M")
            except:
                beginDate = None

            if beginDate is None or beginDate < TODAY or key_int is None:
                # Not save active activity
                continue

            result = crawler.crawl(link)
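            # retry the detail-page crawl until it returns successfully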
            while True:
                if result['get'] == 'success':
                    break
                else:
                    result = crawler.crawl(link)
            if has_content(result['content']):
                contents = extract.extractContents(link, result['content'])
                flag, domain = url_helper.get_domain(link)
                dact = {
                    "beginDate": beginDate - datetime.timedelta(hours=8),
                    "endDate": endDate - datetime.timedelta(hours=8),
                    "date": beginDate - datetime.timedelta(hours=8),
                    "title": title,
                    "link": link,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": key_int,
                    "type": TYPE,
                    "original_tags": [],
                    "processStatus": 0,
                    "companyIds": [],
                    "location": location,
                    "city": citymap[citykey],
                    "sponors": sponors,
                    "post": poster,
                    "domain": domain,
                    "categoryNames": []
                }
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        if c["data"].find("我要报名") >= 0:
                            logger.info("************************over")
                            break
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        # dc = {
                        #     "rank": rank,
                        #     "content": "",
                        #     "image": "",
                        #     "image_src": c["data"],
                        # }
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"],download_crawler, SOURCE,
                                                                                    key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue
                    dcontents.append(dc)
                    rank += 1
                dact["contents"] = dcontents
                value = activity_simhash.get_simhash_value(dcontents)
                dact["simhashValue"] = value

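                # if this activity was saved before, skip it when nothing changed; otherwise replace the stale record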
                record = collection_news.find_one({"source": SOURCE, "key_int": key_int})
                if record is not None:
                    city = record["city"]
                    if record["beginDate"] == dact["beginDate"] and record["endDate"] == dact["endDate"] and record["title"] == dact["title"] and record["city"] == citymap[citykey] and record["location"] == dact["location"]:
                        logger.info("%s activity already existed", title)
                        cnt += 1
                        continue
                    else:
                        collection_news.delete_one({"source": SOURCE, "key_int": key_int})
                        if city != citymap[citykey]:
                            logger.info("%s has two city : %s and %s with location %s, something is wrong", title, city, citymap[citykey], location)
                            cnt += 1
                            continue

                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title, beginDate, endDate, ":".join(sponors),location, link, img)
                else:
                    if activity_simhash.check_same_act(dact) is True:
                        pass
                    else:
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title, beginDate, endDate, ":".join(sponors), location, link, img)
                cnt += 1
                logger.info("************Done***************")
    logger.info("*******%s activities has been checked or recorded", cnt)
    return cnt