Example 1
def get_location_from_company_name(unicode_name):
    # Handles company-name formats such as:
    #   上海测试公司        (city prefix)
    #   测试(上海)公司      (city in parentheses)
    #   北京测试上海分公司  (branch company, ending in 分公司)
    conn = db.connect_torndb()
    locations = list(conn.query("select locationName from location"))
    conn.close()
    m_location = None
    sub_location = None
    for location in locations:
        if unicode_name.startswith(location["locationName"]):
            m_location = location["locationName"]
            break

    if m_location is None:
        r = util.re_get_result(u"\((.*)\)", unicode_name)
        if r is not None:
            location, = r
            conn = db.connect_torndb()
            l = conn.query("select * from location where locationName=%s",
                           location)
            conn.close()
            if l is not None:
                m_location = location

    r = util.re_get_result(u"(.*)分公司", unicode_name)
    if r is not None:
        name, = r
        for location in locations:
            if name.endswith(location["locationName"]):
                sub_location = location["locationName"]
                break
    return m_location, sub_location
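
Every example in this listing uses util.re_get_result the same way: the return value is either None or a tuple of the regex's captured groups, small enough to unpack directly. A minimal sketch of that contract (a hypothetical reconstruction, not the project's actual helper):

import re

def re_get_result(pattern, text):
    # Return the captured groups of the first match as a tuple, or None on no match.
    m = re.search(pattern, text)
    if m is None:
        return None
    return m.groups()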
Example 2
def handle_app_result(response, app):
    global total

    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        if response.code in (301, 302, 500, 0):
            pass
        else:
            http_client.fetch(response.request.url,
                              lambda r, app=app: handle_app_result(r, app),
                              request_timeout=10)
            return
    else:
        logger.info(response.request.url)
        try:
            #html = unicode(response.body,encoding="utf-8",errors='replace')
            html = response.body
            (download, ) = util.re_get_result('downTimes:"(.*?)"', html)
            (score, ) = util.re_get_result(
                '<div class="com-blue-star-num">(.*?)分</div>', html)
            score = float(score)
            save_download(app["companyId"], app["artifactId"], download, score)
            logger.info("download=%s, score=%s" % (download, score))
        except:
            traceback.print_exc()

    total -= 1

    if total <= 0:
        begin()
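
These handlers requeue failed URLs with lambda callbacks that take default arguments (lambda r, app=app: ...). Defaults are evaluated once, when the lambda is defined, so each callback keeps the app it was created for; a plain closure would instead look the variable up when the callback finally fires, which matters inside loops. A minimal illustration:

callbacks = []
for name in ["a", "b"]:
    # the default argument freezes this iteration's value of name
    callbacks.append(lambda r, name=name: (r, name))

print(callbacks[0]("resp"))  # ('resp', 'a'); without the default, both would see 'b'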
Example 3
def handle_app_result(response, app, url, apkname, retry=0):
    global total

    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        if response.code in (301, 302, 500, 0):
            logger.info("not retrying code %s" % response.code)
        else:
            retry += 1
            if response.code == 403:
                if retry < 20:
                    http_client.fetch(
                        response.request.url,
                        lambda r, app=app, url=url, apkname=apkname, retry=retry:
                        handle_app_result(r, app, url, apkname, retry),
                        request_timeout=10)
                    return
                else:
                    pass
            else:
                http_client.fetch(
                    response.request.url,
                    lambda r, app=app, url=url, apkname=apkname, retry=retry:
                    handle_app_result(r, app, url, apkname, retry),
                    request_timeout=10)
                return
    else:
        logger.info(response.request.url)
        try:
            # Parser data for newupdates:
            #logger.info("%s->%s", apkname, url)
            myapp_parser.process(None, url, apkname, response.body)

            #html = unicode(response.body,encoding="utf-8",errors='replace')
            html = response.body
            (download, ) = util.re_get_result('downTimes:"(.*?)"', html)
            (score, ) = util.re_get_result(
                '<div class="com-blue-star-num">(.*?)分</div>', html)
            score = float(score)
            download = float(download)
            crawler_util.save_download(app["domain"], TYPE, download, score)
            logger.info("apkname=%s, download=%s, score=%s" %
                        (app["domain"], download, score))

        except:
            traceback.print_exc()

    total -= 1
    logger.info("total: %s", total)

    if total <= 0:
        begin()
Example 4
def handle_page(response):
    global total

    if response.error:
        logger.info("Error: %s, %s" % (response.error,response.request.url))
        request(response.request.url, handle_page)
    else:
        #logger.info(response.body)
        d = pq(response.body)
        apps = d('div#selectedcontent> div> ul> li')
        for app in apps:
            name = pq(app).text()
            app_url = pq(app)('a').attr('href')
            (app_id,) = util.re_get_result(r"id(\d*)",app_url)

            logger.info("%s %s %s" % (app_id, name, app_url))
            item = itunes_collection.find_one({"appId":app_id})
            if item is None:
                data = {
                    "appId":app_id,
                    "name":name,
                    "url":app_url,
                    "date":datetime.datetime.now()
                }
                itunes_collection.insert_one(data)

            if re.match(u'[\u4e00-\u9fa5]+',name):
                if item is None or item.has_key("html")==False:
                    total += 1
                    request(app_url, handle_html)

                if item is None or item.has_key("json")==False:
                    total += 1
                    api_url = "https://itunes.apple.com/cn/lookup?id=%s" % app_id
                    request(api_url, handle_json)

        if len(apps) > 10:
            #logger.info(response.request.url)
            result = util.re_get_result(r"page=(\d*)",response.request.url)
            if result != None:
                (strPage,) = result
                #logger.info(strPage)
                nextPage = str(int(strPage) + 1)
                url = response.request.url
                url = url.replace("page="+strPage, "page=" + nextPage)
                logger.info(url)
                total += 1
                request(url, handle_page)

        total -= 1
        if total <=0:
            exit(0)
Example 5
def login():
    while True:
        try:
            idx = random.randint(0, len(login_users)-1)
            login_user = login_users[idx]
            logger.info(login_user)

            data = {
                    "backurl": "http://beian.links.cn",
                    "bsave": "1",
                    "opaction": "login",
                    "username": login_user["name"],
                    "password": login_user["pwd"],
                    }

            s = my_request.get_http_session(new=True, agent=False)
            logger.info("proxies=%s" % s.proxies)
            r = s.post("http://my.links.cn/checklogin.asp",data=data, timeout=10)
            if r.status_code == 200:
                #html = util.html_encode_4_requests(r.text, r.content, r.encoding)
                r.encoding = r.apparent_encoding
                html = r.text
                #logger.info(html)
                if html is not None:
                    if util.re_get_result(r"(loaduserinfo)",html):
                        return True
        except Exception,ex:
            logger.exception(ex)

        time.sleep(10)
Example 6
def parse_member(item):
    if item is None:
        return None

    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.list-prodcase> li')
    for li in lis:
        l = pq(li)
        member_name = l('h4> a> b> span.c').text().strip()
        position = l('h4> a> b> span.c-gray').text().strip()
        href = l('h4> a').attr("href").strip()
        (member_key, ) = util.re_get_result(r'person/(\d*?)$', href)
        logger.info("member_key: %s, member_name: %s, position: %s" %
                    (member_key, member_name, position))
        member = {"key": member_key, "name": member_name, "position": position}
        members.append(member)

    logger.info("")
    return members
Example 7
def process(city, content):
    #logger.info(content)
    d = pq(html.fromstring(content.decode("utf-8")))
    lis = d('div.search-tab-content-item> div')
    for li in lis:
        key = None
        c = pq(li)
        url = c('a').eq(0).attr("href")
        result = util.re_get_result('/event/(\d+)', url)
        if result:
            key, = result
        if key is None:
            continue

        url = "http://www.huodongxing.com/event/%s" % key
        logger.info(url)
        maxretry = 0
        while True:
            result = crawler.crawl(url, agent=True)
            if result['get'] == 'success':
                break
            elif result['get'] == 'fail' and result["content"] is not None:
                logger.info(result["content"])
                if result["content"].find("系统载入中") > 0:
                    break
            if maxretry > 30:
                result["content"] = " "
                break
            maxretry += 1

        try:
            process_activity(key, result['content'])
        except Exception, ex:
            logger.exception(ex)
Example 8
def process_page(url, content):
    global page_urls

    d = pq(content)
    apps = d('div#selectedcontent> div> ul> li')
    for app in apps:
        name = pq(app).text()
        app_url = pq(app)('a').attr('href')
        (app_id, ) = util.re_get_result(r"id(\d+)", app_url)
        try:
            trackId = int(app_id)
        except Exception, e:
            logger.info(traceback.format_exc())
            logger.info(app_url)
            continue  # trackId would be undefined below

        logger.info("%s %s %s" % (trackId, name, app_url))

        #if util.isChineseString(name):
        if True:  # Chinese-name filter disabled
            item = collection.find_one({"trackId": trackId})
            if item is None:
                data = {
                    "trackId": trackId,
                    "trackName": name,
                    "trackViewUrl": app_url,
                    "createTime": datetime.datetime.now()
                }
                collection.insert_one(data)
            else:
                data = {
                    "trackName": name,
                    "trackViewUrl": app_url,
                    "modifyTime": datetime.datetime.now()
                }
                collection.update_one({"trackId": trackId}, {'$set': data})
Example 9
    def is_crawl_success(self, url, content):
        if content.find('操作成功') == -1:  # "operation succeeded" marker
            logger.info(content)
            return False
        r = "companyId=(.*?)&pageSize"
        result = util.re_get_result(r, url)
        (cid, ) = result
        try:
            j = json.loads(content)
            rjobs = j['content']['data']['page']['result']
            if len(rjobs) == 0:
                logger.info("Failed due to 0 jobs under url: %s", url)
                return False
            if len(rjobs) > 0 and rjobs[0].has_key("companyId"):
                companyId = rjobs[0]["companyId"]
                logger.info("Url companyId: %s <-> lagou returned companyId: %s",
                            cid, companyId)
                if str(companyId) != cid:
                    logger.info(
                        "Failed due to different companyId: got %s from request %s",
                        companyId, url)
                    return False
            return True
        except:
            return True
Example 10
def handle_alexa_cn_result(content, domain, crawler_cn):
    try:
        d = pq(content)
        data = d('script').text()
        data = ''.join(data)
        try:
            (ids,) = util.re_get_result("showHint\('(\S*)'\);", data)
        except:
            traceback.print_exc()
            # logger.info(html)
            return None

        id_arr = ids.split(',')

        data = {"url": id_arr[0],
                "sig": id_arr[1],
                "keyt": id_arr[2]
                }
        body = urllib.urlencode(data)
        url = "http://www.alexa.cn/api_150710.php"
        result = crawler_cn.crawl(url,postdata=body)
        if result['get'] == 'success':
            #logger.info(result["content"])
            data_cn = handle_api_result(result["content"], domain)
            return data_cn
    except:
        traceback.print_exc()
    return None
Example 11
def fetch_alexa(domain):

    alexa = trends_tool.get_alexa(domain)

    url = 'http://www.alexa.cn/index.php?url='+domain
    proxy = {'type': 'http', 'anonymity':'high', 'country': 'cn', 'ping': 5}
    while True:
        s = my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            break

    d = pq(r.text)
    data = d('script').text()
    data = ''.join(data)
    (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
    id_arr = ids.split(',')

    domain = id_arr[0]
    timeout = 10
    while True:
        try:
            r = s.post("http://www.alexa.cn/api_150710.php",
                       data={"url": id_arr[0],
                             "sig": id_arr[1],
                             "keyt": id_arr[2]
                             },
                       timeout=timeout)
            break
        except Exception, ex:
            logger.exception(ex)
            timeout = 20  # retry with a longer timeout
Example 12
def process(content, sourceId, source, key):
    r = "var  = (.*?);"

    result = util.re_get_result(r, content)
    (b, ) = result
    logger.info(b)
    c = b.decode("gbk", "ignore")
    j = json.loads(c)
    infos = j["data"]
    mongo = db.connect_mongo()

    collection = mongo.stock.announcement
    for info in infos:
        ntitle = info["NOTICETITLE"]
        ndate = info["NOTICEDATE"]
        nurl = info["Url"]
        cleantitle = ntitle.replace(":", "").replace(str(sourceId), "").strip()
        logger.info("%s-%s-%s", ntitle, cleantitle, ndate)

        item = collection.find_one({"title": cleantitle})
        item1 = collection.find_one({"title": ntitle})
        if item is not None or item1 is not None:
            logger.info("******already exists")
        else:
            logger.info("******missing, get it")
            crawler_rp(nurl, cleantitle, ndate, sourceId, source)

    if j.has_key("TotalCount") and j["TotalCount"] > (50 * key):
        cnt = 1
    else:
        cnt = 0
    mongo.close()

    return cnt
Example 13
def get_pages(session, cookies, fullname):
    page_result = {'status': None, 'pages': None, 'proxies': None}
    code = get_verify_code(session, cookies)
    if code is None:
        page_result['status'] = 'nocode'
        return page_result
    time.sleep(2)
    # search
    res = 0
    while True:
        res += 1
        if res > 10:
            page_result['status'] = 'nopage'
            return page_result

        try:
            unitname = fullname.encode('gb2312', 'ignore')
            search_url = "http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_searchExecute.action"
            payload = {
                "siteName": "",
                "condition": "5",
                "siteDomain": '',
                "siteUrl": "",
                "mainLicense": "",
                "siteIp": "",
                "unitName": unitname,
                "mainUnitNature": "-1",
                "certType": "-1",
                "mainUnitCertNo": "",
                "verifyCode": code
            }
            proxies = get_proxy('http')
            r = session.post(search_url,
                             data=payload,
                             headers=headers,
                             cookies=cookies,
                             proxies=proxies,
                             timeout=10)
            content = r.text

            if content.find("备案信息查询") >= 0:
                if content.find('没有符合条件的记录') == -1:
                    result = util.re_get_result(r"1/(\d+)", content)
                    if result is not None:
                        pages, = result
                        if pages is not None:
                            page_result['status'] = 'got'
                            page_result['pages'] = pages
                            page_result['proxies'] = proxies
                            return page_result
                        else:
                            page_result['status'] = 'nofind'
                    else:
                        page_result['status'] = 'nofind'
                else:
                    page_result['status'] = 'nofind'
                return page_result
        except:
            pass
Example 14
def fetch_bp(html, referer, cf_key):
    d = pq(html)
    script = d('script').text()
    script = ''.join(script)
    try:
        (pdf_key, ) = util.re_get_result("pptKey = \"(\S+)\"", script)
    except Exception, e:
        return None
    return pdf_key
Example 15
def baidu_get_actual_link(url):
    r = requests.get(url)
    html = r.content
    r = util.re_get_result("URL='(.*?)'", html)
    if r is None:
        return None
    url, = r
    return url
Example 16
def handle_json(response):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error,response.request.url))
        request(response.request.url, handle_json)
    else:
        (app_id,) = util.re_get_result(r"id=(\d*)",response.request.url)
        itunes_collection.update_one({"appId":app_id},{'$set':{'json':response.body}})

        total -= 1
        if total <=0:
            exit(0)
Example 17
def process(search_name, from_doc_id, content):
    d = pq(util.html_encode(content))

    divs = d('div.app')
    for div in divs:
        e = pq(div)
        a = e('a.app-name')
        name = a.text().strip()
        #logger.info(name)
        href = a.attr("href")
        #logger.info(href)
        result = util.re_get_result("docid=(\d*)",href)
        if result:
            (docid_str,) = result
            try:
                docid = long(docid_str)
            except:
                continue
        else:
            continue

        data = e('a.inst-btn')
        if len(data) == 0:
            data = e('a.inst-btn-big')
        if len(data) == 0:
            continue
        app_type = data.attr("data_detail_type")  # avoid shadowing the builtin
        apkname = data.attr("data_package")
        version = data.attr("data_versionname")
        size = None
        try:
            size = long(data.attr("data_size"))
        except:
            pass

        item = {
            "key_int": docid,
            "search_name": search_name,
            "name": name,
            "link": "http://shouji.baidu.com/software/%s.html" % docid,
            "type": type,
            "apkname": apkname,
            "version": version,
            "size": size
        }
        #logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

        try:
            android.save_baidu_search(collection_search, item)
        except Exception,e:
            logger.info(e)
Example 18
def get_id(session, cookies, domain):
    fid = None
    id_result = {'status': None, 'id': fid, 'proxies': None}
    code = get_verify_code(session, cookies)
    if code is None:
        id_result['status'] = 'wrong'
        return id_result

    res = 0
    while True:
        res += 1
        if res > 10:
            id_result['status'] = 'wrong'
            return id_result

        # search
        search_url = "http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_searchExecute.action"
        payload = {
            "siteName": "",
            "condition": "1",
            "siteDomain": domain,
            "siteUrl": "",
            "mainLicense": "",
            "siteIp": "",
            "unitName": "",
            "mainUnitNature": "-1",
            "certType": "-1",
            "mainUnitCertNo": "",
            "verifyCode": code
        }
        proxies = get_proxy('http')
        r = session.post(search_url,
                         data=payload,
                         headers=headers,
                         cookies=cookies,
                         proxies=proxies)
        content = r.text
        if content.find('备案信息查询') >= 0:  # "ICP record query" page loaded
            if content.find('没有符合条件的记录') == -1:  # "no matching records"
                result = util.re_get_result(r"doDetail\('(.*?)'\)", content)
                if result is not None:
                    _id, = result
                    print("id: %s" % _id)
                    id_result['status'] = 'got'
                    id_result['id'] = _id
                    id_result['proxies'] = proxies
                    return id_result
            else:
                print('该 domain 未备案...')  # this domain has no ICP record
                id_result['status'] = 'noid'
                return id_result
Example 19
    def is_crawl_success(self, url, content, redirect_url):
        if content.find("var") >= 0:
            r = "var  = (.*?);"

            result = util.re_get_result(r, content)
            (b, ) = result
            logger.info(b)
            try:
                c = b.decode("gbk", "ignore")
                j = json.loads(c)
                if j.has_key("data") is True:
                    return True
                else:
                    return False
            except Exception, E:
                logger.info("here")
                logger.info(E)
Example 20
def parse_footprint(item):
    if item is None:
        return []

    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    footprints = []
    #footprint
    logger.info("*** footprint ***")
    lis = d('ul.list-milestone> li')
    for li in lis:
        l = pq(li)
        footDesc = l('div> p').eq(0).text().strip()
        logger.info(footDesc)
        if footDesc is None or footDesc == "":
            continue
        footDateText = l('div> p> span.t-small').text().strip()
        logger.info(footDateText)
        if footDateText is None or footDateText == "":
            continue
        result = util.re_get_result('(\d*?)\.(\d*?)$', footDateText)
        if result == None:
            continue
        (year, month) = result
        year = int(year)
        try:
            month = int(month)
        except:
            month = 1

        if month <= 0 or month > 12:
            month = 1
        if year < 1970 or year > 3000:
            year = 1970
        footDate = datetime.datetime.strptime("%s-%s-1" % (year, month),
                                              '%Y-%m-%d')
        logger.info("%s: %s", footDate, footDesc)
        footprint = {"footDate": footDate, "footDesc": footDesc}
        footprints.append(footprint)
    logger.info("")
    return footprints
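
parse_footprint reads milestone dates formatted like 2015.7; the trailing regex splits year and month, and out-of-range values are clamped rather than discarded:

# footDateText u"2015.7"  -> footDate datetime(2015, 7, 1)
# footDateText u"2015.13" -> month clamped to 1 -> datetime(2015, 1, 1)
# footDateText u"1899.5"  -> year clamped to 1970 -> datetime(1970, 5, 1)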
Example 21
def process(content, keyword, link):
    # j = json.loads(content)
    # infos = j["value"]
    logger.info(content)
    cnt = 0
    d = pq(html.fromstring(content.decode("utf-8")))
    title = d('head> title').text().strip()
    logger.info("title: %s", title)

    ptype = None

    mongo = db.connect_mongo()
    collection = mongo.trend.index
    if link.find("MEDIA_WECHAT") >= 0:
        source = 13651  # Sogou WeChat trend index
        sourceDesc = "搜狗微信热度"
        if collection.find_one({"source": source, "keyword": keyword}) is None:
            ptype = 1
        else:
            ptype = 2

    if ptype is None: return

    if ptype == 1:
        logger.info("here")
        r = "root.SG.wholedata\s=\s(.*)?\;.*\}\(this"
    else:
        r = "root.SG.data = (.*?);root.SG.wholedata"
    try:
        result = util.re_get_result(r, content)
    except:
        logger.info("wwwwwww")
        return

    logger.info(result)
    (b, ) = result

    logger.info(b)
    base = json.loads(b, strict=False)
    for pv in base["pvList"]:
        logger.info(json.dumps(pv, ensure_ascii=False, cls=util.CJsonEncoder))

    mongo.close()
    return cnt
Example 22
def handle_alexa_cn_result(response, app):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        request(response.request.url,
                lambda r, app=app: handle_alexa_cn_result(r, app))
        return
    else:
        try:
            html = unicode(response.body, encoding="utf-8", errors='replace')
            d = pq(html)
            data = d('script').text()
            data = ''.join(data)
            try:
                (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
            except:
                # logger.info(html)
                request(response.request.url,
                        lambda r, app=app: handle_alexa_cn_result(r, app))
                return

            id_arr = ids.split(',')

            data = {"url": id_arr[0], "sig": id_arr[1], "keyt": id_arr[2]}
            body = urllib.urlencode(data)
            url = "http://www.alexa.cn/api_150710.php"
            total += 1
            # proxy_ip = get_proxy()
            proxy_ip = None
            request(url,
                    lambda r, app=app, body=body, proxy_ip=proxy_ip:
                    handle_api_result(r, app, body, proxy_ip),
                    body,
                    proxy_ip)
        except:
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Example 23
def fetch(url):
    (key, ) = util.re_get_result("https://itjuzi.com/album/(\d+)", url)
    logger.info("key=%s" % key)

    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)

    if flag == -1:
        return -1

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        return r.status_code

    if r.url != url:
        logger.info("Page Redirect <--")
        return 302

    content = {
        "date": datetime.datetime.now(),
        "url": url,
        "key": key,
        "content": r.text
    }

    # save
    if collection.find_one({"key": key}) != None:
        collection.delete_one({"key": key})
    collection.insert_one(content)

    # msg = {"type":"itjuzi_album", "key":key}
    # logger.info(json.dumps(msg))
    # kafka_producer.send_messages("itjuzi_album", json.dumps(msg))

    return 200
Example 24
def fetch_project(url):
    (cf_key, ) = util.re_get_result("http://dj.jd.com/funding/details/(\d+).html", url)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        html = r.text
        support = fetch_support(cf_key)
        focus = fetch_focus(url, cf_key)
        team = fetch_team(cf_key)
        leader = fetch_leader(cf_key)

        bp = fetch_bp(html, url, cf_key)

        content = {'html': html,
                   'team': team,
                   'support': support,
                   'focus': focus
                   }

        project = {"date":datetime.datetime.now(),
                   "source":source,
                   "url":url,
                   "company_key": cf_key,
                   "cf_key":cf_key,
                   "content":content,
                   'leader': leader,
                   'bp': bp
                   }

        result = cf_collection.find_one({"source":source, "company_key":cf_key, 'cf_key': cf_key})
        if result != None:
            cf_collection.replace_one({'_id': result['_id']}, project)
        else:
            cf_collection.insert_one(project)

        msg = {"type":"cf", "source":source, "cf_key":cf_key}
        logger.info(json.dumps(msg))
        kafka_producer.send_messages("crawler_cf_jd_v2", json.dumps(msg))
Example 25
def parse_member(item):
    if item is None:
        return []

    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.list-prodcase> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            href = l('h4> a').attr("href").strip()
            (member_key, ) = util.re_get_result(r'person/(\d*?)$', href)
            logger.info("member_key: %s, member_name: %s, position: %s" %
                        (member_key, member_name, position))
            memberId = parser_mongo_util.find_mongo_memberId(
                SOURCE, member_key)
            if memberId is None:
                continue
            type = name_helper.position_check(position)
            member = {
                "_memberId": memberId,
                "name": member_name,
                "position": position,
                "type": type
            }
            members.append(member)
        except Exception, ex:
            logger.exception(ex)

    return members
Example 26
def getMoney(moneyStr):
    investment = 0
    currency = 3020  # per the branches below: 3020 = RMB (人民币), 3010 = USD (美元)
    precise = 'Y'

    investmentStr = ""

    if investment == 0:
        result = util.re_get_result(u'(数.*?)万人民币',moneyStr)
        if result != None:
            (investmentStr,) = result
            currency = 3020
            precise = 'N'
        else:
            result = util.re_get_result(u'(数.*?)万美元',moneyStr)
            if result != None:
                (investmentStr,) = result
                currency = 3010
                precise = 'N'

        if investmentStr != "":
            if investmentStr == u"数":
                investment = 1*10000
            elif investmentStr == u"数十":
                investment = 10*10000
            elif investmentStr == u"数百":
                investment = 100*10000
            elif investmentStr == u"数千":
                investment = 1000*10000

    if investment == 0:
        result = util.re_get_result(u'(数.*?)亿人民币',moneyStr)
        if result != None:
            (investmentStr,) = result
            currency = 3020
            precise = 'N'
        else:
            result = util.re_get_result(u'(数.*?)亿美元',moneyStr)
            if result != None:
                (investmentStr,) = result
                currency = 3010
                precise = 'N'

        if investmentStr != "":
            if investmentStr == u"数":
                investment = 1*10000*10000
            elif investmentStr == u"数十":
                investment = 10*10000*10000
            elif investmentStr == u"数百":
                investment = 100*10000*10000
            elif investmentStr == u"数千":
                investment = 1000*10000*10000

    if investment == 0:
        result = util.re_get_result(u'(\d*\.?\d*?)万人民币',moneyStr)
        if result != None:
            (investmentStr,) = result
            currency = 3020
            investment = int(float(investmentStr) * 10000)
        else:
            result = util.re_get_result(u'(\d*\.?\d*?)万美元',moneyStr)
            if result != None:
                (investmentStr,) = result
                currency = 3010
                investment = int(float(investmentStr) * 10000)

    if investment == 0:
        result = util.re_get_result(u'(\d*\.?\d*?)亿人民币',moneyStr)
        if result != None:
            (investmentStr,) = result
            currency = 3020
            investment = int(float(investmentStr) * 100000000)
        else:
            result = util.re_get_result(u'(\d*\.?\d*?)亿美元',moneyStr)
            if result != None:
                (investmentStr,) = result
                currency = 3010
                investment = int(float(investmentStr) * 100000000)

    if investment == 0:
        result = util.re_get_result(u'亿元及以上美元', moneyStr)
        if result != None:
            currency = 3010
            investment = 100000000
            precise = 'N'
        else:
            result = util.re_get_result(u'亿元及以上人民币', moneyStr)
            if result != None:
                currency = 3020
                investment = 100000000
                precise = 'N'

    return currency, investment, precise
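
A few worked inputs for getMoney, traced through the branches above (3020 = RMB, 3010 = USD; precise == 'N' marks an order-of-magnitude figure):

# getMoney(u"数百万人民币") -> (3020, 1000000, 'N')    "several million RMB": 数百 -> 100 * 10000
# getMoney(u"数千万美元")   -> (3010, 10000000, 'N')   "tens of millions USD": 数千 -> 1000 * 10000
# getMoney(u"5000万人民币") -> (3020, 50000000, 'Y')   exact: 5000 * 10000
# getMoney(u"1.2亿美元")    -> (3010, 120000000, 'Y')  exact: 1.2 * 100000000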
Example 27
def process_news(item, url, content):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))

        title = d(
            'div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            news_time = datetime.datetime.now()  # fallback so the date arithmetic below has a datetime

        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        if column is not None:
            tags = column.split()
        else:
            tags = []

        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time, news_time,
                    brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        #logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})
        #
        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_posterId_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example 28
def save_itunes(response, data):
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        # request(response.request.url, lambda r, data=data: save_itunes(r,data))
        # return
    else:
        try:
            html = response.body
            d = pq(html)
            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer

            supportUrl = None
            links = d('li.t-subbody>a.targeted-link.link.icon')
            for i in links:
                title = pq(i).text().strip()
                if title.endswith("支持"):
                    supportUrl = pq(i).attr('href').strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)

            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])

            relatedApps = []
            try:
                # divs = d('div.swoosh')
                # for div in divs:
                #     e = pq(div)
                #     if e('div.title').text().strip() == "Customers Also Bought" or e('div.title').text().strip() == "用户购买的还有":
                #         apps = e('div.content> div> div.application')
                #         for app in apps:
                #             app_id = pq(app).attr('adam-id')
                #             relatedApps.append(int(app_id))
                #logger.info("*********************%s", app_id)
                apps = d('div.l-row.l-row--peek> a')
                for app in apps:
                    appurl = pq(app).attr('href')
                    r = util.re_get_result('/id(\d*)', appurl)
                    if r is not None:

                        track_id, = r
                        try:
                            app_id = int(track_id)
                            relatedApps.append(int(app_id))
                        except:
                            pass
            except:
                pass
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps

            userComments = []
            cdivs = d('div.l-row.l-row--peek> div.ember-view')
            for cdiv in cdivs:
                c = pq(cdiv)
                try:
                    c_title = c(
                        'div.we-customer-review> div.we-customer-review__header> h3'
                    ).eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(
                        1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr(
                        "aria-label")

                    comment = {
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    }
                    userComments.append(comment)

                except:
                    pass

            logger.info(
                json.dumps(userComments,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            data["userComments"] = userComments

            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                if flag:
                    data["supportDomain"] = domain
                else:
                    data["supportDomain"] = None
            if data.has_key("sellerUrl") and data["sellerUrl"] is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                if flag:
                    data["sellerDomain"] = domain
                else:
                    data["sellerDomain"] = None

            short_name = name_helper.get_short_name(data["trackName"])
            data["trackShortName"] = short_name
            logger.info(
                json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

            record = collection_itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
                # elif LooseVersion(data["version"]) == LooseVersion(record["version"]):
                #     data["modifyTime"] = datetime.datetime.now()
                #     collection_itunes.update_one({"_id": _id}, {'$set': data})
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert_one(data)

        except:
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Example 29
File: kr36.py Project: yujiye/Codes
def fetch_company(url):
    (company_key, ) = util.re_get_result("http://rong.36kr.com/api/company/(\d+)", url)
    logger.info("company_key=%s" % company_key)

    company_content = None
    member_contents = []
    news_contents = []
    investor_contents = []
    member_ids = []

    #company base info
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        logger.info("status_code=%d" % r.status_code)
        return r.status_code

    company_base = r.json()
    logger.info(company_base)

    if company_base["code"] != 0:
        return 404

    logger.info(company_base["data"]["company"]["name"])

    #past-finance (investment events)
    url = "http://rong.36kr.com/api/company/%s/past-finance" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_finance = r.json()

    #past-investor
    url = "http://rong.36kr.com/api/company/%s/past-investor?pageSize=100" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_investor = r.json()

    #funds (not viewable unless you are an investor)
    url = "http://rong.36kr.com/api/company/%s/funds" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    funds = r.json()

    #product
    url = "http://rong.36kr.com/api/company/%s/product" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    product = r.json()

    #past-investment
    url = "http://rong.36kr.com/api/company/%s/past-investment" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    past_investment = r.json()

    #company-fa?
    url ="http://rong.36kr.com/api/fa/company-fa?cid=%s" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    company_fa = r.json()

    #founders
    url = "http://rong.36kr.com/api/company/%s/founder?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    founders = r.json()

    #employee
    url ="http://rong.36kr.com/api/company/%s/employee?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    employees = r.json()

    #former-member
    url = "http://rong.36kr.com/api/company/%s/former-member?pageSize=1000" % company_key
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1
    former_members = r.json()

    company_content = {"date":datetime.datetime.now(), "source":source, "url":url, "company_key":company_key,
                       "company_key_int":int(company_key),
                       "company_base":company_base,
                       "past_finance":past_finance,
                       "past_investor":past_investor,
                       "funds":funds,
                       "product":product,
                       "past_investment":past_investment,
                       "company_fa":company_fa,
                       "founders":founders,
                       "employees":employees,
                       "former_members":former_members}

    #member
    for m in founders["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for m in employees["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for m in former_members["data"]["data"]:
        m_id = m["id"]
        member_ids.append(m_id)
    for v in past_investor["data"]["data"]:
        if v["entityType"] == "INDIVIDUAL":
            m_id = v["entityId"]
            member_ids.append(m_id)

    for m_id in member_ids:
        member_key = str(m_id)

        if member_collection.find_one({"source":source, "member_key":member_key}):
            continue

        #basic
        url = "http://rong.36kr.com/api/user/%s/basic" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_base = r.json()

        #past-investment
        url = "http://rong.36kr.com/api/user/%s/past-investment" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_past_investment = r.json()

        #
        url = "http://rong.36kr.com/api/user/%s/company" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_company = r.json()

        #
        url = "http://rong.36kr.com/api/user/%s/work" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_work = r.json()

        #
        url = "http://rong.36kr.com/api/p/lead-investor/%s/financing" % member_key
        time.sleep(5)
        (flag, r) = my_request.get(logger, url)
        if flag == -1:
            return -1
        member_financing = r.json()

        member_content = {"date":datetime.datetime.now(), "source":source, "url":url, "member_key":member_key,
                          "member_base":member_base,
                          "member_past_investment":member_past_investment,
                          "member_company":member_company,
                          "member_work":member_work,
                          "member_financing":member_financing}
        member_contents.append(member_content)

    #investor organization
    for e in past_finance["data"]["data"]:
        for investor in e.get("participants", []):
            investor_key = str(investor["entityId"])

            if investor_collection.find_one({"source":source, "investor_key":investor_key}):
                continue

            #base info
            url = "http://rong.36kr.com/api/organization/%s/basic" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            investor_base = r.json()

            #staffs
            url = "http://rong.36kr.com/api/organization/%s/user" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            staffs = r.json()

            #former-member
            url = "http://rong.36kr.com/api/organization/%s/former-member" % investor_key
            time.sleep(5)
            (flag, r) = my_request.get(logger, url)
            if flag == -1:
                return -1
            former_members = r.json()

            investor_content = {"date":datetime.datetime.now(), "source":source, "url":url, "investor_key":investor_key,
                                "investor_base":investor_base,
                                "staffs":staffs,
                                "former_members":former_members}

            investor_contents.append(investor_content)

    #logger.info(company_content)
    #logger.info("************")
    #logger.info(member_contents)
    #logger.info("************")
    #logger.info(investor_contents)

    #save
    if company_collection.find_one({"source":source, "company_key":company_key}) != None:
        company_collection.delete_one({"source":source, "company_key":company_key})
    company_collection.insert_one(company_content)

    for member in member_contents:
        if member_collection.find_one({"source":source, "member_key":member["member_key"]}) == None:
            member_collection.insert_one(member)

    for news in news_contents:
        if news_collection.find_one({"source":source, "company_key":company_key, "news_key":news["news_key"]}) == None:
            news_collection.insert_one(news)

    for investor in investor_contents:
        if investor_collection.find_one({"source":source, "investor_key":investor["investor_key"]}) == None:
            investor_collection.insert_one(investor)

    msg = {"type":"company", "source":source, "company_key":company_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_kr36_v2", json.dumps(msg))

    return 200
Example 30
File: 360.py Project: yujiye/Codes
def handle_app_result(response, app):
    global total

    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        #http_client.fetch(response.request.url, lambda r,app=app:handle_app_result(r, app),request_timeout=10)
        request(response.request.url,
                lambda r, app=app: handle_app_result(r, app))
        return
    else:
        logger.info(response.request.url)
        try:
            html = unicode(response.body, encoding="utf-8", errors='replace')
            #logger.info(html)
            d = pq(html)
            downloadstr = d("span.s-3").eq(0).text().replace(
                "下载:", "").replace("次", "").replace("+", "").strip()
            download = 0
            score = 0
            try:
                if downloadstr.endswith("千"):
                    download = float(downloadstr.replace("千", "")) * 1000
                elif downloadstr.endswith("万"):
                    download = float(downloadstr.replace("万", "")) * 10000
                elif downloadstr.endswith("亿"):
                    download = float(downloadstr.replace("亿",
                                                         "")) * 10000 * 10000
                else:
                    download = int(downloadstr)
                score = float(d("span.s-1").text().replace("分",
                                                           "").strip()) * 0.5
            except:
                pass

            r = "var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
            result = util.re_get_result(r, html)

            if result is not None:
                (b, ) = result
                base = json.loads(b.replace("'", '"'), strict=False)
                baike_name = base["baike_name"].strip()
                save_download(app["companyId"], app["artifactId"], download,
                              score)
                logger.info(
                    "companyId=%s, artifactId=%s, download=%s, score=%s, baike_name=%s"
                    % (app["companyId"], app["artifactId"], download, score,
                       baike_name))

                url = "http://zhushou.360.cn/search/index/?kw=%s" % urllib.quote(
                    app["name"].encode("utf-8"))
                total += 1
                #http_client.fetch(url, lambda r,app=app:handle_search_result(r, app),request_timeout=10)
                request(url, lambda r, app=app: handle_search_result(r, app))

                url = "http://intf.baike.360.cn/index.php?name=%s&c=message&a=getmessagenum" % urllib.quote(
                    baike_name.encode("utf-8"))
                total += 1
                #http_client.fetch(url, lambda r,app=app:handle_comment_result(r, app),request_timeout=10)
                request(url, lambda r, app=app: handle_comment_result(r, app))
            else:
                logger.info(html)
        except:
            traceback.print_exc()

    total -= 1

    if total <= 0:
        begin()
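
The suffix handling above normalizes 360's abbreviated download counts, and the score is halved (apparently mapping a 10-point scale to 5). For example:

# downloadstr u"3.2万" -> 3.2 * 10000 = 32000.0
# downloadstr u"1.5亿" -> 1.5 * 10000 * 10000 = 150000000.0
# score text u"9分"    -> float("9") * 0.5 = 4.5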