Ejemplo n.º 1
0
def login():
    """Log in to itjuzi.com with a randomly chosen account.

    Loops forever until a login attempt succeeds; returns True on success.
    On any error it logs the exception, sleeps 10s and retries with a
    (possibly different) random account.
    """
    retry_times = 0  # NOTE(review): never read or updated below — looks vestigial

    while True:
        try:
            # Pick a random account from the module-level login_users pool.
            idx = random.randint(0, len(login_users) - 1)
            login_user = login_users[idx]
            logger.info(login_user)

            # Keep requesting the login page on fresh HTTPS sessions until
            # we get a clean 200 (this also primes the session cookie).
            flag = -1
            while flag != 0:
                s = my_request.get_https_session(new=True, agent=True)
                (flag,
                 r) = my_request.get(logger,
                                     "https://www.itjuzi.com/user/login")
                logger.info(r.status_code)
                if flag == 0 and r.status_code != 200:
                    flag = -1
            logger.info(r.headers["Set-Cookie"])
            # Submit the credentials on the primed session.
            r = s.post("https://www.itjuzi.com/user/login",
                       data={
                           "identity": login_user["name"],
                           "password": login_user["pwd"]
                       },
                       timeout=10)
            logger.info(r.headers["Refresh"])
            # A successful login answers with a Refresh header redirecting
            # to the site root.
            if "0;url=https://www.itjuzi.com/" == r.headers["Refresh"]:
                return True
        except Exception, ex:
            logger.exception(ex)
            time.sleep(10)
        '''
Ejemplo n.º 2
0
def fetch_alexa(domain):

    alexa = trends_tool.get_alexa(domain)

    url = 'http://www.alexa.cn/index.php?url='+domain
    proxy = {'type': 'http', 'anonymity':'high', 'country': 'cn', 'ping': 5}
    while True:
        s = my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            break

    d = pq(r.text)
    data = d('script').text()
    data = ''.join(data)
    (ids, ) = util.re_get_result("showHint\('(\S*)'\);", data)
    id_arr = ids.split(',')

    domain = id_arr[0]
    while True:
        timeout = 10
        try:
            r = s.post("http://www.alexa.cn/api_150710.php",
                       data={"url": id_arr[0],
                             "sig": id_arr[1],
                             "keyt":id_arr[2]
                             },
                       timeout=timeout)
            break
        except Exception,ex:
            logger.exception(ex)
            timeout = 20
Ejemplo n.º 3
0
def query_by_domain(source_company_id, website):
    """Look up the ICP record for *website*'s registered domain and feed
    the result page to parse_query.

    Returns True when there is nothing to do (empty/deep/non-canonical
    URL, unparsable TLD, domain already recorded) or when the lookup
    completed; returns False only when the HTTP lookup itself failed, so
    the caller can retry.
    """
    if website is None or website == "":
        return True

    # Skip anything that is not a bare site root: a query string,
    # fragment or real path means this is a deep link, not a homepage.
    s = urlsplit(website)
    if s.query != '' or s.fragment != '':
        return True
    if s.path != '' and s.path != '/':
        return True

    # Only accept canonical hosts: apex domain, www. or m.
    s = tldextract.extract(website)
    if s.subdomain != "www" and s.subdomain != "m" and s.subdomain != "":
        return True

    try:
        domain = get_tld(website)
    except:
        return True

    # Domain already recorded for this company — nothing to do.
    result = conn.get("select count(*) cnt from source_domain where sourceCompanyId=%s and domain=%s", source_company_id, domain)
    if result["cnt"] > 0:
        return True

    url = "http://beian.links.cn/beian.asp?beiantype=domain&keywords=%s" % domain
    (flag, r) = my_request.get(logger, url)
    #logger.info(r.text)
    if flag != 0 or r.status_code != 200:
        return False

    parse_query(source_company_id, r.text)

    return True
Ejemplo n.º 4
0
def query_by_company_name(source_company_id, name):
    """Look up ICP records by registrant (company) name and feed the
    result page to parse_query.

    Returns True when there is nothing to do or the lookup completed;
    returns False only when the HTTP lookup failed, so the caller can
    retry.
    """
    if name is None or name == "":
        return True

    # Not a normal Chinese company name (a dot suggests a URL or such).
    if name.find(".") != -1:
        return True

    # Skip companies that already have any recorded domain.
    result = conn.get("select count(*) cnt from source_domain where sourceCompanyId=%s", source_company_id)
    if result["cnt"] > 0:
        return True

    # Normalize: drop underscores and truncate right after the last
    # occurrence of the "公司" ("Co.") suffix.
    name = name.replace("_","")
    idx = name.rfind(u"公司")
    if idx != -1:
        name = name[:(idx+len(u"公司"))]

    #url = "http://beian.links.cn/beian.asp?beiantype=zbdwmc&keywords=%s" % name
    url = "http://beian.links.cn/zbdwmc_%s.html" % name
    (flag, r) = my_request.get(logger, url)
    #logger.info(r.text)
    if flag != 0 or r.status_code != 200:
        return False

    parse_query(source_company_id, r.text)

    return True
Ejemplo n.º 5
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_finance(company_key):
    """Fetch the finance JSON document for a 36kr company.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    finance = None  # bug fix: 'finance' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/company/' + str(company_key) + '/finance'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        finance = json.loads(r.text)

    return finance
Ejemplo n.º 6
0
def get_job(company_key, page_no):
    """Page through jobtong's job-list API for one company starting at
    *page_no*, collecting raw job records.

    Stops after an empty page or once page_no reaches 10.  Returns a list
    of mongo-ready job documents.
    """
    job_contents = []
    items = ['1']  # non-empty sentinel so the loop body runs at least once
    while len(items) > 0 and page_no < 10:
        job_url = "http://www.jobtong.com/api/enterprises/%s/jobs?page=%s" % (
            company_key, page_no)
        logger.info(job_url)
        (flag, r) = my_request.get(logger, job_url)
        # NOTE(review): when the request fails (flag != 0), neither items
        # nor page_no changes, so the same page is retried indefinitely.
        if flag == 0:
            job_data = json.loads(r.text)
            job_result = job_data['items']
            if len(job_result) > 0:
                for job in job_result:
                    job_content = {
                        "date": datetime.datetime.now(),
                        "source": source,
                        "company_key": company_key,
                        "job_key": job['id'],
                        "content": job
                    }
                    job_contents.append(job_content)
            items = job_result
            page_no += 1

    return job_contents
Ejemplo n.º 7
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_status(company_key):
    """Fetch the status JSON document for a 36kr company.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    status = None  # bug fix: 'status' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/company/' + str(company_key)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        status = json.loads(r.text)

    return status
Ejemplo n.º 8
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_qichacha(company_key):
    """Fetch the qichacha (corporate-registry) JSON for a 36kr company.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    qichacha = None  # bug fix: 'qichacha' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/company/' + str(company_key) + '/qichacha'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        qichacha = json.loads(r.text)

    return qichacha
Ejemplo n.º 9
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_rong_header():
    """Fetch the 36kr SEO header/footer fragment JSON.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    header = None  # bug fix: 'header' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/p/sm/seo/fragment/header-footer'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        header = json.loads(r.text)

    return header
Ejemplo n.º 10
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_crowdfunding(cf_key):
    """Fetch the crowdfunding JSON document for a 36kr entry.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    crowdfunding = None  # bug fix: 'crowdfunding' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/p/crowd-funding/' + str(cf_key)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        crowdfunding = json.loads(r.text)

    return crowdfunding
Ejemplo n.º 11
0
def find_wechat(name, full_name):
    """Search Sogou Weixin for official accounts matching *name* and
    return those whose verified company name equals *full_name*.

    Retries (up to 99 attempts) through fresh proxy sessions whenever the
    request fails, the anti-bot interstitial appears, or no account is
    parsed from the page.
    """
    url = 'http://weixin.sogou.com/weixin?type=1&query=' + name
    cnt = 1
    while cnt < 100:
        result = []
        # Bug fix: 'find' was only assigned on some paths inside the
        # flag == 0 branch, so a failed request (or a page with no parsed
        # accounts) raised NameError at the retry check below.
        find = False
        proxy = {
            'type': 'http',
            'anonymity': 'high',
            'country': 'cn',
            'ping': 5
        }
        my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            if '您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证' in r.text:
                find = False

            else:
                d = pq(r.text)

                # NOTE(review): the loop variable below shadows the 'name'
                # parameter; harmless here because the final filter only
                # uses full_name.
                for rt in d('div.wx-rb'):
                    rt = pq(rt)
                    name = rt('.txt-box > h3').text()
                    wechat_id = rt('.txt-box > h4').text()
                    if len(rt('.s-p3')) == 3:
                        brief = rt('.s-p3:eq(0) > .sp-txt').text()
                        verify = rt('.s-p3:eq(1) > .sp-txt').text()

                        # Strip whitespace out of the display name.
                        name_str = ''
                        for n in name:
                            if n is None or n == ' ':
                                pass
                            else:
                                name_str += n

                        # Drop the leading 4 characters (presumably a
                        # label prefix before the actual id — confirm).
                        wechat_id = wechat_id[4:]

                        wechat = {
                            'name': name_str,
                            'id': wechat_id,
                            'brief': brief,
                            'verify_company_name': verify
                        }
                        result.append(wechat)

                if len(result) > 0:
                    find = True

        if not find:
            cnt += 1
        else:
            break

    # Keep only accounts verified under the exact company name.
    wechat = []
    for r in result:
        if r['verify_company_name'] == full_name:
            wechat.append(r)

    return wechat
Ejemplo n.º 12
0
def fetch_leader(cf_key):
    """Fetch the lead-investor detail page for a JD crowdfunding project.

    Returns the raw HTML, or None when the request fails or the site
    reports that the project does not exist.
    """
    detail_url = 'http://dj.jd.com/funding/leaderInverstorDetail/' + cf_key + '.html'
    flag, resp = my_request.get(logger, detail_url)
    if flag != 0:
        return None
    body = resp.text
    if u'东家温馨提示:您查询的内容不存在!' in body:
        return None
    return body
Ejemplo n.º 13
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_founder(company_key):
    """Fetch the founders JSON document for a 36kr company.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    founder = None  # bug fix: 'founder' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/company/' + str(
        company_key) + '/founder?pageSize=1000'
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        founder = json.loads(r.text)

    return founder
Ejemplo n.º 14
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_rong_overview(company_key):
    """Fetch the SEO overview-summary JSON for a 36kr company.

    Returns the parsed object, or None when the request fails (the
    original raised UnboundLocalError on that path).
    """
    overview = None  # bug fix: 'overview' was unbound when flag != 0
    url = 'https://rong.36kr.com/api/p/sm/seo/summary/rong-company-overview/' + str(
        company_key)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        overview = json.loads(r.text)

    return overview
Ejemplo n.º 15
0
def query_by_beianhao(source_company_id, beianhao):
    """Look up an ICP record number (beianhao) and feed the result page
    to parse_query.

    Returns True when there is nothing to query or the lookup completed;
    returns False only when the HTTP lookup failed, so the caller can
    retry.
    """
    if beianhao is None or beianhao == "":
        return True

    lookup_url = "http://beian.links.cn/beianhao_%s.html" % beianhao
    flag, resp = my_request.get(logger, lookup_url)
    if flag == 0 and resp.status_code == 200:
        parse_query(source_company_id, resp.text)
        return True
    return False
Ejemplo n.º 16
0
def fetch_news(url):
    """Download one news page, store it in mongo (once per source/key)
    and announce it on the pencil_news Kafka topic.

    Returns -1 on request failure, the raw HTTP status for error pages,
    302 for redirects, and 200 on success.
    """
    # The news key is whatever follows the first '=' in the url.
    news_key = url.split('=')[1]
    logger.info("news_key=%s" % news_key)

    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)

    if flag == -1:
        return -1

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        return r.status_code

    # print url
    # print r.url

    if r.url != url:
        logger.info("Page Redirect <--")
        return 302

    news_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "news_key": news_key,
        "content": r.text
    }

    # save (insert only if this source/news_key pair is new)
    if news_collection.find_one({
            "source": source,
            "news_key": news_key
    }) is None:
        news_collection.insert_one(news_content)

    msg = {"type": "direct_news", "source": source, "news_key": news_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("pencil_news", json.dumps(msg))

    return 200
Ejemplo n.º 17
0
def fetch_dj():
    """Crawl the JD crowdfunding front page: follow every project detail
    link, then collect the short descriptions from the .show-text teaser
    blocks.
    """
    url = 'http://dj.jd.com/'
    threads = []  # NOTE(review): never used in the visible body
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        d = pq(r.text)
        # Every anchor that points at a project detail page.
        links = d('a')
        for link in links:
            link = pq(link)
            link = link.attr('href')
            if link is not None and '/funding/details/' in link:
                print link
                fetch_project(link)

        # Teaser blocks carry the project link plus a title-attr blurb.
        divs = d('.show-text')
        for div in divs:
            div = pq(div)
            link = div('a').attr('href')
            desc = div('p').attr('title')
            fetch_desc(link, desc)
Ejemplo n.º 18
0
Archivo: kr36.py Proyecto: yujiye/Codes
def login():
    """Log in to 36kr with a randomly chosen account.

    Loops until a password login succeeds AND the identity API confirms
    an authenticated session (result code 4031).
    """
    while True:
        # Pick a random account from the module-level login_users pool.
        idx = random.randint(0, len(login_users)-1)
        login_user = login_users[idx]
        logger.info(login_user)

        # Fresh plain-HTTP session per attempt.
        s = my_request.get_http_session(new=True, agent=False)
        data = {
            "type":"login",
            "bind":False,
            "needCaptcha":False,
            "username":login_user["name"],
            "password":login_user["pwd"],
            "ok_url":"/"

        }
        headers = {
            "Referer":"http://passport.36kr.com"
        }

        try:
            r = s.post("http://passport.36kr.com/passport/sign_in",data=data, headers=headers, timeout=10)
            logger.info(r.text)
        except:
            continue

        if r.status_code != 200:
            continue

        # A successful sign-in answers with this exact redirect payload.
        if r.text.strip() != '{"redirect_to":"/"}':
            continue

        # Double-check the session via the identity endpoint.
        (flag, r) = my_request.get(logger,"http://uc.36kr.com/api/user/identity")
        if flag == 0 and r is not None and r.status_code==200:
            result = r.json()
            logger.info(result)
            if result["code"] == 4031:
                break
Ejemplo n.º 19
0
def fetch_project(url):
    """Crawl one JD crowdfunding project detail page plus its satellite
    endpoints (support count, focus, team, lead investor, BP), upsert
    the bundle into cf_collection and announce it on Kafka.
    """
    (cf_key, ) = util.re_get_result("http://dj.jd.com/funding/details/(\d+).html", url)
    (flag, r) = my_request.get(logger, url)
    if flag == 0:
        html = r.text
        support = fetch_support(cf_key)
        focus = fetch_focus(url, cf_key)
        team = fetch_team(cf_key)
        leader = fetch_leader(cf_key)

        bp = fetch_bp(html, url, cf_key)


        content = {'html': html,
                   'team': team,
                   'support': support,
                   'focus': focus
                   }

        # The project id doubles as the company key for this source.
        project = {"date":datetime.datetime.now(),
                   "source":source,
                   "url":url,
                   "company_key": cf_key,
                   "cf_key":cf_key,
                   "content":content,
                   'leader': leader,
                   'bp': bp
                   }

        # Upsert: replace an existing snapshot, otherwise insert.
        result = cf_collection.find_one({"source":source, "company_key":cf_key, 'cf_key': cf_key})
        if result != None:
            cf_collection.replace_one({'_id': result['_id']}, project)
        else:
            cf_collection.insert_one(project)

        msg = {"type":"cf", "source":source, "cf_key":cf_key}
        logger.info(json.dumps(msg))
        kafka_producer.send_messages("crawler_cf_jd_v2", json.dumps(msg))
Ejemplo n.º 20
0
def fetch(url):
    """Download one itjuzi album page and store it in mongo, replacing
    any previously saved copy.

    Returns -1 on request failure, the raw HTTP status for error pages,
    302 for redirects, and 200 on success.
    """
    (key, ) = util.re_get_result("https://itjuzi.com/album/(\d+)", url)
    logger.info("key=%s" % key)

    flag, resp = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1

    status = resp.status_code
    if status == 404:
        logger.info("Page Not Found!!!")
        return status
    if status != 200:
        return status
    if resp.url != url:
        logger.info("Page Redirect <--")
        return 302

    record = {
        "date": datetime.datetime.now(),
        "url": url,
        "key": key,
        "content": resp.text
    }

    # Replace any earlier snapshot of this album.
    if collection.find_one({"key": key}) is not None:
        collection.delete_one({"key": key})
    collection.insert_one(record)

    # msg = {"type":"itjuzi_album", "key":key}
    # logger.info(json.dumps(msg))
    # kafka_producer.send_messages("itjuzi_album", json.dumps(msg))

    return 200
Ejemplo n.º 21
0
def fetch_support(cf_key):
    """Return the raw support-count payload for a JD crowdfunding
    project, or None when the request fails."""
    query_url = ('http://dj.jd.com/funding/selectSupportCount.action?projectId='
                 + cf_key
                 + '&minimumAmount=100,000&silkmumAmount=10,000')
    flag, resp = my_request.get(logger, query_url)
    if flag != 0:
        return None
    return resp.text
Ejemplo n.º 22
0
def fetch_team(cf_key):
    """Return the raw project-team payload for a JD crowdfunding
    project, or None when the request fails."""
    team_url = "http://dj.jd.com/funding/findProjectTeam.action?projectId=" + cf_key
    flag, resp = my_request.get(logger, team_url)
    if flag != 0:
        return None
    return resp.text
Ejemplo n.º 23
0
def fetch_company(url):
    """Crawl one jobtong company page together with its job listings and
    save both to mongo, then announce the company on Kafka.

    Returns 200 when saved (or already present), -1 on request failure,
    the raw HTTP status for error pages, and 302 for redirects or
    companies that list no jobs.
    """
    (company_key, ) = util.re_get_result("http://www.jobtong.com/e/(\d+)", url)

    # Already crawled — nothing to do.
    if company_collection.find_one({
            "source": source,
            "company_key": company_key
    }) != None:
        return 200

    logger.info("company_key=%s" % company_key)

    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)

    if flag == -1:
        return -1

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        return r.status_code

    if r.url != url:
        logger.info("Page Redirect <--")
        return 302

    company_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "company_key_int": int(company_key),
        "content": r.text
    }

    # NOTE(review): parsed but never used below — likely a leftover.
    doc = lxml.html.fromstring(r.text)

    #company invalid

    #job listings (starting from page 1)
    job_contents = get_job(company_key, 1)

    if len(job_contents) > 0:
        # save: replace any existing company snapshot
        if company_collection.find_one({
                "source": source,
                "company_key": company_key
        }) != None:
            company_collection.delete_one({
                "source": source,
                "company_key": company_key
            })
        company_collection.insert_one(company_content)

        # insert each job only if not already present
        for job in job_contents:
            if job_collection.find_one({
                    "source": source,
                    "company_key": company_key,
                    "news_key": job["job_key"]
            }) == None:
                job_collection.insert_one(job)

        msg = {"type": "company", "source": source, "company_key": company_key}
        logger.info(json.dumps(msg))
        kafka_producer.send_messages("crawler_recruit_jobtong",
                                     json.dumps(msg))
        return 200

    else:
        return 302
Ejemplo n.º 24
0
Archivo: 36kr.py Proyecto: yujiye/Codes
def fetch_cf(data):
    """Crawl one 36kr crowdfunding entry: its page plus the company's
    satellite endpoints, upsert the bundle into cf_collection and
    announce it on Kafka.

    *data* is a dict carrying at least 'id' (cf key) and 'company_id'.
    Returns -1 on request failure, the raw HTTP status for error pages,
    and implicitly None on success.
    """
    # sleep_time = random.randint(10, 30)
    # time.sleep(sleep_time)

    cf_key = data['id']
    company_key = data['company_id']

    logger.info("cf_key=%s" % cf_key)

    url = 'https://rong.36kr.com/company/' + str(
        company_key) + '/crowFunding?fundingId=' + str(cf_key)

    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)

    if flag == -1:
        return -1

    if r.status_code == 404:
        return r.status_code

    if r.status_code != 200:
        return r.status_code

    html = r.text

    # Satellite documents fetched from the 36kr API.
    finance = fetch_finance(company_key)
    crowdfunding = fetch_crowdfunding(cf_key)
    overview = fetch_rong_overview(company_key)
    # header = fetch_rong_header()
    qichacha = fetch_qichacha(company_key)
    founder = fetch_founder(company_key)
    status = fetch_status(company_key)

    content = {
        'html': html,
        'finance': finance,
        'crowdfunding': crowdfunding,
        'overview': overview,
        # 'header': header,
        'qichacha': qichacha,
        'founder': founder,
        'status': status
    }

    cf_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "cf_key": cf_key,
        "content": content
    }

    # Upsert: replace an existing snapshot, otherwise insert.
    result = cf_collection.find_one({
        "source": source,
        "company_key": company_key,
        'cf_key': cf_key
    })
    if result != None:
        cf_collection.replace_one({'_id': result['_id']}, cf_content)
    else:
        cf_collection.insert_one(cf_content)

    msg = {"type": "cf", "source": source, "cf_key": cf_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_cf_36kr_v2", json.dumps(msg))
Ejemplo n.º 25
0
                r"http://www.itjuzi.com/overview/news/(\d*)$", news_url)
            (news_year, news_month, news_day) = util.re_get_result(
                r'^(\d*)[^\d]*(\d*)[^\d]*(\d*)[^\d]*', news_date_str)
            logger.info(news_title)
            logger.info(news_url)
            logger.info(news_source_domain)
            logger.info(news_date_str)
            logger.info("%s-%s-%s" % (news_year, news_month, news_day))
            logger.info(news_key)

            if news_collection.find_one({
                    "source": source,
                    "company_key": company_key,
                    "news_key": news_key
            }) == None:
                (flag, r) = my_request.get(logger, news_url)
                if flag == -1:
                    continue
                if r.status_code != 200:
                    continue
                #logger.info(r.text)
                f = pq(r.text)
                url = f('iframe').attr("src").strip()
                (flag, r) = my_request.get_no_sesion(logger, url)
                if flag == -1:
                    continue
                if r.status_code != 200:
                    continue
                #logger.info(r.text)

                news_contents.append({
Ejemplo n.º 26
0
Archivo: kr36.py Proyecto: yujiye/Codes
_FETCH_FAILED = object()  # sentinel distinguishing "request failed" from a JSON null


def _rong_get_json(api_url):
    """Politely (5s pause) GET a rong.36kr.com API url.

    Returns the parsed JSON body, or the _FETCH_FAILED sentinel when the
    request layer reports failure (flag == -1).
    """
    time.sleep(5)
    (flag, r) = my_request.get(logger, api_url)
    if flag == -1:
        return _FETCH_FAILED
    return r.json()


def fetch_company(url):
    """Crawl a 36kr (rong) company: base info, its finance/investor/
    product endpoints, each member's profile and each participating
    investor organization; save the bundle to mongo and announce it on
    Kafka.

    Returns -1 when any request fails, the HTTP status for error pages,
    404 when the API answers with a non-zero code, and 200 on success.
    """
    (company_key, ) = util.re_get_result("http://rong.36kr.com/api/company/(\d+)", url)
    logger.info("company_key=%s" % company_key)

    company_content = None
    member_contents = []
    news_contents = []  # NOTE(review): never populated here; its save loop below is a no-op
    investor_contents = []
    member_ids = []

    # Company base info (the only request whose HTTP status is inspected).
    time.sleep(5)
    (flag, r) = my_request.get(logger, url)
    if flag == -1:
        return -1

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        logger.info("status_code=%d" % r.status_code)
        return r.status_code

    company_base = r.json()
    logger.info(company_base)

    if company_base["code"] != 0:
        return 404

    logger.info(company_base["data"]["company"]["name"])

    # Satellite company endpoints.  The repeated sleep/GET/flag-check
    # blocks of the original were folded into _rong_get_json.

    #past-finance (investment events)
    url = "http://rong.36kr.com/api/company/%s/past-finance" % company_key
    past_finance = _rong_get_json(url)
    if past_finance is _FETCH_FAILED:
        return -1

    #past-investor
    url = "http://rong.36kr.com/api/company/%s/past-investor?pageSize=100" % company_key
    past_investor = _rong_get_json(url)
    if past_investor is _FETCH_FAILED:
        return -1

    #funds (only investors have permission to view this)
    url = "http://rong.36kr.com/api/company/%s/funds" % company_key
    funds = _rong_get_json(url)
    if funds is _FETCH_FAILED:
        return -1

    #product
    url = "http://rong.36kr.com/api/company/%s/product" % company_key
    product = _rong_get_json(url)
    if product is _FETCH_FAILED:
        return -1

    #past-investment
    url = "http://rong.36kr.com/api/company/%s/past-investment" % company_key
    past_investment = _rong_get_json(url)
    if past_investment is _FETCH_FAILED:
        return -1

    #company-fa
    url = "http://rong.36kr.com/api/fa/company-fa?cid=%s" % company_key
    company_fa = _rong_get_json(url)
    if company_fa is _FETCH_FAILED:
        return -1

    #founders
    url = "http://rong.36kr.com/api/company/%s/founder?pageSize=1000" % company_key
    founders = _rong_get_json(url)
    if founders is _FETCH_FAILED:
        return -1

    #employee
    url = "http://rong.36kr.com/api/company/%s/employee?pageSize=1000" % company_key
    employees = _rong_get_json(url)
    if employees is _FETCH_FAILED:
        return -1

    #former-member
    url = "http://rong.36kr.com/api/company/%s/former-member?pageSize=1000" % company_key
    former_members = _rong_get_json(url)
    if former_members is _FETCH_FAILED:
        return -1

    # NOTE(review): "url" here is the last endpoint fetched (former-member),
    # preserved from the original behaviour.
    company_content = {"date": datetime.datetime.now(), "source": source, "url": url, "company_key": company_key,
                       "company_key_int": int(company_key),
                       "company_base": company_base,
                       "past_finance": past_finance,
                       "past_investor": past_investor,
                       "funds": funds,
                       "product": product,
                       "past_investment": past_investment,
                       "company_fa": company_fa,
                       "founders": founders,
                       "employees": employees,
                       "former_members": former_members}

    # Collect every person attached to the company: founders, employees,
    # former members, and individual past investors.
    for m in founders["data"]["data"]:
        member_ids.append(m["id"])
    for m in employees["data"]["data"]:
        member_ids.append(m["id"])
    for m in former_members["data"]["data"]:
        member_ids.append(m["id"])
    for v in past_investor["data"]["data"]:
        if v["entityType"] == "INDIVIDUAL":
            member_ids.append(v["entityId"])

    for m_id in member_ids:
        member_key = str(m_id)

        # Skip members already crawled.
        if member_collection.find_one({"source": source, "member_key": member_key}):
            continue

        #basic
        url = "http://rong.36kr.com/api/user/%s/basic" % member_key
        member_base = _rong_get_json(url)
        if member_base is _FETCH_FAILED:
            return -1

        #past-investment
        url = "http://rong.36kr.com/api/user/%s/past-investment" % member_key
        member_past_investment = _rong_get_json(url)
        if member_past_investment is _FETCH_FAILED:
            return -1

        #company
        url = "http://rong.36kr.com/api/user/%s/company" % member_key
        member_company = _rong_get_json(url)
        if member_company is _FETCH_FAILED:
            return -1

        #work
        url = "http://rong.36kr.com/api/user/%s/work" % member_key
        member_work = _rong_get_json(url)
        if member_work is _FETCH_FAILED:
            return -1

        #financing
        url = "http://rong.36kr.com/api/p/lead-investor/%s/financing" % member_key
        member_financing = _rong_get_json(url)
        if member_financing is _FETCH_FAILED:
            return -1

        member_content = {"date": datetime.datetime.now(), "source": source, "url": url, "member_key": member_key,
                          "member_base": member_base,
                          "member_past_investment": member_past_investment,
                          "member_company": member_company,
                          "member_work": member_work,
                          "member_financing": member_financing}
        member_contents.append(member_content)

    # Investor organizations that participated in past financing events.
    for e in past_finance["data"]["data"]:
        for investor in e.get("participants", {}):
            investor_key = str(investor["entityId"])

            if investor_collection.find_one({"source": source, "investor_key": investor_key}):
                continue

            #base info
            url = "http://rong.36kr.com/api/organization/%s/basic" % investor_key
            investor_base = _rong_get_json(url)
            if investor_base is _FETCH_FAILED:
                return -1

            #staffs
            url = "http://rong.36kr.com/api/organization/%s/user" % investor_key
            staffs = _rong_get_json(url)
            if staffs is _FETCH_FAILED:
                return -1

            #former-member
            url = "http://rong.36kr.com/api/organization/%s/former-member" % investor_key
            former_members = _rong_get_json(url)
            if former_members is _FETCH_FAILED:
                return -1

            investor_content = {"date": datetime.datetime.now(), "source": source, "url": url, "investor_key": investor_key,
                                "investor_base": investor_base,
                                "staffs": staffs,
                                "former_members": former_members}

            investor_contents.append(investor_content)

    # Save: replace any existing company snapshot; insert members/news/
    # investors only when absent.
    if company_collection.find_one({"source": source, "company_key": company_key}) is not None:
        company_collection.delete_one({"source": source, "company_key": company_key})
    company_collection.insert_one(company_content)

    for member in member_contents:
        if member_collection.find_one({"source": source, "member_key": member["member_key"]}) is None:
            member_collection.insert_one(member)

    for news in news_contents:
        if news_collection.find_one({"source": source, "company_key": company_key, "news_key": news["news_key"]}) is None:
            news_collection.insert_one(news)

    for investor in investor_contents:
        if investor_collection.find_one({"source": source, "investor_key": investor["investor_key"]}) is None:
            investor_collection.insert_one(investor)

    msg = {"type": "company", "source": source, "company_key": company_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_kr36_v2", json.dumps(msg))

    return 200
Ejemplo n.º 27
0
Archivo: 36kr.py Proyecto: yujiye/Codes
    return False


if __name__ == "__main__":
    (logger, mongo, kafka_producer, company_collection, member_collection, news_collection, cf_collection) \
        = spider_util.spider_cf_init('jd')

    login()

    flag = True
    while flag:
        i = 1
        url = 'https://rong.36kr.com/api/p/crowd-funding?page=' + str(
            i) + '&per_page=100&status=all'
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            data = json.loads(r.text)
            data = data['data']
            last_page = data['last_page']
            if last_page > 1:
                for i in xrange(0, last_page):
                    url = 'https://rong.36kr.com/api/p/crowd-funding?page=' + str(
                        i) + '&per_page=100&status=all'
                    (flag, r) = my_request.get(logger, url)
                    if flag == 0:
                        data = json.loads(r.text)['data']['data']
                        for d in data:
                            fetch_cf(d)
                flag = False
            else:
Ejemplo n.º 28
0
def _get(url, cleanup):
    """Fetch *url* via my_request and run *cleanup* over the payload.

    Returns None when the fetch yields nothing.
    """
    payload = my_request.get(url, True)
    return None if payload == None else cleanup(payload)
Ejemplo n.º 29
0
def find_weibo(name):
    """Search s.weibo.com for organization-verified accounts matching
    *name*.

    Retries (up to 99 attempts) through fresh proxy sessions until a
    result page free of the captcha interstitial is obtained, then parses
    the person list and returns the accounts whose display name or
    verified company name equals *name*.
    """
    url = 'http://s.weibo.com/user/&work=' + name
    cnt = 1
    while cnt < 100:
        result = []
        # Bug fix: 'find' was only assigned inside the flag == 0 branch,
        # so a failed request raised NameError at the retry check below.
        find = False
        proxy = {
            'type': 'http',
            'anonymity': 'high',
            'country': 'cn',
            'ping': 5
        }
        my_request.get_single_session(proxy, new=True, agent=False)
        (flag, r) = my_request.get(logger, url)
        if flag == 0:
            d = pq(r.text)

            find = True
            # Weibo ships page content inside STK.pageletM.view(...) JSON
            # blobs embedded in <script> tags; collect each html payload.
            for s in d('script'):
                s = pq(s).text()
                s = ''.join(s)
                if 'STK && STK.pageletM && STK.pageletM.view' in s:
                    s = s.replace('STK && STK.pageletM && STK.pageletM.view(',
                                  '')
                    s = s[0:len(s) - 1]
                    data = json.loads(s)
                    html = data['html']

                    if '你的行为有些异常,请输入验证码' in html:
                        find = False

                    result.append(html)

        if not find or len(result) == 0:
            cnt += 1
        else:
            break

    # The person list lives in the third pagelet chunk.
    # NOTE(review): assumes the search eventually succeeded and produced
    # at least 3 chunks; an exhausted retry loop still falls through here
    # and may raise IndexError — confirm intended behaviour.
    data = result[2]

    d = pq(data)

    result = []
    for li in d('div.list_person'):
        li = pq(li)
        verify = li('p.person_name > a:eq(1)').attr('title')
        if verify == '微博机构认证':
            # Bug fix: the display name previously shadowed the 'name'
            # parameter, corrupting the final comparison at the bottom.
            person_name = li('p.person_name > a:eq(0)').text()
            uid = li('p.person_name > a:eq(0)').attr('uid')
            location = li('p.person_addr > span:eq(1)').text()
            link = li('p.person_addr > a').text()

            follow = li('p.person_num > span:eq(0) > a').text()
            fans = li('p.person_num > span:eq(1) > a').text()
            publish = li('p.person_num > span:eq(2) > a').text()

            desc = li('div.person_info > p').text().replace('简介:', '').strip()
            tags = []
            for tag in li('p.person_label:eq(0) > a'):
                tag = pq(tag).text()
                tags.append(tag)

            tags = ','.join(tags)

            verify_company_name = li('person_label:eq(1) > a').text()

            account = {
                'name': person_name,
                'uid': uid,
                'location': location,
                'link': link,
                'follow': follow,
                'fans': fans,
                'publish': publish,
                'desc': desc,
                'tags': tags,
                'verify_company_name': verify_company_name
            }
            result.append(account)

    # Keep accounts matching the original search term.
    weibo = []
    for r in result:
        if r['verify_company_name'] == name or r['name'] == name:
            weibo.append(r)

    return weibo
Ejemplo n.º 30
0
def fetch_job(url):
    """Fetch one neitui.me job-detail page, store it (and the company
    page, if not seen before) in MongoDB, then notify downstream
    consumers via Kafka.

    Returns an HTTP-like status code:
        -1  -- request failed (either the job or the company fetch)
        404 -- page not found
        302 -- the request was redirected (posting gone/moved)
        other non-200 -- raw HTTP status from the server
        200 -- page fetched (stored, or skipped when no company link)
    """
    # The job key is the last "="-separated token of the URL
    # (e.g. ".../job?jobid=12345" -> "12345").
    urlarr = url.split("=")
    job_key = urlarr[len(urlarr) - 1]
    logger.info("job_key=%s" % job_key)

    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)

    if flag == -1:
        return -1

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        return r.status_code

    # A redirect means the posting no longer lives at this URL; skip it.
    if r.url != url:
        logger.info("Page Redirect <--")
        return 302

    doc = lxml.html.fromstring(r.text)
    company_url = doc.xpath('//div[@class="c_name"]/a/@href')
    if len(company_url) == 0:
        # No company link on the page; nothing to store.
        return 200

    company_url = company_url[0]
    (company_key, ) = util.re_get_result(r"/company/detail/domain=(\S+).html",
                                         company_url)
    logger.info(company_key)

    # job_key is stored as an int; keep one canonical form so the
    # dedup query below can actually match stored documents.
    job_key_int = int(job_key)
    job_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "job_key": job_key_int,
        "content": r.text
    }

    # BUG FIX: the old dedup query used the *string* job_key while
    # documents were inserted with the int form, so the lookup never
    # matched and every crawl inserted a duplicate. Query with the int.
    result = job_collection.find_one({
        "source": source,
        "company_key": company_key,
        "job_key": job_key_int
    })
    if result is None:
        job_collection.insert_one(job_content)
    # else:
    #     job_collection.replace_one({"_id":result["_id"]}, job_content)

    # Crawl and store the company page only the first time it is seen.
    if company_collection.find_one({
            "source": source,
            "company_key": company_key
    }) is None:
        company_url = 'http://www.neitui.me/' + company_url
        (flag, r) = my_request.get(logger, company_url)
        if flag == -1:
            return -1
        company_content = {
            "date": datetime.datetime.now(),
            "source": source,
            "url": company_url,
            "company_key": company_key,
            "content": r.text
        }
        company_collection.insert_one(company_content)

    # Notify downstream parsers. job_key stays a string here so the
    # message format seen by existing consumers is unchanged.
    msg = {"type": "job", "source": source, "job_key": job_key}
    logger.info(json.dumps(msg))
    kafka_producer.send_messages("crawler_recruit_neitui", json.dumps(msg))

    return 200
Ejemplo n.º 31
0
def fetch_company(url):
    (company_key, ) = util.re_get_result("http://www.itjuzi.com/company/(\d+)",
                                         url)
    logger.info("company_key=%s" % company_key)

    company_content = None
    member_contents = []
    news_contents = []
    investor_contents = []

    (flag, r) = my_request.get(logger, url)
    logger.info("flag=%d", flag)
    if flag == -1:
        return -1

    logger.info("status=%d", r.status_code)

    if r.status_code == 404:
        logger.info("Page Not Found!!!")
        return r.status_code

    if r.status_code != 200:
        #logger.info(r.status_code)
        return r.status_code

    #logger.info(r.text)
    d = pq(r.text)
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    if product_name == "":
        return 404

    company_content = {
        "date": datetime.datetime.now(),
        "source": source,
        "url": url,
        "company_key": company_key,
        "company_key_int": int(company_key),
        "content": r.text
    }

    #members
    lis = d('h4.person-name> a.title')
    for li in lis:
        try:
            l = pq(li)
            href = l.attr("href").strip()
            logger.info(href)
            member_name = l('b> span.c').text().strip()
            (member_key, ) = util.re_get_result(
                r'http://www.itjuzi.com/person/(\d*?)$', href)
            logger.info("member_key=%s, member_name=%s" %
                        (member_key, member_name))

            href = href.replace("http://", "https://")
            #if member_collection.find_one({"source":source, "member_key":member_key}) == None:
            if 1 == 1:
                flag = -1
                while flag != 0:
                    (flag, r) = my_request.get(logger, href)
                    if flag == 0 and r.status_code != 200:
                        flag = -1
                    if flag != 0:
                        my_request.get_https_session(new=True, agent=True)
                #logger.info(r.text)
                member_contents.append({
                    "date": datetime.datetime.now(),
                    "source": source,
                    "url": url,
                    "member_key": member_key,
                    "member_name": member_name,
                    "content": r.text
                })
        except Exception, ex:
            logger.exception(ex)