# Vague amount phrases (compared after all whitespace has been removed) paired
# with the canonical amount string that replaces them before parsing.
_VAGUE_AMOUNTS = (
    ("1000万人民币", ("超千万人民币", "千万人民币", "近千万人民币", "过千万人民币",
                   "上千万人民币", "1千万人民币")),
    ("1亿人民币", ("超亿人民币", "近亿人民币", "过亿人民币", "上亿人民币", "亿人民币",
                "一亿人民币", "亿人民币及以上人民币")),
    ("1000万美元", ("超千万美元", "千万美元", "近千万美元", "过千万美元", "上千万美元",
                 "1千万美元")),
    ("100万美元", ("百万美元", "近百万美元", "过百万美元", "上百万美元", "1百万美元")),
    ("100万人民币", ("百万人民币", "近百万人民币", "过百万人民币", "上百万人民币",
                  "1百万人民币")),
)


def insert_funding(sid, roundstr, inv, fundingDate, investor):
    """Normalise one funding record and save it together with its investor.

    The amount text has all whitespace removed and known vague phrases are
    replaced by a canonical amount; round labels missing their leading "P"
    are repaired.  The cleaned values are parsed by ``itjuzi_helper`` and the
    result is stored through ``parser_db_util.save_funding_standard``.

    This is best-effort: any failure is logged (with traceback) and
    swallowed, so the function always returns ``None``.

    Args:
        sid: Value stored as ``sourceCompanyId`` on the funding record.
        roundstr: Raw round label.
        inv: Raw investment amount text.
        fundingDate: Value stored as ``fundingDate`` on the funding record.
        investor: Investor name; also hashed with ``util.md5str`` to form the
            investor's ``sourceId``.

    Note:
        ``download_crawler`` is not a parameter; it is read from module scope.
    """
    try:
        # Drop every whitespace character so the alias lookup is exact.
        inv = "".join(inv.split())
        for canonical, aliases in _VAGUE_AMOUNTS:
            if inv in aliases:
                inv = canonical
                break

        # Repair round labels whose leading "P" was lost.
        if roundstr == "re-A轮":
            roundstr = "Pre-A"
        elif roundstr == "re-IPO":
            roundstr = "Pre-IPO"

        fundingRound, roundStr = itjuzi_helper.getFundingRound(
            unicode(roundstr))
        currency, investment, precise = itjuzi_helper.getMoney(unicode(inv))

        source_funding = {
            "sourceCompanyId": sid,
            "preMoney": None,
            "postMoney": None,
            "investment": investment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": currency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }
        source_investors = [{
            "name": investor,
            "website": None,
            "description": None,
            "logo_url": None,
            "stage": None,
            "field": None,
            "type": 10020,
            "source": 13100,
            "sourceId": util.md5str(investor)
        }]

        parser_db_util.save_funding_standard(source_funding, download_crawler,
                                             source_investors)
    except Exception:
        # Log the offending record and carry on with the next one.
        logger.exception("%s/%s/%s/%s", roundstr, inv, fundingDate, investor)
def parse(item):
    """Extract one funding event from a crawled funding page.

    The company is identified by the last path segment of the ``a.name``
    link and looked up in the ``source_company`` table for ``SOURCE``.

    Args:
        item: Mapping holding the page markup under ``"content"``, or
            ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or the company has no
        ``source_company`` row yet; ``-1`` when the page has no ``a.name``
        link; otherwise a dict with the keys ``sourceCompanyId``,
        ``fundingDate``, ``fundingRound``, ``roundStr``, ``currency``,
        ``investment``, ``precise`` and ``investors``.

    Raises:
        ValueError: If the date on the page cannot be converted even after
            the clamping below (raised by ``int`` or ``strptime``).
    """
    if item is None:
        return None

    d = pq(item["content"])
    logger.info("*** funding ***")

    company_href = d("a.name").attr("href")
    if company_href is None:
        return -1

    company_key = company_href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    conn = db.connect_torndb()
    try:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
    finally:
        # Release the connection even when the lookup raises.
        conn.close()

    if source_company is None:
        logger.info("this source company doesn't exist yet")
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    date_text = d(
        'div.block> div.titlebar-center> p> span.date').text().strip()
    date_parts = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', date_text)
    fundingDate = None
    if date_parts is not None:
        (year, month, day) = date_parts

        # Years 2100-2109 are remapped to 2010-2019 (presumably a digit-swap
        # typo in the source data).
        y = int(year)
        if 2100 <= y <= 2109:
            year = 2010 + y % 10

        m = int(month)
        if m > 12:
            m = 12
            month = "12"

        # Clamp the day to the last valid day of the month.  isRunnian
        # decides between 29 and 28 for February (presumably a leap-year
        # test).
        if m in (4, 6, 9, 11):
            last_day = 30
        elif m == 2:
            last_day = 29 if itjuzi_helper.isRunnian(int(year)) else 28
        else:
            last_day = 31
        if int(day) > last_day:
            day = last_day

        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = d('span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d('span.fina').text().strip()
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s", currency, investment, precise)

    investors = []
    for node in d('h4.person-name> b >a.title'):
        link = pq(node)
        investor_name = link.text().strip()
        if investor_name == "":
            continue
        investor_url = link.attr("href")
        if investor_url is not None and investor_url != "":
            # Linked investor: the key is the last path segment of the URL.
            investor_key = investor_url.strip().split("/")[-1]
            investors.append({
                "name": investor_name,
                "key": investor_key,
                "url": investor_url,
                "type": 38001
            })
            logger.info("Investor: %s, %s, %s", investor_key,
                        investor_name, investor_url)
        else:
            # Unlinked text may list several investors separated by ";".
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name,
                            investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
def parseFinance_save(source_company_id, item, sourceId, download_crawler):
    """Parse the funding table of a crawled page and save each row.

    Every ``.funding-info tbody tr`` row is turned into a funding record
    plus the investors linked from its ``.investor`` cell, and passed to
    ``parser_db_util.save_funding_standard``.  A failing save is logged and
    the remaining rows are still processed.

    Args:
        source_company_id: Value stored as ``sourceCompanyId`` on each record.
        item: Mapping whose ``"content"`` entry holds the UTF-8 encoded page
            bytes, or ``None`` (nothing is done).
        sourceId: Identifier used only in log messages.
        download_crawler: Passed through to ``save_funding_standard``.

    Returns:
        Always ``None``.

    Raises:
        ValueError: If a row's ``.date`` text is not ``YYYY-MM-DD``.
        SystemExit: Via ``exit()`` when an amount has neither a "¥ " nor a
            "$ " marker, or when the parsed amount is 0.
    """
    logger.info("parseFinance_save")
    if item is None:
        return None

    d = pq(html.fromstring(item['content'].decode("utf-8")))

    for finance in d('.funding-info tbody tr'):
        row = d(finance)

        roundStr = row('td:nth-child(1)').text()
        fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)

        # Rewrite the currency symbol prefix as the suffix that
        # itjuzi_helper.getMoney is given elsewhere in this file.
        fundingInvestment = row('.amount').text()
        if '¥ ' in fundingInvestment:
            fundingInvestment = fundingInvestment.replace('¥ ', '') + '人民币'
        elif '$ ' in fundingInvestment:
            fundingInvestment = fundingInvestment.replace('$ ', '') + '美元'
        else:
            logger.info('not RMB:%s %s', sourceId, fundingInvestment)  # todo
            # NOTE(review): terminates the whole process on an unknown
            # currency marker -- confirm this hard stop is still wanted.
            exit()

        fundingCurrency, fundingInvestment, precise = itjuzi_helper.getMoney(
            fundingInvestment)

        fundingDate = datetime.datetime.strptime(
            row('.date').text(), '%Y-%m-%d')

        source_funding = {
            "sourceCompanyId": source_company_id,
            "preMoney": None,
            "postMoney": None,
            "investment": fundingInvestment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": fundingCurrency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }

        if fundingInvestment == 0:
            logger.info("new invest case: %s", sourceId)
            # NOTE(review): same hard stop as above for an unparsed amount.
            exit()
        logger.info("%s, %s, %s, %s", roundStr, fundingRound,
                    fundingInvestment, fundingCurrency)

        source_investors = []
        for investor in row('.investor a'):
            link = d(investor)
            entityName = link.text().strip()
            logger.info(entityName)
            # The investor id is whatever follows "startups/" in the link.
            entityId = str(link.attr('href').split('startups/')[-1])
            source_investors.append({
                "name": entityName,
                "website": None,
                "description": None,
                "logo_url": None,
                "stage": None,
                "field": None,
                "type": 10020,
                "source": SOURCE,
                "sourceId": entityId
            })

        logger.info(
            json.dumps(source_investors,
                       ensure_ascii=False,
                       cls=util.CJsonEncoder))
        try:
            parser_db_util.save_funding_standard(source_funding,
                                                 download_crawler,
                                                 source_investors)
        except Exception:
            # Keep going with the remaining rows, but leave a trace of the
            # failure instead of discarding it silently.
            logger.exception("failed to save funding for %s", sourceId)
# Example #4
# 0
def parse(item):
    """Build a funding record from an already-decoded funding payload.

    Args:
        item: Mapping with a ``"key"`` entry (logged only) and a
            ``"content"`` mapping that provides ``com_id``, ``date``
            (``YYYY-MM-DD``), ``round``, ``money``, ``currency`` and,
            optionally, a dict under ``invsest_with`` whose values each
            carry an ``invst_name``.  May be ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or when the company is not found
        by ``parser_db_util.get_company`` (its integer key is then recorded
        in the module-level ``nokeys`` list); otherwise a dict with the keys
        ``sourceCompanyId``, ``fundingDate``, ``fundingRound``, ``roundStr``,
        ``currency``, ``investment``, ``precise`` and ``investors``.

    Raises:
        ValueError: If ``date`` does not match ``YYYY-MM-DD``.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)
    data = item["content"]
    logger.info("*** funding ***")

    company_key = data["com_id"]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(13030, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        # Remember the missing company once.
        missing_key = int(company_key)
        if missing_key not in nokeys:
            nokeys.append(missing_key)
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)
    fundingDate = datetime.datetime.strptime(data["date"], '%Y-%m-%d')
    logger.info(fundingDate)

    fundingRound, roundStr = itjuzi_helper.getFundingRound(data["round"])
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = data["money"] + data["currency"]
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s", currency, investment, precise)

    investors = []
    co_investors = data.get("invsest_with") if "invsest_with" in data else None
    if isinstance(co_investors, dict):
        for entry in co_investors.values():
            investor_name = entry["invst_name"]
            if investor_name == "" or investor_name == "未透露":
                continue
            # The payload carries no investor URL, so every investor is
            # stored without key/url; one field may list several names
            # separated by ";".
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name, None)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
# Example #5
# 0
def parse(item):
    """Extract one funding event from a crawled funding page.

    The company is identified by the last path segment of the ``a.name``
    link and looked up with ``parser_db_util.get_company`` for ``SOURCE``.

    Args:
        item: Mapping with a ``"key"`` entry (logged only) and the page
            markup under ``"content"``, or ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or the company is not found (its
        integer key is then recorded in the module-level ``nokeys`` list);
        ``-1`` when the page has no ``a.name`` link; otherwise a dict with
        the keys ``sourceCompanyId``, ``fundingDate``, ``fundingRound``,
        ``roundStr``, ``currency``, ``investment``, ``precise`` and
        ``investors``.

    Raises:
        ValueError: If the date on the page cannot be converted even after
            the clamping below (raised by ``int`` or ``strptime``).
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)
    d = pq(item["content"])
    logger.info("*** funding ***")

    company_href = d("a.name").attr("href")
    if company_href is None:
        return -1

    company_key = company_href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(SOURCE, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        # Remember the missing company once.
        missing_key = int(company_key)
        if missing_key not in nokeys:
            nokeys.append(missing_key)
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    date_text = d('div.title> h1> span').text().strip()
    date_parts = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', date_text)
    fundingDate = None
    if date_parts is not None:
        (year, month, day) = date_parts

        # Years 2100-2109 are remapped to 2010-2019 (presumably a digit-swap
        # typo in the source data).
        y = int(year)
        if 2100 <= y <= 2109:
            year = 2010 + y % 10

        m = int(month)
        if m > 12:
            m = 12
            month = "12"

        # Clamp the day to the last valid day of the month.  isRunnian
        # decides between 29 and 28 for February (presumably a leap-year
        # test).
        if m in (4, 6, 9, 11):
            last_day = 30
        elif m == 2:
            last_day = 29 if itjuzi_helper.isRunnian(int(year)) else 28
        else:
            last_day = 31
        if int(day) > last_day:
            day = last_day

        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = d(
        'div.block-inc-fina> table> tbody> tr> td> span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d(
        'div.block-inc-fina> table> tbody> tr> td> span.fina').text().strip()
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s", currency, investment, precise)

    investors = []
    # Investors are the links inside the third matched cell of the
    # financing-history table.
    investor_cell = pq(d('div.pad.finan-history> table >tr> td').eq(2))
    for node in investor_cell('span> a'):
        link = pq(node)
        investor_name = link.text().strip()
        if investor_name == "":
            continue
        investor_url = link.attr("href")
        if investor_url is not None and investor_url != "":
            # Linked investor: the key is the last path segment of the URL.
            investor_key = investor_url.strip().split("/")[-1]
            investors.append({
                "name": investor_name,
                "key": investor_key,
                "url": investor_url,
                "type": 38001
            })
            logger.info("Investor: %s, %s, %s", investor_key, investor_name,
                        investor_url)
        else:
            # Unlinked text may list several investors separated by ";".
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name, investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }