Beispiel #1
0
def insert_funding(sid, roundstr, inv, fundingDate, investor):
    """Normalise one funding record and save it together with its investor.

    Args:
        sid: source company id, stored as ``sourceCompanyId``.
        roundstr: raw round label; the truncated labels ``re-A轮`` and
            ``re-IPO`` are repaired before parsing.
        inv: raw amount text; whitespace is removed and vague amounts are
            mapped onto a single representative figure.
        fundingDate: stored unchanged as ``fundingDate``.
        investor: investor name; ``util.md5str(investor)`` becomes its
            ``sourceId``.

    Returns:
        None. Any failure is logged with the offending inputs and swallowed,
        so one bad record does not abort the caller.
    """
    # (canonical amount, spellings that collapse onto it)
    amount_aliases = (
        ("1000万人民币",
         ["超千万人民币", "千万人民币", "近千万人民币", "过千万人民币", "上千万人民币", "1千万人民币"]),
        ("1亿人民币",
         ["超亿人民币", "近亿人民币", "过亿人民币", "上亿人民币", "亿人民币", "一亿人民币",
          "亿人民币及以上人民币"]),
        ("1000万美元",
         ["超千万美元", "千万美元", "近千万美元", "过千万美元", "上千万美元", "1千万美元"]),
        ("100万美元",
         ["百万美元", "近百万美元", "过百万美元", "上百万美元", "1百万美元"]),
        ("100万人民币",
         ["百万人民币", "近百万人民币", "过百万人民币", "上百万人民币", "1百万人民币"]),
    )
    try:
        inv = "".join(inv.split())
        for canonical, aliases in amount_aliases:
            if inv in aliases:
                inv = canonical
                break

        # Some inputs arrive with the leading "P" missing.
        if roundstr == "re-A轮":
            roundstr = "Pre-A"
        elif roundstr == "re-IPO":
            roundstr = "Pre-IPO"

        fundingRound, roundStr = itjuzi_helper.getFundingRound(
            unicode(roundstr))
        currency, investment, precise = itjuzi_helper.getMoney(unicode(inv))

        source_funding = {
            "sourceCompanyId": sid,
            "preMoney": None,
            "postMoney": None,
            "investment": investment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": currency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }
        source_investors = [{
            "name": investor,
            "website": None,
            "description": None,
            "logo_url": None,
            "stage": None,
            "field": None,
            "type": 10020,
            "source": 13100,
            "sourceId": util.md5str(investor)
        }]

        # download_crawler is expected to be a module-level global.
        parser_db_util.save_funding_standard(source_funding, download_crawler,
                                             source_investors)
    except Exception:
        # Log the parameter fundingDate; the previous code referenced an
        # undefined name here, so the error path itself could fail.
        logger.exception("%s/%s/%s/%s", roundstr, inv, fundingDate, investor)
Beispiel #2
0
def parse_base(item):
    """Extract the base company profile from a crawled company page.

    Args:
        item: dict with ``key`` (used as ``sourceId``) and ``content`` (the
            page HTML handed to ``pq``), or None.

    Returns:
        None when ``item`` is None or when neither a short name nor a full
        company name could be found; otherwise a dict describing the company
        (names, description, status, location, establish date, logo, tags and
        a single website artifact).
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # The title reads "product" or "product / short company name".
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    name_parts = product_name.split("/", 1)
    if len(name_parts) == 2:
        product_name = name_parts[0].strip()
        company_short_name = name_parts[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    details = d('div.des-more> div')

    company_name = details.eq(0).text().strip().replace("公司全称:", "")
    # Placeholder texts mean "no full name recorded".
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return

    establish_date = None
    founded_text = details.eq(1).text().strip().replace("成立时间:", "")
    # NOTE(review): the "." between the groups is unescaped and therefore
    # matches any character - confirm whether "\." was intended.
    result = util.re_get_result('(\d*?).(\d*?)$', founded_text)
    if result is not None:
        (year, month) = result
        # An unparsable or out-of-range month falls back to January.
        try:
            if int(month) > 12:
                month = "1"
        except:
            month = "1"
        establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month),
                                                    '%Y-%m-%d')
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        conn = db.connect_torndb()
        try:
            # Prefer the city; fall back to the province when it is unknown.
            row = conn.get("select * from location where locationName=%s",
                           city)
            if row is None:
                row = conn.get(
                    "select * from location where locationName=%s", province)
            if row is not None:
                locationId = row["locationId"]
        finally:
            # Release the connection even if a query raises.
            conn.close()
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    status_text = details.eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    demand_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % demand_text)
    # NOTE(review): both demands map to 8020 - confirm that "寻求收购"
    # is not supposed to use a different code.
    if demand_text == "融资需求 · 需要融资":
        funding_type = 8020
    elif demand_text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    scope_links = d("span.scope.c-gray-aset> a")
    field = scope_links.eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = scope_links.eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # attr() yields None when the page has no link; treat that like the
    # "no website" placeholder instead of crashing on None.strip().
    href = d('div.link-line> a').attr("href")
    website = href.strip() if href is not None else ""
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = util.norm_url(website)
    logger.info("website: %s" % website)

    artifacts = [{
        "type": 4010,
        "name": product_name,
        "desc": desc,
        "link": website
    }]

    # Funding status is parsed for logging only; the returned "round" and
    # "roundDesc" stay at their empty defaults.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": 0,
        "roundDesc": "",
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "artifacts": artifacts
    }
Beispiel #3
0
def abnormal(x):
    """Classify how a third-party funding record compares with our own data.

    Args:
        x: a record supporting both attribute access (``x.companyId``) and
            key access (``x[u'time']``) - presumably a pandas row; verify
            against the caller.

    Returns:
        One of the Chinese label strings produced below, describing the
        mismatch category (or '都匹配' when the record matches).
    """
    import re

    if pd.isnull(x.companyId):
        return '匹配不到公司'
    if pd.isnull(x.country) and pd.notnull(x.fundingDate):
        collection = mongo.raw.qmp_rz_parser
        known = (collection.find_one({'product': x.xiniuName}) is not None
                 or collection.find_one({'company': x.xiniufullName})
                 is not None)
        return '其它' if known else '烯牛独家'

    # Try the display date in two precisions, then the sortable date field.
    raw_time = x[u'time'].strip()
    thirdTime = None
    for fmt in ('%Y.%m.%d', '%Y.%m'):
        try:
            thirdTime = datetime.datetime.strptime(raw_time, fmt)
            break
        except (ValueError, TypeError):
            continue
    if thirdTime is None:
        try:
            thirdTime = datetime.datetime.strptime(
                x[u'orderbyrztime'].strip(), '%Y%m%d')
        except Exception:
            return '融资时间不对'

    matches = re.findall(u'\d+[万亿]', x[u'money'])
    if len(matches) == 0:
        source_investment = None
    else:
        token = matches[0]
        # Take the unit from the matched token itself: testing the whole
        # string mis-scaled amounts such as "1亿5000万".
        # NOTE(review): decimals ("1.5亿") are still not parsed - confirm
        # whether the feed contains them.
        multiplier = 10000 if token[-1] == u'万' else 10000 * 10000
        source_investment = float(token[:-1]) * multiplier

    roundstr = x[u'jieduan']
    # Some labels arrive with the leading "P" missing.
    if roundstr == "re-A轮":
        roundstr = "Pre-A"
    elif roundstr == "re-IPO":
        roundstr = "Pre-IPO"
    fundingRound, roundStr = itjuzi_helper.getFundingRound(unicode(roundstr))
    if fundingRound == 1011:
        fundingRound = 1010

    source_funding = {
        'id': 1,
        'fundingDate': thirdTime,
        'investment': source_investment,
        'round': fundingRound
    }
    if pd.isnull(x['corporateId']):
        print(x)

    active_fundings = conn.query(
        'select * from funding where corporateId=%s and (active="Y" or active is null)',
        x['corporateId'])
    if compare_select(source_funding, active_fundings) is not False:
        return '都匹配'

    xiniufunding = conn.get(
        'select * from funding where corporateId=%s and (active="Y" or active is null) order by round desc limit 1',
        x['corporateId'])
    if xiniufunding is None:
        return '烯牛无融资'

    xiniuRound = xiniufunding['round']
    if fundingRound > 0 and fundingRound > xiniuRound:
        return '企名片轮次靠后'
    return '烯牛轮次靠后'
Beispiel #4
0
def parse(item):
    """Parse a single funding-event page into a funding dict.

    Args:
        item: dict whose ``content`` is the page HTML, or None.

    Returns:
        None when ``item`` is None or the referenced source company is not
        in the database yet; -1 when the page has no company link; otherwise
        a dict with the funding date, round, amount and investor list.
    """
    if item is None:
        return None

    d = pq(item["content"])
    logger.info("*** funding ***")

    company_href = d("a.name").attr("href")
    if company_href is None:
        return -1

    # The company key is the last path segment of the company link.
    company_key = company_href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    conn = db.connect_torndb()
    try:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
    finally:
        # Release the connection even if the query raises.
        conn.close()

    if source_company is None:
        logger.info("this source company doesn't exist yet")
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    dateStr = d('div.block> div.titlebar-center> p> span.date').text().strip()
    result = util.re_get_result('(\d*?)\.(\d*?)\.(\d*?)$', dateStr)
    fundingDate = None
    if result is not None:
        (year, month, day) = result
        y = int(year)
        # Years 2100-2109 are remapped onto 2010-2019.
        if 2100 <= y <= 2109:
            year = 2010 + y % 10
        m = int(month)
        if m > 12:
            m = 12
            month = "12"
        # Clamp the day to the last valid day of the month.
        if m in (4, 6, 9, 11):
            max_day = 30
        elif m == 2:
            max_day = 29 if itjuzi_helper.isRunnian(int(year)) else 28
        else:
            max_day = 31
        if int(day) > max_day:
            day = max_day

        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = d('span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d('span.fina').text().strip()
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s" % (currency, investment, precise))

    investors = []
    for f in d('h4.person-name> b >a.title'):
        l = pq(f)
        investor_name = l.text().strip()
        if investor_name == "":
            continue
        investor_url = l.attr("href")
        if investor_url is not None and investor_url != "":
            # Linked investor: the key is the last path segment of its URL.
            investor_key = investor_url.strip().split("/")[-1]
            investors.append({
                "name": investor_name,
                "key": investor_key,
                "url": investor_url,
                "type": 38001
            })
            logger.info("Investor: %s, %s, %s", investor_key, investor_name,
                        investor_url)
        else:
            # Unlinked text may hold several names separated by ";".
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name, investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
Beispiel #5
0
                    name_helper.company_name_normalize(unicode(fn)))
        fullName = name_helper.company_name_normalize(unicode(fullName))

        roundstr = names[4]
        inv = names[5]
        fdate = names[6]
        investors = []
        if names[7] is not None and names[7].strip() != "":
            investors.extend(names[7].split("/"))
        if names[8] is not None and names[8].strip() != "":
            investors.extend(names[8].split("/"))

        if len(investors) == 0:
            continue

        fundingRound, roundStr = itjuzi_helper.getFundingRound(
            unicode(roundstr))

        if fullName not in namesa: namesa.append(fullName)
        if fundingRound is not None and fundingRound > 0:
            if cs.has_key(fullName) is False:
                cs[fullName] = {fundingRound: [investors]}
            else:
                if cs[fullName].has_key(fundingRound) is False:
                    cs[fullName][fundingRound] = [investors]
                else:
                    cs[fullName][fundingRound].append(investors)

    # logger.info(json.dumps(cs, ensure_ascii=False, cls=util.CJsonEncoder))
    logger.info(len(cs))
    logger.info(len(namesa))
Beispiel #6
0
def parse(item):
    """Parse a funding record delivered as structured data.

    Args:
        item: dict with ``key`` (funding id) and ``content`` (a dict holding
            ``com_id``, ``date``, ``round``, ``money``, ``currency`` and
            optionally ``invsest_with``), or None.

    Returns:
        None when ``item`` is None or the source company is unknown (its
        numeric key is then remembered in the module-level ``nokeys`` list);
        otherwise a dict with the funding date, round, amount and investors.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)
    data = item["content"]
    logger.info("*** funding ***")

    company_key = data["com_id"]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(13030, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        if int(company_key) not in nokeys:
            nokeys.append(int(company_key))
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)
    fundingDate = datetime.datetime.strptime(data["date"], '%Y-%m-%d')
    logger.info(fundingDate)

    fundingRound, roundStr = itjuzi_helper.getFundingRound(data["round"])
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = data["money"] + data["currency"]
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s" % (currency, investment, precise))

    investors = []
    co_investors = data.get("invsest_with")
    if isinstance(co_investors, dict):
        for entry in co_investors.values():
            investor_name = entry["invst_name"]
            if investor_name == "" or investor_name == "未透露":
                continue
            # This feed carries no investor URL, so every investor is stored
            # without key/url; one entry may hold several names split by ";".
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name, None)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
Beispiel #7
0
def parseFinance_save(source_company_id, item, sourceId, download_crawler):
    """Parse the funding table of a company page and save every row.

    Args:
        source_company_id: id stored as ``sourceCompanyId`` on each funding.
        item: dict whose ``content`` is the UTF-8 encoded page, or None.
        sourceId: source-side company id, used in log messages only.
        download_crawler: passed through to
            ``parser_db_util.save_funding_standard``.

    Returns:
        None. The process is terminated via ``exit()`` when a row has an
        unrecognised currency prefix or a parsed amount of 0.
    """
    logger.info("parseFinance_save")
    if item is None:
        return None

    d = pq(html.fromstring(item['content'].decode("utf-8")))

    for finance in d('.funding-info tbody tr'):
        row = d(finance)
        roundStr = row('td:nth-child(1)').text()
        fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)

        # Turn the currency-symbol prefix into the suffix getMoney is given.
        fundingInvestment = row('.amount').text()
        if fundingInvestment.find('¥ ') >= 0:
            fundingInvestment = fundingInvestment.replace('¥ ', '') + '人民币'
        elif fundingInvestment.find('$ ') >= 0:
            fundingInvestment = fundingInvestment.replace('$ ', '') + '美元'
        else:
            # Unknown currency: stop so the new case gets handled by hand.
            logger.info('not RMB:%s %s', sourceId, fundingInvestment)  # todo
            exit()

        fundingCurrency, fundingInvestment, precise = itjuzi_helper.getMoney(
            fundingInvestment)

        fundingDate = datetime.datetime.strptime(
            row('.date').text(), '%Y-%m-%d')

        source_funding = {
            "sourceCompanyId": source_company_id,
            "preMoney": None,
            "postMoney": None,
            "investment": fundingInvestment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": fundingCurrency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }

        if fundingInvestment == 0:
            # Amount could not be parsed: stop so the case gets inspected.
            logger.info("new invest case: %s", sourceId)
            exit()
        logger.info("%s, %s, %s, %s", roundStr, fundingRound,
                    fundingInvestment, fundingCurrency)

        source_investors = []
        for investor in row('.investor a'):
            link = d(investor)
            entityName = link.text().strip()
            logger.info(entityName)
            # The investor id is whatever follows "startups/" in the link.
            entityId = str(link.attr('href').split('startups/')[-1])
            source_investors.append({
                "name": entityName,
                "website": None,
                "description": None,
                "logo_url": None,
                "stage": None,
                "field": None,
                "type": 10020,
                "source": SOURCE,
                "sourceId": entityId
            })

        logger.info(
            json.dumps(source_investors,
                       ensure_ascii=False,
                       cls=util.CJsonEncoder))
        try:
            parser_db_util.save_funding_standard(source_funding,
                                                 download_crawler,
                                                 source_investors)
        except Exception:
            # Saving stays best-effort, but a failure must be visible.
            logger.exception("save_funding_standard failed for %s", sourceId)
Beispiel #8
0
def parse(item):
    """Parse one funding-event page.

    Returns None when ``item`` is None or the source company is unknown (its
    numeric key is then recorded in ``nokeys``), -1 when the page has no
    company link, and otherwise a dict with the funding date, round, amount
    and investors.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)
    d = pq(item["content"])
    logger.info("*** funding ***")

    company_href = d("a.name").attr("href")
    if company_href is None:
        return -1

    # The company key is the last path segment of the company link.
    company_key = company_href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(SOURCE, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        numeric_key = int(company_key)
        if numeric_key not in nokeys:
            nokeys.append(int(company_key))
        return None

    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    date_text = d('div.title> h1> span').text().strip()
    date_parts = util.re_get_result('(\d*?)\.(\d*?)\.(\d*?)$', date_text)
    fundingDate = None
    if date_parts is not None:
        year, month, day = date_parts
        year_num = int(year)
        # Years 2100-2109 are remapped onto 2010-2019.
        if 2100 <= year_num <= 2109:
            year = 2010 + year_num % 10
        month_num = int(month)
        if month_num > 12:
            month_num = 12
            month = "12"
        # Clamp the day to the last valid day of the month.
        if month_num in (4, 6, 9, 11) and int(day) > 30:
            day = "30"
        elif itjuzi_helper.isRunnian(int(year)) and month_num == 2 and int(day) > 29:
            day = 29
        elif itjuzi_helper.isRunnian(int(year)) == False and month_num == 2 and int(day) > 28:
            day = 28
        elif int(day) > 31:
            day = 31

        stamp = "%s-%s-%s" % (year, month, day)
        fundingDate = datetime.datetime.strptime(stamp, '%Y-%m-%d')
    logger.info(fundingDate)

    round_text = d('div.block-inc-fina> table> tbody> tr> td> span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(round_text)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    money_text = d('div.block-inc-fina> table> tbody> tr> td> span.fina').text().strip()
    currency, investment, precise = itjuzi_helper.getMoney(money_text)
    logger.info("%s - %s - %s" % (currency, investment, precise))

    investors = []
    # Investor links are read from the third cell of the history table.
    investor_cell = pq(d('div.pad.finan-history> table >tr> td').eq(2))
    for anchor in investor_cell('span> a'):
        node = pq(anchor)
        investor_name = node.text().strip()
        if investor_name == "":
            continue
        investor_url = node.attr("href")

        if investor_url is None or investor_url == "":
            # Unlinked text may hold several names separated by ";".
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name, investor_url)
            continue

        # Linked investor: the key is the last path segment of its URL.
        investor_key = investor_url.strip().split("/")[-1]
        investors.append({
            "name": investor_name,
            "key": investor_key,
            "url": investor_url,
            "type": 38001
        })
        logger.info("Investor: %s, %s, %s", investor_key, investor_name, investor_url)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
Beispiel #9
0
def parse_base(item):
    """Parse the base company profile out of a crawled company page.

    Args:
        item: mapping providing ``"key"`` (returned as ``sourceId``) and
            ``"content"`` (the page HTML handed to pyquery), or ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or when neither a short name nor a
        full company name could be extracted. Otherwise a dict with the keys
        ``shortName``, ``fullName``, ``productName``, ``description``,
        ``brief``, ``round``, ``roundDesc``, ``companyStatus``,
        ``fundingType``, ``locationId``, ``establishDate``, ``logo``,
        ``sourceId``, ``field``, ``subField``, ``tags``, ``type`` and
        ``artifacts`` (a list of dicts with ``type``/``name``/``desc``/
        ``link``/``domain``).
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # --- names -----------------------------------------------------------
    # The title reads either "product" or "product/short name"; child nodes
    # are stripped from a clone so only the heading's own text remains.
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name is None or product_name.strip() == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    if company_name is None or company_name.strip() == "":
        # Fall back to the <h2> variant of the page layout (best effort).
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    # --- establish date --------------------------------------------------
    establish_date = None
    establish_text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', establish_text)
    if result is not None:
        (year, month) = result
        try:
            # Any month outside 1..12 falls back to January.
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # The pattern allows empty digit groups (e.g. a missing year);
            # an unparseable date must not abort the whole page parse.
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    # --- location --------------------------------------------------------
    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        # Prefer the city, fall back to the province.
        result = parser_db_util.get_location(city)
        if result is not None:
            locationId = result["locationId"]
        else:
            result = parser_db_util.get_location(province)
            if result is not None:
                locationId = result["locationId"]

    if locationId == 0:
        # Last resort: derive a location from the company name itself.
        loc1, loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            result = parser_db_util.get_location(loc1)
            if result is not None:
                locationId = result["locationId"]
    logger.info("locationId: %d" % locationId)

    # --- status / funding need -------------------------------------------
    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % funding_text)
    # NOTE(review): both labels map to 8020 in the original code; confirm
    # whether the second label was meant to use a distinct code.
    if funding_text == "融资需求 · 需要融资":
        funding_type = 8020
    elif funding_text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    # --- brief / classification / description ----------------------------
    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)

    # Drop the brief only when it carries the placeholder marker. The old
    # truthiness test on find() fired on -1 (marker absent) and so cleared
    # nearly every real brief.
    if "暂未收录" in brief:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系", "").replace('*****@*****.**', "").replace("itjuzi是一家数据服务公司", "").strip()
    logger.info("********desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # --- artifacts (websites / apps linked from the page) ----------------
    placeholder_urls = ("http://%e6%9a%82%e6%97%a0", "http://tt")
    link_selectors = (
        'div.link-line> a',
        'div.link-line> span.weblink,span.webTink> a',
    )
    artifacts = []
    seen_links = set()  # each normalized link yields at most one artifact
    for selector in link_selectors:
        for wa in d(selector):
            node = pq(wa)
            webs = []
            # A link may live in the href attribute or in the node's text;
            # both are tried, and a failure in one is simply skipped.
            for raw in (node.attr("href"), node.text()):
                if raw is None:
                    continue
                try:
                    website = raw.strip()
                    if website in placeholder_urls:
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    # Best effort: a malformed candidate is not fatal.
                    continue

            for website in webs:
                if website in seen_links:
                    continue
                seen_links.add(website)

                artifact_type, app_market, app_id = url_helper.get_market(website)
                if artifact_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

                elif artifact_type == 4020:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })

                elif artifact_type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })

                elif artifact_type == 4040:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": app_id
                        })

                elif artifact_type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        # For these two markets the domain is the apk name
                        # looked up through parser_db_util.
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

    # --- funding status (latest round shown on the page) -----------------
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }