Example #1
0
def get_logo_id(source, key, type, logo_url):
    """Return the image id for a source record, downloading it if needed.

    The row matching ``source``/``key`` is looked up in the table for the
    given record kind.  When that row already has a non-empty image column,
    its value is returned unchanged.  Otherwise the image at ``logo_url`` is
    fetched with ``my_request.get_image`` and stored with ``imgfs.put``, and
    the value returned by ``imgfs.put`` is returned.

    This function only reads from the database; it does not write the new
    id back to the row.

    Args:
        source: Value matched against the ``source`` column.
        key: Value matched against the ``sourceId`` column.
        type: Record kind: 'company', 'member' or 'investor'.  Any other
            value is used verbatim as the table name (original behaviour),
            so it must never come from untrusted input.
        logo_url: URL of the image to download when none is stored yet.

    Returns:
        The stored image id, or None when ``logo_url`` is missing/blank or
        the download returned nothing.
    """
    # Nothing to look up or download without a usable URL.  The None check
    # avoids the AttributeError that a bare .strip() call would raise.
    if logo_url is None or logo_url.strip() == "":
        return None

    # Record kind -> (table name, column that holds the image id).
    # NOTE: `type` shadows the builtin, but it is part of the public signature.
    table, column = {
        'company': ('source_company', 'logo'),
        'member': ('source_member', 'photo'),
        'investor': ('source_investor', 'logo'),
    }.get(type, (type, 'logo'))

    conn = db.connect_torndb()
    try:
        # The table name cannot be a bound parameter, hence the concatenation.
        row = conn.get(
            "select * from " + table + " where source=%s and sourceId=%s",
            source, key)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

    if row is not None and row[column] is not None and row[column] != "":
        return row[column]

    image_value = my_request.get_image(logger, logo_url)
    if image_value is None:
        return None
    return imgfs.put(image_value,
                     content_type='jpeg',
                     filename='%s_%s_%s.jpg' % (source, table, key))
Example #2
0
def save_base(r):
    """Insert or update the ``source_company`` row described by ``r``.

    The row is identified by the module-level ``SOURCE`` and
    ``r["sourceId"]``.  If the existing row already has a non-empty ``logo``
    value it is kept; otherwise the image at ``r["logo"]`` is downloaded
    (up to 5 attempts) and stored through ``imgfs.put``.

    Args:
        r: Mapping with the keys ``sourceId``, ``logo``, ``productName``,
            ``fullName``, ``description``, ``brief``, ``round``,
            ``roundDesc``, ``companyStatus``, ``fundingType``,
            ``locationId``, ``establishDate``, ``field``, ``subField`` and
            ``tags``.

    Returns:
        The id of the inserted or updated ``source_company`` row.
    """
    company_key = r["sourceId"]
    conn = db.connect_torndb()
    try:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)

        logo_id = None
        if (source_company is not None
                and source_company["logo"] is not None
                and source_company["logo"] != ""):
            # Reuse the image that is already stored for this company.
            logo_id = source_company["logo"]
        else:
            log_url = r["logo"]
            if log_url is not None and len(log_url.strip()) > 0:
                logger.info(log_url)
                image_value = None
                # Retry the download; stop at the first non-None result.
                for _ in range(5):
                    image_value = my_request.get_image(logger, log_url,
                                                       agent=True)
                    if image_value is not None:
                        break
                if image_value is not None:
                    logo_id = imgfs.put(image_value,
                                        content_type='jpeg',
                                        filename='company_%s_%s.jpg' %
                                        (SOURCE, company_key))
        logger.info("gridfs logo_id=%s", logo_id)

        if source_company is None:
            source_company_id = conn.insert(
                "insert source_company(name,fullName,description,brief,\
                    round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\
                    source,sourceId,createTime,modifyTime,\
                    field,subField,tags) \
                    values(%s,%s,%s,%s,\
                    %s,%s,%s,%s,%s,%s,%s,\
                    %s,%s,now(),now(),\
                    %s,%s,%s)",
                r["productName"], r["fullName"], r["description"], r["brief"],
                r["round"], r["roundDesc"], r["companyStatus"],
                r["fundingType"], r["locationId"], r["establishDate"],
                logo_id, SOURCE, company_key,
                r["field"], r["subField"], r["tags"])
        else:
            source_company_id = source_company["id"]
            conn.update(
                "update source_company set \
                    name=%s,fullName=%s,description=%s, brief=%s, \
                    round=%s,roundDesc=%s,companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \
                    field=%s,subField=%s,tags=%s,\
                    modifyTime=now() \
                    where id=%s",
                r["productName"], r["fullName"], r["description"], r["brief"],
                r["round"], r["roundDesc"], r["companyStatus"],
                r["fundingType"], r["locationId"], r["establishDate"],
                logo_id, r["field"], r["subField"], r["tags"],
                source_company_id)
    finally:
        # Release the connection even when a query or the download raises.
        conn.close()

    return source_company_id
Example #3
0
def get_cf_pic_id(sourceCfId, sourceId, pic_url):
    """Return the image id for a ``source_cf_document`` row.

    If the row matching ``sourceCfId``/``sourceId`` already has a non-empty
    ``link`` value, that value is returned.  Otherwise the image at
    ``pic_url`` is fetched with ``my_request.get_image`` and stored with
    ``imgfs.put``, and the value returned by ``imgfs.put`` is returned.

    This function only reads from the database; it does not write the new
    id back to the row.

    Args:
        sourceCfId: Value matched against the ``sourceCfId`` column.
        sourceId: Value matched against the ``sourceId`` column.
        pic_url: URL of the image to download when none is stored yet.

    Returns:
        The stored image id, or None when there is no usable URL or the
        download returned nothing.
    """
    conn = db.connect_torndb()
    try:
        source_cf_pic = conn.get(
            "select * from source_cf_document where sourceCfId=%s and sourceId=%s",
            sourceCfId, sourceId)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

    if (source_cf_pic is not None
            and source_cf_pic['link'] is not None
            and source_cf_pic['link'] != ""):
        return source_cf_pic['link']

    # Nothing to download without a usable URL.
    if pic_url is None or pic_url.strip() == "":
        return None

    image_value = my_request.get_image(logger, pic_url)
    # A failed download must not be handed to imgfs.put; the other image
    # helpers in this file guard the same way.
    if image_value is None:
        return None
    return imgfs.put(image_value,
                     content_type='jpeg',
                     filename='%s_%s.jpg' % (sourceCfId, sourceId))
def save(r):
    """Insert or update the ``source_investor`` row described by ``r``.

    The row is identified by the module-level ``SOURCE`` and the investor
    key.  If the existing row already has a non-empty ``logo`` value it is
    kept; otherwise the image at the given logo URL is downloaded (up to
    5 attempts) and stored through ``imgfs.put``.  The ``type`` column is
    always written as 10020.

    Args:
        r: 7-item sequence ``(investor_key, investor_name, logo, website,
            stageStr, fieldsStr, desc)`` where ``logo`` is the image URL.

    Returns:
        The id of the inserted or updated ``source_investor`` row.  The
        original implementation returned None implicitly; returning the id
        mirrors ``save_base`` and does not affect callers that ignore it.
    """
    investor_key, investor_name, logo, website, stageStr, fieldsStr, desc = r

    insert_sql = "insert source_investor(name,website,description,logo,stage,field,type, \
        source,sourceId,createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
    update_sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
        field=%s,type=%s,modifyTime=now() where id=%s"

    conn = db.connect_torndb()
    try:
        source_investor = conn.get(
            "select * from source_investor where source=%s and sourceId=%s",
            SOURCE, investor_key)

        logo_id = None
        if (source_investor is not None
                and source_investor["logo"] is not None
                and source_investor["logo"] != ""):
            # Reuse the image that is already stored for this investor.
            logo_id = source_investor["logo"]
        elif logo is not None and logo != "":
            image_value = None
            # Retry the download; stop at the first non-None result.
            for _ in range(5):
                image_value = my_request.get_image(logger, logo, agent=True)
                if image_value is not None:
                    break
            if image_value is not None:
                logo_id = imgfs.put(image_value,
                                    content_type='jpeg',
                                    filename='investor_%s_%s.jpg' %
                                    (SOURCE, investor_key))
                logger.info("gridfs logo_id=%s", logo_id)

        if source_investor is None:
            source_investor_id = conn.insert(insert_sql, investor_name,
                                             website, desc, logo_id, stageStr,
                                             fieldsStr, 10020, SOURCE,
                                             investor_key)
        else:
            source_investor_id = source_investor["id"]
            conn.update(update_sql, investor_name, website, desc, logo_id,
                        stageStr, fieldsStr, 10020, source_investor_id)
    finally:
        # Release the connection even when a query or the download raises.
        conn.close()

    return source_investor_id
Example #5
0
def parseCompany(source, company_key):
    logger.info("*****************************************")
    logger.info("parseComany, company_key=%s" % company_key)
    try:
        item = fromdb.company.find_one({"source":source, "company_key":company_key})
        if item is None:
            return

        html = item["content"]
        #doc = lxml.html.fromstring(html)
        d = pq(html)

        company_short_name = ""
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
        temps = product_name.split("/",1)
        if len(temps) == 2:
            product_name = temps[0].strip()
            company_short_name = temps[1].strip()
        if company_short_name == "":
            company_short_name = product_name
        logger.info("product name: " + product_name)
        logger.info("company short name: " + company_short_name)

        company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","")
        if company_name == "暂无" or company_name == "暂未收录":
            company_name = ""
        company_name = util.norm_company_name(company_name)
        logger.info("company name: " + company_name)

        website = d('div.link-line> a.weblink').attr("href").strip()
        if website=="http://%e6%9a%82%e6%97%a0":
            website = ""
        logger.info("website: " + website)

        if company_short_name == "" and company_name == "" and website == "":
            return

        establish_date = None
        str = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","")
        result = util.re_get_result('(\d*?).(\d*?)$',str)
        if result != None:
            (year, month) = result
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
        logger.info("establish date: %s", establish_date)

        locationId=0
        str = d('span.loca').text().strip()
        #logger.info(str)
        result = util.re_get_result(u'(.*?)·(.*?)$',str)
        if result != None:
            (province, city) = result
            province = province.strip()
            city = city.strip()
            logger.info("location: %s-%s" % (province, city))

            locationId = 0
            result = conn.get("select * from location where locationName=%s", city)
            if result != None:
                locationId = result["locationId"]
            else:
                result = conn.get("select * from location where locationName=%s", province)
                if result != None:
                    locationId = result["locationId"]

        logger.info("locationId: %d" % locationId)

        company_status = 2010
        str = d('div.des-more> div').eq(2).text().strip()
        if str == "已关闭":
            company_status = 2020
        logger.info("company_status: %d" % company_status)

        funding_type = 0
        str = d("span.tag.bg-c").text().strip()
        logger.info(str)
        if str == "融资需求 · 需要融资":
            funding_type = 8020
        elif str == "融资需求 · 寻求收购":
            funding_type = 8020
        logger.info("funding_type=%d" % funding_type)

        field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
        logger.info("field: " + field)

        sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
        logger.info("sub field: " + sub_field)

        tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",")
        logger.info(tags)

        desc = d("div.des").text().strip()
        logger.info("desc: " + desc)

        #logo
        logo_id = None
        source_company = conn.get("select * from source_company where source=%s and sourceId=%s", source, company_key)
        if source_company == None or source_company["logo"] == None or source_company["logo"] == "":
            log_url = d("div.pic >img").attr("src")
            if log_url is not None and len(log_url.strip()) > 0:
                logger.info(log_url)
                image_value = my_request.get_image(logger,log_url)
                if image_value != None:
                    logo_id = imgfs.put(image_value, content_type='jpeg', filename='company_%s_%s.jpg' % (source, company_key))
                    pass
        else:
            logo_id = source_company["logo"]
        logger.info("gridfs logo_id=%s" % logo_id)

        if source_company == None:
            source_company_id = conn.insert("insert source_company(name,fullName,description,brief,\
                        round,roundDesc,companyStatus,fundingType,locationId,establishDate,logo,\
                        source,sourceId,createTime,modifyTime,\
                        field,subField,tags) \
                        values(%s,%s,%s,%s,\
                        %s,%s,%s,%s,%s,%s,%s,\
                        %s,%s,now(),now(),\
                        %s,%s,%s)",
                        product_name, company_name, desc, '',
                        0,'',company_status,funding_type,locationId,establish_date,logo_id,
                        SOURCE,company_key,
                        field,sub_field,",".join(tags)
                        )
        else:
            source_company_id = source_company["id"]
            conn.update("update source_company set \
                        name=%s,fullName=%s,description=%s, \
                        companyStatus=%s,fundingType=%s,locationId=%s,establishDate=%s,logo=%s, \
                        field=%s,subField=%s,\
                        modifyTime=now() \
                        where id=%s",
                        product_name, company_name, desc,
                        company_status,funding_type,locationId,establish_date,logo_id,
                        field,sub_field,
                        source_company_id
                        )

        #artifact
        logger.info("*** artifact ***")
        lis = d('ul.list-prod> li> a')
        for li in lis:
            l = pq(li)
            type = l('h4> span').text().strip()
            if type == "网站":
                link = l.attr("href").strip()
                name = l('h4> b').text().strip()
                desc = l('p').text().strip()
                logger.info("name: %s, link: %s, desc: %s" % (name,link,desc))
                if link == "":
                    continue
                link = util.norm_url(link)
                source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
                        source_company_id, link)
                if source_artifact is None:
                    sql = "insert source_artifact(sourceCompanyId,`name`,`description`,`link`,`type`,createTime,modifyTime) \
                          values(%s,%s,%s,%s,4010,now(),now())"
                    conn.insert(sql, source_company_id,name,desc,link)

        if website != "":
            source_artifact = conn.get("select * from source_artifact where sourceCompanyId=%s and type=4010 and link=%s",
                            source_company_id, website)
            if source_artifact is None:
                sql = "insert source_artifact(sourceCompanyId,name,description,link,type,createTime,modifyTime) \
                      values(%s,%s,%s,%s,4010,now(),now())"
                logger.info("name: %s, link: %s, desc: %s" % (product_name,website,desc))
                conn.insert(sql,source_company_id,product_name,desc,website)

        #footprint
        logger.info("*** footprint ***")
        lis = d('ul.list-milestone> li')
        for li in lis:
            l = pq(li)
            footDesc = l('p').eq(0).text().strip()
            if footDesc is None or footDesc == "":
                continue
            footDateText = l('p> span').text().strip()
            if footDateText is None or footDateText == "":
                continue
            result = util.re_get_result('(\d*?)\.(\d*?)$',footDateText)
            if result == None:
                continue
            (year, month) = result
            year = int(year)
            try:
                month = int(month)
            except:
                month = 1

            if month<=0 or month>12:
                month = 1
            if year < 1970 or year > 3000:
                year = 1970
            footDate = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
            logger.info(footDate)
            logger.info(footDesc)

            fp = conn.get("select * from source_footprint where sourceCompanyId=%s and footDate=%s and description=%s",
                              source_company_id, footDate, footDesc)
            if fp == None:
                conn.insert("insert source_footprint(sourceCompanyId,footDate,description,createTime,modifyTime) \
                            values(%s,%s,%s,now(),now())",
                            source_company_id, footDate, footDesc)

        # funding
        logger.info("*** funding ***")
        lis = d('table.list-round-v2> tr')
        for li in lis:
            l = pq(li)
            dateStr = l('td> span.date').text().strip()
            result = util.re_get_result('(\d*?)\.(\d*?)\.(\d*?)$',dateStr)
            fundingDate = None
            if result != None:
                (year, month, day) = result
                fundingDate = datetime.datetime.strptime("%s-%s-%s" % (year,month,day), '%Y-%m-%d')
            logger.info(fundingDate)

            roundStr = l('td.mobile-none> span.round> a').text().strip().replace("轮","")
            logger.info(roundStr)
            fundingRound = 0
            if roundStr.startswith("种子"):
                fundingRound = 1010
                roundStr = "天使"
            elif roundStr.startswith("天使"):
                fundingRound = 1010
            elif roundStr.startswith("Pre-A"):
                fundingRound = 1020
            elif roundStr.startswith("A"):
                fundingRound = 1030
            elif roundStr.startswith("B"):
                fundingRound = 1040
            elif roundStr.startswith("Pre-B"):
                fundingRound = 1040
            elif roundStr.startswith("C"):
                fundingRound = 1050
            elif roundStr.startswith("D"):
                fundingRound = 1060
            elif roundStr.startswith("E"):
                fundingRound = 1070
            elif roundStr.startswith("F"):
                fundingRound = 1100
            elif roundStr.startswith("IPO"):
                fundingRound = 1110
            elif roundStr.startswith("收购"):
                fundingRound = 1120
            logger.info("fundingRound=%d" % fundingRound)

            moneyStr = l('td> span.finades> a').text().strip()
            (currency, investment, precise) = parseMoney(moneyStr)
            logger.info("%s - %s - %s" % (currency, investment, precise))

            source_funding = conn.get("select * from source_funding where sourceCompanyId=%s and roundDesc=%s",
                                          source_company_id, roundStr)
            if source_funding == None:
                source_funding_id = conn.insert("insert source_funding(sourceCompanyId,investment,round,roundDesc, currency, precise, fundingDate,createTime,modifyTime) \
                                                values(%s,%s,%s,%s,%s,%s,%s,now(),now())",
                                                source_company_id, investment, fundingRound, roundStr,
                                                currency, precise,fundingDate)
            else:
                source_funding_id = source_funding["id"]
                conn.update("update source_funding set investment=%s,currency=%s, precise=%s, fundingDate=%s, modifyTime=now() \
                            where id=%s",
                            investment, currency, precise, fundingDate, source_funding_id
                                )

            hs = l('td:eq(3)> a')
            for h in hs:
                h = pq(h)
                investor_name = h.text().strip()
                investor_url = h.attr("href").strip()
                (investor_key,) = util.re_get_result(r"http://www.itjuzi.com/investfirm/(\d*)$", investor_url)
                logger.info(investor_name)
                logger.info(investor_url)
                logger.info(investor_key)

                item = fromdb.investor.find_one({"source":source, "investor_key":investor_key})
                inv = parseInvestor(item)

                if inv is not None:
                    (name, logo, website, stage, field, desc) = inv
                    source_investor = conn.get("select * from source_investor where source=%s and sourceId=%s",
                                               source, investor_key)
                    logo_id = None
                    if source_investor == None or source_investor["logo"] == None or source_investor["logo"] == "":
                        if logo is not None and logo != "":
                            image_value = my_request.get_image(logger,logo)
                            logo_id = imgfs.put(image_value, content_type='jpeg', filename='investor_%s_%s.jpg' % (source, investor_key))
                            logger.info("gridfs logo_id=%s" % logo_id)
                    else:
                        logo_id = source_investor["logo"]

                    if source_investor is None:
                        sql = "insert source_investor(name,website,description,logo,stage,field,type, \
                        source,sourceId,createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
                        source_investor_id = conn.insert(sql,
                            name,website,desc,logo_id,stage,field,10020,source,investor_key)
                    else:
                        source_investor_id = source_investor["id"]
                        sql = "update source_investor set name=%s,website=%s,description=%s,logo=%s,stage=%s,\
                        field=%s,type=%s,modifyTime=now() where id=%s"
                        conn.update(sql,
                            name,website,desc,logo_id,stage,field,10020, source_investor_id)

                    source_funding_investor_rel = conn.get("select * from source_funding_investor_rel where \
                            sourceFundingId=%s and sourceInvestorId=%s",
                            source_funding_id, source_investor_id)
                    if source_funding_investor_rel is None:
                        conn.insert("insert source_funding_investor_rel(sourceFundingId, sourceInvestorId, \
                                    createTime,modifyTime) \
                                    values(%s,%s, now(),now())", source_funding_id, source_investor_id)

        # members
        logger.info("*** member ****")
        lis = d('ul.list-prodcase> li')
        for li in lis:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            str = l('h4> a').attr("href").strip()
            (member_key,) = util.re_get_result(r'person/(\d*?)$',str)
            logger.info("member_key: %s, member_name: %s, position: %s" % (member_key, member_name, position))

            item = fromdb.member.find_one({"source":source, "member_key":member_key})
            m = parseMember(item)

            if m is not None:
                (weibo, introduction, education, work, location, role, pictureUrl) = m

                source_member = conn.get("select * from source_member where source=%s and sourceId=%s",
                                                   source, member_key)
                logo_id = None
                if source_member == None or source_member["photo"] == None or source_member["photo"] == "":
                    if pictureUrl is not None and pictureUrl != "":
                        image_value = my_request.get_image(logger,pictureUrl)
                        logo_id = imgfs.put(image_value, content_type='jpeg', filename='member_%s_%s.jpg' % (source, member_key))
                        logger.info("gridfs logo_id=%s" % logo_id)
                else:
                    logo_id = source_member["photo"]

                if source_member is None:
                    sql = "insert source_member(name,photo,weibo,location,role,description,\
                    education,work,source,sourceId,createTime,modifyTime) \
                    values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now())"
                    source_member_id = conn.insert(sql,
                        member_name,logo_id,weibo,location,role,introduction,
                        education,work,source,member_key)
                else:
                    source_member_id = source_member["id"]
                    sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
                    education=%s,work=%s,modifyTime=now() where id=%s"
                    conn.update(sql,
                        member_name,logo_id,weibo,location,role,introduction,
                        education,work,source_member_id)

                source_company_member_rel = conn.get("select * from source_company_member_rel where \
                        sourceCompanyId=%s and sourceMemberId=%s",
                        source_company_id, source_member_id)
                if source_company_member_rel is None:
                    conn.insert("insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                                position,type,createTime,modifyTime) \
                                values(%s,%s,%s,%s, now(),now())",
                                source_company_id, source_member_id,position,0)

        #news
        logger.info("*** news ***")
        lis = d('ul.list-news> li')
        for li in lis:
            try:
                l = pq(li)
                news_url = l('p.title> a').attr("href").strip()
                (news_key,) = util.re_get_result(r"http://www.itjuzi.com/overview/news/(\d*)$", news_url)

                item = fromdb.news.find_one({"source":source, "company_key":company_key, "news_key":news_key})
                parseNews(item)
            except Exception,ex:
                logger.exception(ex)

        msg = {"type":"company", "id":source_company_id}
        kafkaProducer.send_messages("parser_v2", json.dumps(msg))