コード例 #1
0
def patch_company_location(company_id):
    """Back-fill ``corporate.locationId`` for the corporate behind a company.

    The company row for ``company_id`` is loaded and, if it references a
    corporate whose ``locationId`` is ``None`` or ``0``, the corporate's
    ``fullName`` followed by its aliases (``verify='Y'`` and ``active`` null
    or ``'Y'``) are tried in order.  The first name for which
    ``name_helper.get_location_from_company_name`` yields a location that
    exists in the ``location`` table decides the new ``locationId``.

    Nothing is written when the company row is missing, has no corporate,
    the corporate already has a location, or no name resolves to a known
    location.

    Args:
        company_id: value of ``company.id`` whose corporate should be patched.

    Returns:
        None.
    """
    conn = db.connect_torndb_proxy()
    try:
        company1 = conn.get("select * from company where id=%s", company_id)
        # A missing row would otherwise fail on the subscript below
        # (assumes conn.get returns None when nothing matches, as the
        # corporate lookup already expects).
        if company1 is None or company1["corporateId"] is None:
            return
        corporate_id = company1["corporateId"]

        corporate = conn.get("select * from corporate where id=%s",
                             corporate_id)
        if corporate is None:
            return
        # Only corporates without a usable location are patched.
        if corporate["locationId"] is not None and corporate["locationId"] != 0:
            return

        # The full name is tried first, then the verified aliases.
        candidates = []
        if corporate["fullName"] is not None:
            candidates.append({"name": corporate["fullName"]})
        candidates.extend(conn.query(
            "select * from corporate_alias where corporateId=%s and "
            "(active is null or active ='Y') and verify='Y'",
            corporate_id))

        locationId = None
        for alias in candidates:
            logger.info(alias["name"])
            loc1, loc2 = name_helper.get_location_from_company_name(
                alias["name"])
            logger.info("%s/%s", loc1, loc2)
            if loc1 is None:
                continue
            location = conn.get(
                "select *from location where locationName=%s", loc1)
            if location:
                locationId = location["locationId"]
                break

        if locationId is not None:
            conn.update("update corporate set locationId=%s where id=%s",
                        locationId, corporate_id)
    finally:
        # Release the connection even when a query above raises.
        conn.close()
コード例 #2
0
def patch_company_location(company_id):
    """Back-fill ``company.locationId`` when it is ``None`` or ``0``.

    Sources are tried in this order:

    1. the first ``source_company`` row of the company (``active`` null or
       ``'Y'``) that has a positive ``locationId``;
    2. the first ``company_alias`` row with ``type=12010`` whose name yields,
       via ``name_helper.get_location_from_company_name``, a location that
       exists in the ``location`` table.

    Nothing is written when the company row is missing, already has a
    location, or neither source produces one.

    Args:
        company_id: value of ``company.id`` to patch.

    Returns:
        None.
    """
    conn = db.connect_torndb()
    try:
        company1 = conn.get("select * from company where id=%s", company_id)
        # A missing row would otherwise fail on the subscript below
        # (assumes conn.get returns None when nothing matches — TODO confirm
        # against the db wrapper).
        if company1 is None:
            return
        if company1["locationId"] is not None and company1["locationId"] != 0:
            return

        locationId = None

        # 1) Location already known for one of the source companies.
        scs = conn.query(
            "select * from source_company where companyId=%s and (active is null or active='Y')",
            company_id)
        for sc in scs:
            if sc["locationId"] is not None and sc["locationId"] > 0:
                locationId = sc["locationId"]
                break

        # 2) Otherwise derive it from the alias names.
        if locationId is None:
            aliases = conn.query(
                "select * from company_alias where companyId=%s and type=12010",
                company_id)
            for alias in aliases:
                loc1, loc2 = name_helper.get_location_from_company_name(
                    alias["name"])
                if loc1 is None:
                    continue
                location = conn.get(
                    "select *from location where locationName=%s", loc1)
                if location:
                    locationId = location["locationId"]
                    break

        if locationId is not None:
            conn.update("update company set locationId=%s where id=%s",
                        locationId, company1["id"])
    finally:
        # Release the connection even when a query above raises.
        conn.close()
コード例 #3
0
def parse_company(item):
    """Parse one crawled company page into a flat company dict.

    Args:
        item: mapping with ``"content"`` (UTF-8 encoded HTML, decoded here)
            and ``"key"`` (stored as ``sourceId``).

    Returns:
        A dict with the company fields (``name``, ``fullName``,
        ``description``, ``locationId``, ``establishDate``, ``logo``,
        ``tags``, ``website`` ...).  When the page's company name does not
        pass ``name_helper.name_check`` the normalized name is returned under
        ``fakeName`` and ``fullName`` is ``None``; if in that case no website
        link was found either, ``0`` is returned instead of a dict.

    Raises:
        ValueError: if an establishment date is present but is not in
            ``%Y-%m-%d`` format.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # Tags: whitespace-separated tokens, de-duplicated, first-seen order.
    tags = []
    for tag in d('.word_list').text().split():
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    logo = d('.peoimg img').attr('src')
    if logo:
        logo = logo.replace("https://", "http://")

    # The last timeline entry carries the establishment date when its label
    # mentions '成立'.  A containment test is used so that a label *starting*
    # with the keyword is recognised too (``find(...) > 0`` skipped index 0).
    establish_date = None
    time_content = d('.time_content li:last-child')
    if '成立' in (d(time_content)('.upword').text() or ''):
        establish_date = d(time_content)('.time_up').text()
        establish_date = datetime.datetime.strptime(establish_date, '%Y-%m-%d')

    companyName = d('.company_div h5').text()

    # Location is derived from the company name; 0 means "unknown".
    location_id = 0
    city = name_helper.get_location_from_company_name(companyName)[0]
    if city is not None:
        location = parser_db_util.get_location(city)
        if location is not None:
            location_id = location["locationId"]

    fullName = name_helper.company_name_normalize(companyName.replace("_", ""))

    desc = d('#intro_srocll p').text()

    # Product paragraphs: one of them may hold the official-site link
    # ('官网'); the others are collected as the product description.
    website = ''
    product_lines = []
    for p in d('.procont_lis p'):
        paragraph = d(p)
        text = paragraph.text()
        href = paragraph('a').attr('href')
        # Containment test: a paragraph that begins with '官网' counts too.
        if '官网' in text and href is not None:
            website = href
            continue
        product_lines.append(text + '\n')
    productDesc = ''.join(product_lines)

    if desc == '' or desc is None:
        desc = productDesc

    # Keep only the part of the heading before the first separator.
    # NOTE(review): the ':' split appears twice; one of them was probably
    # meant to be a different (full-width) colon — confirm against the site.
    shortName = d('.peo_center h4').text().split(':')[0].split(':')[0].split(
        '——')[0].split(',')[0].split('|')[0]

    companyResult = {}
    if name_helper.name_check(companyName)[1] == True:  # noqa: E712
        # Unless name_check flags the short name with False in slot 0, keep
        # it only when it shares more than two characters with the company
        # name; otherwise fall back to the company name itself.
        if not (name_helper.name_check(shortName)[0] == False):  # noqa: E712
            shared = sum(1 for ch in shortName if ch in companyName)
            if shared <= 2:
                shortName = companyName
    else:
        # Name not accepted by name_check: only keep the record when there
        # is at least a website to identify it by.
        if not website:
            return 0
        companyResult['fakeName'] = fullName
        fullName = None

    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })

    return companyResult
コード例 #4
0
ファイル: evervc_company_parser.py プロジェクト: yujiye/Codes
def parse_company(item):
    """Parse one crawled evervc portfolio page into a flat company dict.

    Args:
        item: mapping with ``"content"`` (UTF-8 encoded HTML, decoded here)
            and ``"key"`` (stored as ``sourceId``).

    Returns:
        A dict with the company fields (``name``, ``fullName``,
        ``description``, ``productDesc``, ``locationId``, ``logo``, ``tags``,
        ``website`` ...).  ``locationId`` is ``0`` when no location could be
        resolved; ``logo`` is ``None`` when the page has no logo image.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # Tags: whitespace-separated tokens, de-duplicated, first-seen order.
    tags = []
    for tag in d('.portfolio-user-tag .label').text().split():
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    # The image src may be absent (attr() then returns None and must not be
    # concatenated).  Protocol-relative URLs get an explicit scheme; URLs
    # that already carry one are left untouched.
    logo = None
    logo_src = d('.portfolio-user-photo img').attr('src')
    if logo_src:
        logo = 'http:' + logo_src if logo_src.startswith('//') else logo_src
        logo = logo.replace("https://", "http://")
        logo = logo.replace("@!msgs", "")

    companyName = d('.corp-name').text()

    # Location: first word of the tag line, falling back to the city
    # derived from the company name.  0 means "unknown".
    location_id = 0
    city = d('.portfolio-user-tag').text().split(' ')[0]
    location = parser_db_util.get_location(city)
    if location is None:
        city = name_helper.get_location_from_company_name(companyName)[0]
        if city is not None:
            location = parser_db_util.get_location(city)
    if location is not None:
        location_id = location["locationId"]

    fullName = name_helper.company_name_normalize(companyName.replace("_", ""))

    desc = d('.portfolio-user-bio .text').text()
    productDesc = d('.portfolio-text').text()
    # Fall back to the product text when the bio is empty.
    if desc == '' or desc is None:
        desc = productDesc

    website = d('.user-contact a').text()
    shortName = d('.portfolio-user-info h1').text()

    return {
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    }
コード例 #5
0
ファイル: itjuzi_company_parser.py プロジェクト: yujiye/Codes
def parse_base(item):
    """Parse the base information of one crawled itjuzi company page.

    Args:
        item: mapping with ``"content"`` (page HTML handed to ``pq``) and
            ``"key"`` (stored as ``sourceId``), or ``None``.

    Returns:
        ``None`` when ``item`` is ``None`` or when neither a short name nor
        a full company name could be extracted.  Otherwise a dict with the
        keys ``shortName``, ``fullName``, ``productName``, ``description``,
        ``brief``, ``round``, ``roundDesc``, ``companyStatus``,
        ``fundingType``, ``locationId``, ``establishDate``, ``logo``,
        ``sourceId``, ``field``, ``subField``, ``tags``, ``type`` and
        ``artifacts`` (a list of dicts built from the page's links).
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # --- product / short name: heading text is "<product>/<short name>" ---
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if not product_name:
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    company_short_name = ""
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s", product_name)
    logger.info("company short name: %s", company_short_name)

    # --- full company name; the site's placeholder values count as empty ---
    placeholders = ("暂无", "暂未收录")
    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name in placeholders:
        company_name = ""
    if company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            # Best effort: keep the empty name when the fallback node
            # cannot be read.
            pass
    if company_name in placeholders:
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s", company_name)

    if company_short_name == "" and company_name == "":
        return

    # --- establishment date, given on the page as "<year>.<month>" ---
    establish_date = None
    established_text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', established_text)
    if result is not None:
        (year, month) = result
        # Any unusable month (empty, 0, > 12) falls back to January.
        try:
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime(
                "%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # e.g. an empty year: leave the date unknown rather than
            # aborting the whole parse.
            logger.warning("unparsable establish date: %s", established_text)
    logger.info("establish date: %s", establish_date)

    # --- location: "<province>·<city>" on the page, city preferred ---
    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s", province, city)
        for place in (city, province):
            location = parser_db_util.get_location(place)
            if location is not None:
                locationId = location["locationId"]
                break

    # Fall back to a location derived from the company name.
    if locationId == 0:
        loc1, loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            location = parser_db_util.get_location(loc1)
            if location is not None:
                locationId = location["locationId"]
    logger.info("locationId: %d", locationId)

    # --- status: 2010 by default, 2020 when the page says "已关闭" ---
    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d", company_status)

    # --- funding need ---
    # NOTE(review): both recognised labels map to 8020 in the original;
    # confirm whether "寻求收购" was meant to have its own code.
    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s", funding_text)
    if funding_text in ("融资需求 · 需要融资", "融资需求 · 寻求收购"):
        funding_type = 8020
    logger.info("funding_type=%d", funding_type)

    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s", brief)
    # Drop the site's "not recorded yet" placeholder.  (The previous
    # ``if brief.find(...)`` test was truthy for -1, i.e. it cleared every
    # brief that did NOT contain the placeholder.)
    if "暂未收录" in brief:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s", field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s", sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s", tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系", "").replace('*****@*****.**', "").replace("itjuzi是一家数据服务公司", "").strip()
    logger.info("********desc: %s", desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # --- artifacts built from the page's links ---
    # NOTE(review): the second selector is scanned twice because the
    # original looped over ty in [1, 2, 3] with ty 2 and 3 being identical,
    # so its links yield duplicate artifacts.  Kept as-is; confirm whether
    # callers de-duplicate before collapsing this to two passes.
    link_selectors = (
        'div.link-line> a',
        'div.link-line> span.weblink,span.webTink> a',
        'div.link-line> span.weblink,span.webTink> a',
    )
    artifacts = []
    for selector in link_selectors:
        for wa in d(selector):
            for website in _parse_base_link_candidates(wa):
                artifact = _parse_base_artifact(website, product_name, desc)
                if artifact is not None:
                    artifacts.append(artifact)

    # --- funding round (获投状态) ---
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }


def _parse_base_link_candidates(wa):
    """Return the normalized URL candidates of one link element.

    The element's ``href`` is tried first, then its text.  A reader that
    fails (for example a node without ``href``) is skipped silently, which
    mirrors the best-effort behaviour of the page parser.
    """
    readers = (
        lambda: pq(wa).attr("href"),
        lambda: pq(wa).text(),
    )
    webs = []
    for read in readers:
        try:
            website = read().strip()
            # Placeholder links used by the site for "no website".
            if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                website = ""
            website = url_helper.url_normalize(website)
            logger.info("website: %s", website)
            webs.append(website)
        except Exception:
            # Expected for nodes lacking href/text; nothing to collect.
            continue
    return webs


def _parse_base_artifact(website, product_name, desc):
    """Build the artifact dict for one link, or None when it yields none.

    The artifact type is whatever ``url_helper.get_market`` reports for the
    link; each supported type (4010-4050) has its own rule for ``desc`` and
    ``domain`` and for when the link is dropped.
    """
    link_type, app_market, app_id = url_helper.get_market(website)

    if link_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is None:
            return None
        if flag is False:
            domain = None
        return {
            "type": 4010,
            "name": product_name,
            "desc": desc,
            "link": website,
            "domain": domain
        }

    if link_type == 4020:
        if app_id is None:
            return None
        return {
            "type": 4020,
            "name": product_name,
            "desc": None,
            "link": website,
            "domain": website
        }

    if link_type == 4030:
        if app_id is None:
            return None
        return {
            "type": 4030,
            "name": product_name,
            "desc": None,
            "link": website,
            "domain": None
        }

    if link_type == 4040:
        if app_id is None:
            return None
        return {
            "type": 4040,
            "name": product_name,
            "desc": desc,
            "link": website,
            "domain": app_id
        }

    if link_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # For these two markets the domain comes from the stored app.
            android_app = parser_db_util.find_android_market(app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is None:
            return None
        return {
            "type": 4050,
            "name": product_name,
            "desc": desc,
            "link": website,
            "domain": domain
        }

    return None