Ejemplo n.º 1
0
def save_mongo_source_company_name(source, sourceId, scndata):
    """Add a company-name entry to an existing source-company document.

    The entry is built by ``populate_column(scndata,
    source_company_name_columns)`` and tagged with ``chinese`` = 'Y'/'N'
    according to ``hz.is_chinese_string`` applied to its ``name``.  It is
    appended (``$addToSet``) to the ``source_company_name`` array of the
    document matching ``source``/``sourceId``, unless that document already
    holds an entry with the same name.  Nothing is written when no such
    document exists.

    :param source: value matched against the document's ``source`` field.
    :param sourceId: value matched against the document's ``sourceId`` field.
    :param scndata: raw name data handed to ``populate_column``.
    """
    mongo = db.connect_mongo()
    try:
        collection_company = mongo.source.company

        item = populate_column(scndata, source_company_name_columns)
        item["chinese"] = 'Y' if hz.is_chinese_string(item["name"]) else 'N'

        record = collection_company.find_one({
            "source": source,
            "sourceId": sourceId
        })
        if record is None:
            return

        # Look the name up explicitly: $addToSet alone only de-duplicates
        # entries that are identical in every field, not just in "name".
        same_name = collection_company.find_one({
            "source": source,
            "sourceId": sourceId,
            "source_company_name.name": item["name"]
        })
        if same_name is None:
            collection_company.update_one(
                {"_id": record["_id"]},
                {'$addToSet': {
                    "source_company_name": item
                }})
    finally:
        # Release the client even when a lookup or the update raises.
        mongo.close()
Ejemplo n.º 2
0
def save_source_company_name(source_company_id, name, type, nflag):
    """Store a company name in MySQL (new companies only) and in MongoDB.

    Blank or missing names are ignored.  When ``nflag`` equals ``"new"`` the
    name is inserted into ``source_company_name`` unless a row with the same
    ``sourceCompanyId`` and ``name`` already exists.  Regardless of
    ``nflag`` the name is then passed on to
    ``parser_mongo_util.save_mongo_source_company_name``.

    :param source_company_id: id of the owning source company.
    :param name: the company name to store.
    :param type: name type code, stored as-is.
    :param nflag: ``"new"`` enables the MySQL insert; other values skip it.
    """
    if name is None or name.strip() == "":
        return
    chinese = 'Y' if hz.is_chinese_string(name) else 'N'
    # new company
    if nflag == "new":
        conn = db.connect_torndb()
        try:
            n = conn.get(
                "select * from source_company_name where sourceCompanyId=%s and name=%s",
                source_company_id, name)
            if n is None:
                conn.insert(
                    "insert source_company_name(sourceCompanyId,name,type,chinese,createTime,modifyTime) values( \
                        %s,%s,%s,%s,now(),now())", source_company_id, name,
                    type, chinese)
        finally:
            # Close the connection even when the lookup or insert raises.
            conn.close()
    # Save all into mongo
    scname = {"name": name, "type": type, "chinese": chinese}
    parser_mongo_util.save_mongo_source_company_name(source_company_id, scname)
Ejemplo n.º 3
0
def get_short_name(unicode_name):
    """Return the leading part of a name, cut at the first separator found.

    The stripped name is cut, in turn, at a hyphen, an em dash, a hyphen
    again, a space (only when ``hz.is_chinese_string`` accepts the current
    value), a vertical bar and a middle dot.  A separator sitting at
    position 0 never triggers a cut.

    :param unicode_name: the full name as a unicode string.
    :return: the shortened, stripped name.
    """
    def head(text, separator):
        # Keep the text before the separator; index 0 is excluded so the
        # result can never become empty through a leading separator.
        if text.find(separator) > 0:
            return text.split(separator)[0].strip()
        return text

    short = unicode_name.strip()
    for separator in (u"-", u"—", u"-"):
        short = head(short, separator)
    if hz.is_chinese_string(short):
        short = head(short, u" ")
    for separator in (u"|", u"·"):
        short = head(short, separator)
    return short
Ejemplo n.º 4
0
def save_source_company_name(source_company_id, name, type):
    """Insert a company name into ``source_company_name`` if not yet present.

    Blank or missing names are ignored.  A row is inserted only when no row
    with the same ``sourceCompanyId`` and ``name`` exists; its ``chinese``
    column is 'Y' or 'N' according to ``hz.is_chinese_string(name)``.

    :param source_company_id: id of the owning source company.
    :param name: the company name to store.
    :param type: name type code, stored as-is.
    """
    if name is None or name.strip() == "":
        return

    conn = db.connect_torndb()
    try:
        n = conn.get(
            "select * from source_company_name where sourceCompanyId=%s and name=%s",
            source_company_id, name)
        if n is None:
            chinese = 'Y' if hz.is_chinese_string(name) else 'N'
            conn.insert(
                "insert source_company_name(sourceCompanyId,name,type,chinese,createTime,modifyTime) values( \
                    %s,%s,%s,%s,now(),now())", source_company_id, name, type,
                chinese)
    finally:
        # Close the connection even when the lookup or insert raises.
        conn.close()
Ejemplo n.º 5
0
def english_name_check(unicode_name):
    """Classify a name as non-Chinese and, if so, as company-like or not.

    :param unicode_name: the name to inspect.
    :return: ``(english, company)``.  ``(False, None)`` when
        ``hz.is_chinese_string`` accepts the name; otherwise ``english`` is
        True and ``company`` tells whether the stripped, lower-cased name
        ends with one of the known corporate suffixes.
    """
    if hz.is_chinese_string(unicode_name):
        return False, None

    corporate_suffixes = (
        "ltd", "ltd.",
        "inc", "inc.",
        "llc", "llc.",
        "limited",
        "corporation",
        "company",
    )
    normalized = unicode_name.strip().lower()
    return True, normalized.endswith(corporate_suffixes)
Ejemplo n.º 6
0
def name_check(unicode_name):
    """Classify a name as Chinese and, if so, as company-like or not.

    :param unicode_name: the name to inspect.
    :return: ``(chinese, company)``.  ``chinese`` is the result of
        ``hz.is_chinese_string``; ``company`` is None for non-Chinese names,
        otherwise True when the name contains one of the organisation
        keywords after its first character or ends with one of the
        organisation suffix characters, and False if neither applies.
    """
    chinese = hz.is_chinese_string(unicode_name)
    if not chinese:
        return chinese, None

    # Keywords must appear past index 0 to count.
    inner_keywords = (u"公司", u"企业", u"中心", u"事务所", u"研究院", u"会社", u"合伙")
    # Single characters that mark an organisation when they end the name.
    closing_chars = (u"所", u"部", u"会", u"院", u"社", u"店", u"馆", u"室", u"厂", u"楼")

    has_keyword = any(unicode_name.find(word) > 0 for word in inner_keywords)
    company = has_keyword or unicode_name.endswith(closing_chars)
    return chinese, company
Ejemplo n.º 7
0
def check_desc(content, length=5):
    """Tell whether a description looks like real text rather than filler.

    :param content: the description text, or None.
    :param length: descriptions of at most this many characters are rejected.
    :return: False for missing, blank, too short or overly repetitive
        content; True otherwise.
    """
    if content is None or content.strip() == "":
        return False

    total = len(content)
    if total <= length:
        return False

    distinct = len(set(content))
    ratio = float(distinct) / float(total)

    if hz.is_chinese_string(content):
        # Reject only when few distinct characters are used AND, after
        # discounting the Chinese character count, at most 20 remain.
        repetitive = ratio <= 0.1 and distinct - count_chinese(content) <= 20
        return not repetitive

    if total <= 10:
        return False
    return not (distinct <= 20 and ratio <= 0.28)
Ejemplo n.º 8
0
def find_company_by_short_name(source_company, test=False):
    """Find an existing company sharing a short name with a source company.

    The names tried are the source company's ``source_company_name`` rows of
    type 12020 plus ``source_company["name"]``.  For each name, candidates
    are the companies whose name or alias equals it (rows with
    ``active = 'N'`` are excluded).  A candidate is accepted as soon as one
    of these checks passes, in this order:

    1. same positive ``locationId``
    2. same establishment year and month
    3. a member with a Chinese name that is also a source-company member
    4. an investor shared through any funding round

    :param source_company: row-like mapping; ``id``, ``name``,
        ``locationId`` and ``establishDate`` are read.
    :param test: forwarded to ``helper.get_table_names`` to select the
        table names used in the queries.
    :return: id of the first accepted candidate, or None if there is none.
    """
    table_names = helper.get_table_names(test)

    logger.info("find_company_by_short_name")

    conn = db.connect_torndb()
    try:
        source_members = list(conn.query("select m.* from source_company_member_rel r join source_member m on m.id=r.sourceMemberId where r.sourceCompanyId=%s", source_company["id"]))
        source_member_names = set(m["name"] for m in source_members)

        # Investors (already mapped to an investorId) behind the source
        # company's funding rounds.
        source_investor_ids = set()
        source_fundings = list(conn.query("select * from source_funding where sourceCompanyId=%s",source_company["id"]))
        for sf in source_fundings:
            rels = list(conn.query("select * from source_funding_investor_rel where sourceFundingId=%s",sf["id"]))
            for rel in rels:
                source_investor = conn.get("select * from source_investor where id=%s", rel["sourceInvestorId"])
                # The relation may point at a row that no longer exists.
                if source_investor is not None and source_investor["investorId"] is not None:
                    source_investor_ids.add(source_investor["investorId"])

        short_names = list(conn.query("select * from source_company_name where type=12020 and sourceCompanyId=%s", source_company["id"]))
        sns = [s["name"] for s in short_names]
        if source_company["name"] not in sns:
            sns.append(source_company["name"])

        for short_name in sns:
            if short_name is None or short_name.strip() == "":
                continue
            short_name = short_name.strip()

            logger.info("short_name: %s", short_name)
            cs = list(conn.query("select * from " + table_names["company"] +
                                 " where name=%s and (active is null or active !='N')", short_name))
            candidate_company_ids = [c["id"] for c in cs]

            aliases = list(conn.query("select a.companyId from " + table_names["company_alias"] +
                                      " a join " + table_names["company"] + " c on c.id=a.companyId " +
                                      "where (c.active is null or c.active!='N') and a.name=%s",short_name))
            candidate_company_ids.extend(alias["companyId"] for alias in aliases)

            for company_id in candidate_company_ids:
                company = conn.get("select * from " + table_names["company"] + " where id=%s and (active is null or active='Y')", company_id)
                if company is None:
                    continue

                # location
                location1 = source_company["locationId"]
                location2 = company["locationId"]
                # Explicit None test: comparing None with 0 raises on Python 3.
                if location1 is not None and location1 > 0 and location1 == location2:
                    logger.info("find_company_by_short_name, location")
                    return company_id

                # establishment date (year and month only)
                date1 = source_company["establishDate"]
                date2 = company["establishDate"]
                if date1 is not None and date2 is not None and \
                        date1.year == date2.year and date1.month == date2.month:
                    logger.info("find_company_by_short_name, establish date")
                    return company_id

                # member
                members = list(conn.query("select m.* from " + table_names["company_member_rel"] +
                                          " r join " + table_names["member"] + " m on m.id=r.memberId where r.companyId=%s", company_id))
                for member in members:
                    member_name = member["name"]
                    logger.info("member_name: %s", member_name)
                    if member_name is None or member_name == "":
                        continue
                    if not hz.is_chinese_string(member_name):
                        continue
                    if member_name in source_member_names:
                        logger.info("find_company_by_short_name, member")
                        return company_id

                # gongshang member
                # TODO

                # funding
                fundings = list(conn.query("select * from " + table_names["funding"] + " where companyId=%s",company_id))
                for f in fundings:
                    rels = list(conn.query("select * from " + table_names["funding_investor_rel"] + " where fundingId=%s",f["id"]))
                    for rel in rels:
                        if rel["investorId"] in source_investor_ids:
                            logger.info("find_company_by_short_name, funding")
                            return company_id
        return None
    finally:
        # Runs on every exit path, including early returns and exceptions.
        conn.close()
Ejemplo n.º 9
0
def find_company_by_short_name(company):
    """Find another company that shares a short name with ``company``.

    The names tried are ``company["name"]`` and, when the module-level
    ``caflag`` is True, the company's ``company_alias`` rows of type 12020.
    For each name, candidates are other companies whose name (or, with
    ``caflag``, alias) equals it, rows with ``active = 'N'`` excluded,
    visited from highest id to lowest.  A candidate is accepted as soon as
    one of these checks passes, in this order:

    1. same positive ``locationId``
    2. same establishment year and month
    3. a member with a Chinese name that is also a member of ``company``
    4. an investor shared through any funding round

    :param company: row-like mapping; ``id``, ``name``, ``locationId`` and
        ``establishDate`` are read.
    :return: id of the first accepted candidate, or None if there is none.
    """
    global caflag
    logger.info("find_company_by_short_name")

    conn = db.connect_torndb()
    try:
        members = list(
            conn.query(
                "select m.* from company_member_rel r join member m on m.id=r.memberId where r.companyId=%s",
                company["id"]))
        member_names = set(m["name"] for m in members)

        # Investors behind this company's own funding rounds.
        investor_ids = set()
        fundings = list(
            conn.query("select * from funding where companyId=%s", company["id"]))
        for f in fundings:
            rels = list(
                conn.query("select * from funding_investor_rel where fundingId=%s",
                           f["id"]))
            for rel in rels:
                investor_ids.add(rel["investorId"])

        sns = []
        # add company_alias into checking list
        if caflag is True:
            short_names = list(
                conn.query(
                    "select * from company_alias where type=12020 and companyId=%s",
                    company["id"]))
            sns.extend(s["name"] for s in short_names)

        if company["name"] not in sns:
            sns.append(company["name"])

        for short_name in sns:
            if short_name is None or short_name.strip() == "":
                continue
            short_name = short_name.strip()

            logger.info("short_name: %s", short_name)
            cs = list(
                conn.query(
                    "select * from company where name=%s and (active is null or active !='N') and id!=%s order by id desc ",
                    short_name, company["id"]))
            candidate_company_ids = [c["id"] for c in cs]

            # add caflag into checking list
            if caflag is True:
                aliases = list(
                    conn.query(
                        "select a.companyId from company_alias a join company c on c.id=a.companyId where (c.active is null or c.active!='N') and a.name=%s and c.id!=%s",
                        short_name, company["id"]))
                for alias in aliases:
                    if alias["companyId"] not in candidate_company_ids:
                        candidate_company_ids.append(alias["companyId"])

            # sort id
            candidate_company_ids.sort(reverse=True)

            for company_id in candidate_company_ids:
                company_candidate = conn.get(
                    "select * from company where id=%s and (active is null or active='Y')",
                    company_id)
                if company_candidate is None:
                    continue

                # location
                location1 = company["locationId"]
                location2 = company_candidate["locationId"]
                # Explicit None test: comparing None with 0 raises on Python 3.
                if location1 is not None and location1 > 0 and location1 == location2:
                    logger.info("find_company_by_short_name, location")
                    return company_id

                # establishment date (year and month only)
                date1 = company["establishDate"]
                date2 = company_candidate["establishDate"]
                if date1 is not None and date2 is not None and \
                        date1.year == date2.year and date1.month == date2.month:
                    logger.info("find_company_by_short_name, establish date")
                    return company_id

                # member
                members_candidate = list(
                    conn.query(
                        "select m.* from company_member_rel r join member m on m.id=r.memberId where r.companyId=%s",
                        company_id))
                for member_candidate in members_candidate:
                    member_name = member_candidate["name"]
                    if member_name is None or member_name == "":
                        continue
                    if not hz.is_chinese_string(member_name):
                        continue
                    if member_name in member_names:
                        logger.info("find_company_by_short_name, member")
                        return company_id

                # gongshang member
                # TODO

                # funding
                fundings_candidate = list(
                    conn.query("select * from funding where companyId=%s",
                               company_id))
                for fc in fundings_candidate:
                    rels = list(
                        conn.query(
                            "select * from funding_investor_rel where fundingId=%s",
                            fc["id"]))
                    for rel in rels:
                        if rel["investorId"] in investor_ids:
                            logger.info("find_company_by_short_name, funding")
                            return company_id
        return None
    finally:
        # Runs on every exit path, including early returns and exceptions.
        conn.close()
Ejemplo n.º 10
0
def process(url, key, content):
    """Parse one app detail page and persist the extracted record.

    Pages that do not contain the marker text '360安全中心' are ignored.
    Otherwise the embedded ``detail`` object and the page markup are parsed
    into an item dict, which is logged, passed to ``android.save`` and
    ``android.merge``, and ``LATEST`` is raised to ``key`` when ``key`` is
    larger.

    :param url: page URL, stored as the item's ``link``.
    :param key: numeric page key, stored as ``key_int`` and, stringified,
        as ``key``.
    :param content: raw page content; decoded as UTF-8 before HTML parsing.
    """
    global LATEST
    if '360安全中心' not in content:
        return

    r = r"var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    (b, ) = result
    # The embedded object uses single quotes; swap them so json can read it.
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    app_type = base["type"]
    package = base["pname"].strip()

    d = pq(html.fromstring(content.decode("utf-8")))
    desc = ""
    try:
        desc = d('div.breif').text().strip()
        # Keep only the text before the "basic information" section.
        desc = desc.split("【基本信息】")[0].strip()
    except Exception:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except Exception:
            pass

    # Cells of the base-info table, read by position below.
    cells = d('div.base-info> table> tbody> tr> td')

    author = cells.eq(0).contents()[1].strip()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    # NOTE(review): the parsed author is discarded here, so the item always
    # stores author=None -- confirm this is intentional.
    author = None

    modify_date_str = cells.eq(1).contents()[1].strip()
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")

    versionname = None
    try:
        versionname = cells.eq(2).contents()[1].strip()
        if versionname.startswith("V"):
            # Drop only the leading "V"; later ones belong to the version.
            versionname = versionname[1:]
    except Exception:
        pass

    compatibility = cells.eq(3).contents()[1].strip()
    language = cells.eq(4).contents()[1].strip()

    if language == "其他":
        if hz.is_chinese_string(desc):
            language = "中文"

    icon = d('div#app-info-panel> div> dl> dt >img').attr("src").strip()

    screenshots = []
    try:
        screenshots = d('div#scrollbar').attr("data-snaps").split(",")
    except Exception:
        pass

    commentbyeditor = None
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result

    updates = None
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()

    tags = d("div.app-tags> a").text().replace(" ", ",")

    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)

    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace(
        "次", "").replace("+", "").strip()
    download = None
    # Unit suffixes: thousand, ten thousand, hundred million.
    download_units = (("千", 1000), ("万", 10000), ("亿", 10000 * 10000))
    try:
        for unit, factor in download_units:
            if downloadstr.endswith(unit):
                download = float(downloadstr.replace(unit, "")) * factor
                break
        else:
            download = int(downloadstr)
    except Exception:
        # Best effort: a malformed count leaves download as None.
        traceback.print_exc()

    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app_type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)

    if LATEST < key:
        LATEST = key