Esempio n. 1
0
def classify_android_black():
    """Tag companies whose Android artifacts show a download explosion.

    For every ``(cid, aid, score, source)`` row returned by
    ``dbutil.get_android_explosion`` the company gets tag 309126, the
    artifact is marked as processed, a tag comment and a company message are
    written, and ``{"id": cid}`` is published to the ``keyword_v2`` topic.
    Companies established before 2006 are skipped.

    A failure while handling one company is logged with its traceback and
    does not stop the remaining companies from being processed.
    """

    global logger_yl, producer_tag
    init_kafka()
    db = dbcon.connect_torndb()
    for cid, aid, score, source in dbutil.get_android_explosion(db):
        try:
            # The establish-date check lives inside the try block so that a
            # company without a usable date is logged instead of aborting
            # the whole batch.
            if dbutil.get_company_establish_date(db, cid).year < 2006:
                continue
            # 309126: download explosion tag
            dbutil.update_company_tag(db, cid, 309126, score, "Y")
            dbutil.mark_android_explosion(db, aid)
            dbutil.update_company_tag_comment(db, cid, 309126, 30, aid, source)
            msg = u'%s旗下Android产品近期下载量激增' % dbutil.get_company_name(db, cid)
            dbutil.update_continuous_company_message(db, cid, msg, 3201, 30,
                                                     aid, 14, source)
            producer_tag.send_messages("keyword_v2", json.dumps({"id": cid}))
            logger_yl.info(
                'Android Explosion Artifact: company %s, artifact %s',
                cid, aid)
        except Exception:
            # Best effort per company: record the failure and move on.
            logger_yl.exception(
                'Failed Android Explosion Artifact: company %s, artifact %s ',
                cid, aid)
Esempio n. 2
0
    def get_similar(self, cid):
        """Compute, store and return the similar-company candidates of ``cid``.

        Steps:
            1. pooling: companies known to the index (``self.id2in``) are
               looked up in ``self.simi``; others go through
               ``self.get_similar4new``.
            2. discount: every weight is multiplied by the establish-year
               discount between ``cid`` and the candidate.
            3. sort and filter: candidates are ordered by descending weight
               and kept only if their weight exceeds
               ``self.min_similarity_threshold`` and their company score
               exceeds the module-level ``complete_threshold``.
            4. the result is upserted into ``self.mongo.comps.candidates``.

        Returns:
            list of ``(company_id, weight)`` tuples, best match first.
        """

        global complete_threshold

        # pooling
        if cid in self.id2in:
            vec = self.model[self.corpus[self.id2in[cid]]]
            ranked = sorted(self.simi[vec], key=lambda hit: -hit[1])
            # the best hit is dropped (presumably the company itself)
            simis = [(self.in2id[hit[0]], round(hit[1], 2))
                     for hit in ranked[1:self.num_candidates]]
        else:
            simis = self.get_similar4new(cid)

        # discount
        establish = dbutil.get_company_establish_date(self.db, cid).year
        simis = [(cid2, weight * self.__discount_year(establish, cid2))
                 for (cid2, weight) in simis]

        # sort and filter; the in-memory weight test runs first so the
        # database score lookup is only paid for candidates that can pass
        simis = sorted(simis, key=lambda pair: -pair[1])
        simis = [
            pair for pair in simis
            if pair[1] > self.min_similarity_threshold
            and dbutil.get_company_score(self.db, pair[0]) > complete_threshold
        ]

        # dump and exit
        self.mongo.comps.candidates.update(
            {'company': cid},
            {
                '$set': {
                    'candidates': simis,
                    'modifyTime': datetime.datetime.now()
                }
            },
            upsert=True)
        return simis
Esempio n. 3
0
def __source_database(db, mongo, yesterday, day_seven):
    """Register recently created companies as extract sources (type 67001).

    Candidate rows are active ``company`` / ``source_company`` pairs created
    after ``day_seven`` and modified after ``yesterday``.  A row is skipped
    when any of the following holds:
        * the company round is above 1040
        * the company was established before 2016
        * its corporate id appears in the ``newCorporateIds`` of a
          ``corporate_decompose`` task modified after ``day_seven``
        * its only source is 13050
    """

    # corporate ids produced by recent decompose tasks
    aggregates = set()
    recent_tasks = mongo.task.corporate_decompose.find(
        {'modifyTime': {'$gt': day_seven}})
    for decompose_task in recent_tasks:
        aggregates.update(decompose_task.get('newCorporateIds', []))

    sql = (
        'select company.id id, source_company.source source from company, source_company '
        'where company.createTime>%s and company.modifyTime>%s and company.id=source_company.companyId '
        'and (company.active is null or company.active="Y") and '
        '(source_company.active is null or source_company.active="Y");')
    for row in db.query(sql, day_seven, yesterday):
        if dbutil.get_company_round(db, row.id) > 1040:
            continue
        if dbutil.get_company_establish_date(db, row.id).year < 2016:
            continue
        if dbutil.get_company_corporate_id(db, row.id) in aggregates:
            continue
        if dbutil.get_company_source(db, row.id) == {13050}:
            continue
        dbutil.update_extract_source_company(
            db, 67001, row.source, row.id, only_insert=False)
Esempio n. 4
0
    def __extract_11120(self, cid):
        """Yield tag ids derived from a company's round, age and status.

        Yields 579089 when the company round lies in ``[0, 1040]`` and the
        company was established in 2010 or later, then the id mapped to the
        company status when that status is 2010, 2020 or 2025.
        """

        # early-stage company: round no later than series B, established in
        # 2010 or later
        company_round = dbutil.get_company_round(self.db, cid)
        if 0 <= company_round <= 1040:
            established = dbutil.get_company_establish_date(self.db, cid)
            if established.year >= 2010:
                yield 579089

        # company status (fund-raising etc.)
        # NOTE(review): 2015 is mapped here but never passes the guard
        # below -- confirm whether that exclusion is intentional.
        status_tags = {
            2010: 589014,
            2015: 589015,
            2020: 589016,
            2025: 589017,
        }
        status = dbutil.get_company_status(self.db, cid)
        if status in (2010, 2020, 2025):
            yield status_tags[status]
Esempio n. 5
0
def __source_news(db, mongo, today, yesterday):
    """Register companies mentioned in recent news as extract sources (67002).

    The news considered were created between ``yesterday`` and ``today``
    (both exclusive), have ``processStatus`` 1 and ``type`` 60001, do not
    carry feature 578362 and were not modified by user 139.

    Companies 449316 and 416649 are always registered.  Any other company is
    registered only if it is not in the exclusion set, its round is at most
    1040 and it was established in 2010 or later.

    NOTE(review): the exclusion set is collected with exactly the same query
    as the main loop, so every company id seen in the loop is already in
    it -- the general path looks unreachable.  Confirm the intended filter.
    """

    news_filter = {
        'createTime': {
            '$gt': yesterday,
            '$lt': today
        },
        'processStatus': 1,
        'type': 60001,
        'features': {
            '$ne': 578362
        },
        'modifyUser': {
            '$ne': 139
        }
    }

    # company ids to exclude, gathered in a first pass over the news
    bad_news = set()
    for article in mongo.article.news.find(news_filter):
        bad_news.update(article.get('companyIds', []))

    for record in mongo.article.news.find(news_filter):
        for cid in record.get('companyIds', []):
            if cid in (449316, 416649):
                dbutil.update_extract_source_company(db, 67002,
                                                     record['source'], cid,
                                                     record['_id'], False)
            if cid in bad_news:
                continue
            if dbutil.get_company_round(db, cid) > 1040:
                continue
            if dbutil.get_company_establish_date(db, cid).year < 2010:
                continue
            dbutil.update_extract_source_company(db, 67002, record['source'],
                                                 cid, record['_id'], False)
Esempio n. 6
0
    def create_single(self, db, cid):
        """
        Build the index document of a single company.

        Completion ids consist of a type prefix and the original id:
            cxxxx, fxxx, axxxx, pxxxx, nxxxx, standing for company, full,
            artifact, product, nick
            kxxxx, keyword

        Companies for which ``dbutil.get_company_index_type`` is falsy are
        logged and skipped.

        NOTE(review): the visible body only assembles the ``company`` and
        ``completion`` dicts; nothing is written or returned here, so the
        tail of this method may be missing from this excerpt.
        """

        # check whether to index this cid
        if not dbutil.get_company_index_type(db, cid):
            self.logger.info('should not index %s' % cid)
            return

        company = {}
        alias = set()
        company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)

        # the name is lower-cased and stripped of spaces once, here
        name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
        code = dbutil.get_company_code(db, cid)
        company['cid'] = code
        completion = {
            'id': cid,
            '_name': name,
            '_code': code,
            '_prompt': 'name',
        }

        # First, names
        # short name and its pinyin form
        alias.add(name)
        alias.add(''.join(lazy_pinyin(name)))
        # corporate full name, with and without the city prefix
        full = dbutil.get_company_corporate_name(db, cid, False)
        if full and full.strip():
            full_lower = full.lower()
            alias.add(full_lower)
            # TODO temp solution
            alias.add(full_lower.replace(u'北京', '').replace(u'上海', '')
                      .replace(u'深圳', ''))
        # artifact names; valid_name is evaluated once per name and falsy
        # results are dropped
        aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
        if aresults:
            alias.update(filter(
                None, (self.valid_name(aname) for _, aname in aresults)))
        # aliases, ignored when there are 20 or more of them
        aliass = dbutil.get_alias_idname(db, cid)
        if aliass and len(aliass) < 20:
            alias.update(filter(
                None, (self.valid_name(aname) for _, aname in aliass)))
        # corporate name
        corporate = dbutil.get_company_corporate_name(db, cid)
        if corporate and corporate.strip():
            alias.add(corporate.lower())
        # corporate aliases, ignored when there are 20 or more of them
        corporate_alias = dbutil.get_corporate_alias(db, cid)
        if corporate_alias and len(corporate_alias) < 20:
            alias.update(filter(
                None, (self.valid_name(aname) for aname in corporate_alias)))
        # check if there is a relevant digital coin
        dt = dbutil.get_company_digital_coin_info(db, cid)
        if dt:
            alias.add(dt.symbol.lower())
            # short name
            if dt.name:
                alias.add(dt.name.lower().replace(' ', ''))
            # english name
            if dt.enname:
                alias.add(dt.enname.lower())

        # create index names
        completion['completionName'] = list(alias)
        company['name'] = name
        company['alias'] = self.analyze_names(alias)

        # Second, team identify, investor identify
        team = self.identifier.identify(cid)
        if team:
            company['team'] = team
        if dbutil.exist_company_tag(db, cid, 309129):
            company['investor'] = 44010

        # Third, keywords
        # regular tags
        tags_info = dbutil.get_company_tags_idname(db,
                                                   cid,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            for _, tname, _ in tags_info:
                company.setdefault('tags', []).append(tname.lower())
        # yellow tags
        yellows = dbutil.get_company_tags_yellow(db, cid)
        if yellows:
            company['yellows'] = [yellow.lower() for yellow in yellows]

        # Fourth, description: segmented, without stopwords and
        # single-character tokens
        desc = dbutil.get_company_solid_description(db, cid)
        if desc and desc.strip():
            words = [
                word for word in self.seg.cut4search(desc)
                if word not in self.stopwords and len(word) > 1
            ]
            company['description'] = ' '.join(words).lower()

        # Fifth, round and investors and members
        company['round'] = dbutil.get_company_round(db, cid)
        company['investors'] = dbutil.get_company_investor_names(db, cid)
        company['members'] = [
            member for _, member in dbutil.get_member_idname(db, cid)
        ]

        # Sixth, location
        lid, _ = dbutil.get_company_location(db, cid)
        company['location'] = lid

        # Seventh, establish date as an integer YYYYMM
        establish_date = dbutil.get_company_establish_date(db, cid)
        try:
            company['established'] = int(establish_date.strftime('%Y%m'))
        except Exception:
            # best effort: when the date cannot be formatted (presumably it
            # can be missing) the field is simply left out
            pass
Esempio n. 7
0
    def track_topic_30(self, task):
        """
        First media coverage (topic 30).

        Nothing happens when the news item of ``task['news_id']`` is older
        than ``self.news_timeliness`` days or carries feature 578349
        (funding news).  Otherwise, for every company of the task that was
        established within the last five years and is referenced by exactly
        one news item, a topic message and a topic company are written.  When
        the topic is auto-published, both are also announced on the
        ``track_message_v2`` topic.
        """

        global producer_news_task
        news = list(
            self.mongo.article.news.find({'_id':
                                          ObjectId(task['news_id'])}))[0]
        now = datetime.now()
        if news.get('date') and news['date'] < (
                now - timedelta(days=self.news_timeliness)):
            return
        # funding news is excluded
        if 578349 in news.get('features', []):
            return

        establish_cutoff = (now - timedelta(days=365 * 5)).date()
        for cid in task.get('companyIds', []):
            # skip companies established more than 5 years ago
            if dbutil.get_company_establish_date(self.db,
                                                 cid) < establish_cutoff:
                continue
            # the company must be referenced by exactly one news item; two
            # documents are enough to decide, so the cursor is capped
            mentions = self.mongo.article.news.find({'companyIds':
                                                     cid}).limit(2)
            if len(list(mentions)) != 1:
                continue

            active = 'Y' if dbutil.get_topic_auto_pubilsh_status(
                self.db, 30) == 'Y' else 'P'
            tpm = dbutil.update_topic_message(self.db, 30,
                                              news.get('title', ''),
                                              active, 10, task['news_id'])
            tpc = dbutil.update_topic_company(self.db, 30, cid, active)
            if tpm:
                dbutil.update_topic_message_company(self.db, tpm, tpc)
            if active != 'Y':
                continue

            payloads = [
                json.dumps({
                    'id': tpm,
                    'type': 'topic_message',
                    'action': 'create'
                }),
                json.dumps({
                    'id': tpc,
                    'type': 'topic_company',
                    'action': 'create'
                }),
            ]
            try:
                for payload in payloads:
                    producer_news_task.send_messages("track_message_v2",
                                                     payload)
            except FailedPayloadsError:
                # presumably init_kafka() rebuilds the global producer; both
                # messages are then sent again, once
                init_kafka()
                for payload in payloads:
                    producer_news_task.send_messages("track_message_v2",
                                                     payload)
Esempio n. 8
0
    def __discount_year(self, establish, cid2):
        """Return the weight factor for the establish-year gap to ``cid2``.

        The factor is ``self.establish_discount`` when the establish year of
        ``cid2`` differs from ``establish`` by more than 5 years, else 1.
        """

        other_year = dbutil.get_company_establish_date(self.db, cid2).year
        if abs(other_year - establish) > 5:
            return self.establish_discount
        return 1
Esempio n. 9
0
    def create_single(self, db, cid):
        """Build the universal-index document of a single company.

        Companies for which ``dbutil.get_company_index_type`` is falsy are
        logged and skipped.  Otherwise a ``company`` dict is assembled from
        names and aliases, tags, industries, topics, sectors, description,
        round, status, investors, members, location and establish date.

        NOTE(review): the visible body only assembles the ``company`` dict;
        nothing is written or returned here, so the tail of this method may
        be missing from this excerpt.
        """

        global logger_universal_index
        # check whether to index this cid
        if not dbutil.get_company_index_type(db, cid):
            logger_universal_index.info('should not index %s' % cid)
            return

        company = {}
        alias = set()
        company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)

        # the name is lower-cased and stripped of spaces once, here
        name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
        code = dbutil.get_company_code(db, cid)
        company['id'] = code

        # short name and its pinyin form
        alias.add(name)
        alias.add(''.join(lazy_pinyin(name)))
        # corporate full name, with and without the city prefix
        full = dbutil.get_company_corporate_name(db, cid, False)
        if full and full.strip():
            full_lower = full.lower()
            alias.add(full_lower)
            alias.add(full_lower.replace(u'北京', '').replace(u'上海', '')
                      .replace(u'深圳', '').replace(u'成都', ''))
        # artifact names; valid_name is evaluated once per name and falsy
        # results are dropped
        aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
        if aresults:
            alias.update(filter(
                None, (self.valid_name(aname) for _, aname in aresults)))
        # aliases, ignored when there are 20 or more of them
        aliass = dbutil.get_alias_idname(db, cid)
        if aliass and len(aliass) < 20:
            alias.update(filter(
                None, (self.valid_name(aname) for _, aname in aliass)))
        # corporate name
        corporate = dbutil.get_company_corporate_name(db, cid)
        if corporate and corporate.strip():
            alias.add(corporate.lower())
        # corporate aliases, ignored when there are 20 or more of them
        corporate_alias = dbutil.get_corporate_alias(db, cid)
        if corporate_alias and len(corporate_alias) < 20:
            alias.update(filter(
                None, (self.valid_name(aname) for aname in corporate_alias)))
        # check if there is a relevant digital coin
        dt = dbutil.get_company_digital_coin_info(db, cid)
        if dt:
            alias.add(dt.symbol.lower())
            # short name
            if dt.name:
                alias.add(dt.name.lower().replace(' ', ''))
            # english name
            if dt.enname:
                alias.add(dt.enname.lower())

        # create index names
        company['name'] = name
        company['alias'] = self.analyze_names(alias)

        # tags: names go to 'tags', ids to 'features'
        tags_info = dbutil.get_company_tags_idname(db,
                                                   cid,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            for tid, tname, _ in tags_info:
                company.setdefault('tags', []).append(tname.lower())
                company.setdefault('features', []).append(tid)
        # industries and topics share the 'nested_tag' list
        nested_tags = company['nested_tag'] = []
        for industry in dbutil.get_company_industries(db, cid):
            nested_tags.append({
                'id': industry.industryId,
                'published': industry.publishTime,
                "category": "industry"
            })
        for topic in dbutil.get_company_topics(db, cid):
            msg_publish = dbutil.get_topic_message_company_publish(db, topic)
            nested_tags.append({
                'id': topic.topicId,
                'published': msg_publish,
                "category": "topic"
            })
            topic_tag = self.topic_tags.get(topic.topicId)
            if topic_tag:
                company.setdefault('tags', []).append(topic_tag.lower())
        company['sector'] = dbutil.get_company_sector_tag(db, cid)

        # description: segmented, without stopwords and single-character
        # tokens
        desc = dbutil.get_company_solid_description(db, cid)
        if desc and desc.strip():
            words = [
                word for word in self.seg.cut4search(desc)
                if word not in self.stopwords and len(word) > 1
            ]
            company['description'] = ' '.join(words).lower()

        # round and investors and members; a round of 0 is stored as 1000
        company_round = dbutil.get_company_round(db, cid)
        company['round'] = 1000 if company_round == 0 else company_round
        company['sort_round'] = dbutil.get_round_sort(db, company['round'])
        # status: 2020 / 2025 are kept, otherwise -1 when
        # get_company_ipo_status is truthy and -2 when it is not
        status = dbutil.get_company_status(db, cid)
        if status in {2020, 2025}:
            company['status'] = status
        elif dbutil.get_company_ipo_status(db, cid):
            company['status'] = -1
        else:
            company['status'] = -2
        company['investors'] = dbutil.get_company_investor_names(db, cid)
        company['investorId'] = dbutil.get_company_investors(db, cid)
        company['members'] = [
            member for _, member in dbutil.get_member_idname(db, cid)
        ]

        # location
        lid, _ = dbutil.get_company_location(db, cid)
        company['location'] = lid

        # establish date as an integer YYYYMM
        establish_date = dbutil.get_company_establish_date(db, cid)
        try:
            company['established'] = int(establish_date.strftime('%Y%m'))
        except Exception:
            # best effort: when the date cannot be formatted (presumably it
            # can be missing) the field is simply left out
            pass