def __init__(self, method='default'):
    """Open the database handles and load the candidate push pool.

    `method` selects where the candidate company ids come from:
    'controlled' reads them with dbutil.get_all_push_pool, 'default' takes
    them from PushScorer.promote_general. Any other value loads no
    candidates in this constructor.
    """
    self.db = dbconn.connect_torndb()
    self.mongo = dbconn.connect_mongo()
    # self.scorer = CompanyTagsRelevance()
    self.scorer = CompanyUserRelevance()
    self.daily_recommendation_size = 2
    self.pool_size = 100

    if method in ('controlled', 'default'):
        self.general_pusher = PushScorer()
        if method == 'controlled':
            self.candidates = dbutil.get_all_push_pool(self.db)
        else:
            self.candidates = self.general_pusher.promote_general(self.db)
        self.__update_push_pool()

    # rounds and locations
    # NOTE(review): this runs for every `method`; with a value other than
    # 'controlled'/'default' self.candidates is never assigned above, so the
    # lookup below fails unless the class defines it elsewhere - confirm.
    rounds = {}
    for cid in self.candidates:
        rounds[cid] = dbutil.get_company_round(self.db, cid)
    self.rounds = rounds

    locations = {}
    for cid in self.candidates:
        # only the first element of the location result is kept
        locations[cid] = dbutil.get_company_location(self.db, cid)[0]
    self.locations = locations
def new_dominator(self, today=None, type='free', genre=None):
    """Find companies whose iOS app has newly become a chart dominator.

    An app (trackId) is a candidate when `trend.appstore_rank` holds at
    least 7 records for it with rank <= 10, matching `type` and `genre`,
    dated in the window (today - 8 days, today]. A candidate is kept when,
    in the earlier window (today - 90 days, today - 8 days], it has no
    records at all, or the last of its (up to) ten best-ranked records
    still has rank > 30.

    For every artifact linked to a kept app whose company round is below
    1060, one line is written to `logger_track` and the company is added
    to the result.

    :param today: reference datetime; defaults to datetime.today().
    :param type: value matched against the record's 'type' field.
    :param genre: value matched against the record's 'genre' field.
    :return: set of (company id, company name, app name) tuples.
    """
    today = today or datetime.today()
    one_week_before = today - timedelta(days=8)
    three_month_before = today - timedelta(days=90)
    ranks = self.mongo.trend.appstore_rank

    dominate_domain = [item['_id'] for item in ranks.aggregate([
        {'$match': {'date': {'$gt': one_week_before, '$lte': today},
                    'rank': {'$lte': 10}, 'type': type, 'genre': genre}},
        {'$group': {'_id': '$trackId', 'times': {'$sum': 1}}},
        {'$match': {'times': {'$gte': 7}}}])]

    def never_dominate_before(track_id):
        # Ascending sort on rank: the list holds the best historical ranks.
        top_rank = list(ranks.find(
            {'trackId': track_id, 'type': type, 'genre': genre,
             'date': {'$gt': three_month_before, '$lte': one_week_before}}
        ).sort([('rank', 1)]).limit(10))
        return top_rank[-1]['rank'] > 30 if top_rank else True

    # A list comprehension filters eagerly on both Python 2 and Python 3.
    new_dominate_domain = [track_id for track_id in dominate_domain
                           if never_dominate_before(track_id)]

    new_dominator = set()
    for track_id in new_dominate_domain:
        for aid in dbutil.get_artifacts_from_iOS(self.db, track_id):
            cid = dbutil.get_artifact_company(self.db, aid)
            corp_round = dbutil.get_company_round(self.db, cid)
            if corp_round < 1060:
                # Bind the id as a query parameter instead of formatting it
                # into the SQL text, so the driver does the escaping.
                app_name = self.db.query(
                    'select name from artifact where id = %s', aid)[0]['name']
                company_name = dbutil.get_company_name(self.db, cid)
                logger_track.info(
                    '\nDate: %s Genre: %s Type: %s\nDominator: %s Company: %s\n\n'
                    % (today, genre, type, app_name, company_name))
                new_dominator.add((cid, company_name, app_name))
    return new_dominator
def update_3109(self, today=None):
    """Emit company message 3109 for apps that recently stand out on the App Store.

    For every chart type ('free', 'charge', 'grossing') and every genre
    returned by self.__get_genres(), an app qualifies when it has at least
    7 `trend.appstore_rank` records with rank <= 10 in the window
    (today - 8 days, today], and in the earlier window
    (today - 90 days, today - 8 days] it has either no records or the last
    of its (up to) ten best-ranked records has rank > 30.

    For each artifact of a qualifying app whose company round is below
    1060, a message is stored through
    dbutil.update_continuous_company_message and one line is logged to
    `logger_track`.

    :param today: reference datetime; defaults to datetime.today().
    """
    today = today or datetime.today()
    recent_start = today - timedelta(days=8)
    history_start = today - timedelta(days=90)
    rank_types = ['free', 'charge', 'grossing']
    genres = self.__get_genres()

    for t in rank_types:
        for g in genres:
            pipeline = [
                {'$match': {'date': {'$gt': recent_start, '$lte': today},
                            'rank': {'$lte': 10}, 'type': t, 'genre': g}},
                {'$group': {'_id': '$trackId', 'times': {'$sum': 1}}},
                {'$match': {'times': {'$gte': 7}}},
            ]
            candidates = [
                doc['_id']
                for doc in self.mongo.trend.appstore_rank.aggregate(pipeline)
            ]

            def previous_perform_poorly(track_id):
                # Best historical ranks first; called only inside this
                # iteration, so t and g are the current type and genre.
                history = self.mongo.trend.appstore_rank.find({
                    'trackId': track_id, 'type': t, 'genre': g,
                    'date': {'$gt': history_start, '$lte': recent_start},
                }).sort([('rank', 1)]).limit(10)
                best = list(history)
                if not best:
                    return True
                return best[-1]['rank'] > 30

            outstanding = [tid for tid in candidates
                           if previous_perform_poorly(tid)]

            for track_id in outstanding:
                for aid in dbutil.get_artifacts_from_iOS(self.db, track_id):
                    cid = dbutil.get_artifact_company(self.db, aid)
                    corp_round = dbutil.get_company_round(self.db, cid)
                    if corp_round < 1060:
                        company_name = dbutil.get_company_name(self.db, cid)
                        app_name = self.__normalize_iOS_name(
                            dbutil.get_artifact_name(self.db, aid))
                        rank_name = self.__get_rank_name(g, t)
                        msg = u'%s旗下 %s 近期在AppStore%s排名表现突出' % (
                            company_name, app_name, rank_name)
                        detail = '%s,%s' % (g, t)
                        dbutil.update_continuous_company_message(
                            self.db, cid, msg, 3109, 30, aid, 7, detail)
                        logger_track.info(
                            '3109, %s, %s, %s, %s' % (cid, aid, t, g))
def __source_database(db, mongo, yesterday, day_seven):
    """Record recently created companies as extract source 67001.

    Selects active (company, source_company) pairs where the company was
    created after `day_seven` and modified after `yesterday`. A pair is
    skipped when the company round is above 1040, the establish year is
    before 2016, the corporate id appears in a recent corporate_decompose
    task (modifyTime > day_seven), or the company's source set is exactly
    {13050}. Every remaining pair is passed to
    dbutil.update_extract_source_company.
    """
    decompose_tasks = mongo.task.corporate_decompose.find(
        {'modifyTime': {'$gt': day_seven}})
    aggregates = set(chain.from_iterable(
        task.get('newCorporateIds', []) for task in decompose_tasks))

    rows = db.query(
        'select company.id id, source_company.source source from company, source_company '
        'where company.createTime>%s and company.modifyTime>%s and company.id=source_company.companyId '
        'and (company.active is null or company.active="Y") and '
        '(source_company.active is null or source_company.active="Y");',
        day_seven, yesterday)

    for row in rows:
        # `or` short-circuits, so the lookups run in this order and stop at
        # the first rule that excludes the company.
        excluded = (
            dbutil.get_company_round(db, row.id) > 1040
            or dbutil.get_company_establish_date(db, row.id).year < 2016
            or dbutil.get_company_corporate_id(db, row.id) in aggregates
            or dbutil.get_company_source(db, row.id) == {13050}
        )
        if not excluded:
            dbutil.update_extract_source_company(
                db, 67001, row.source, row.id, only_insert=False)
def __extract_11120(self, cid):
    """Yield the ids that apply to company `cid`.

    Yields 579089 when the company round is within [0, 1040] and the
    establish year is 2010 or later, then yields the id mapped from the
    company status when that status is 2010, 2020 or 2025.
    """
    # early-stage company: round at or before Series B, established in
    # 2010 or later
    company_round = dbutil.get_company_round(self.db, cid)
    if 0 <= company_round <= 1040:
        if dbutil.get_company_establish_date(self.db, cid).year >= 2010:
            yield 579089

    # company status, e.g. currently fundraising
    status = dbutil.get_company_status(self.db, cid)
    # 2015 has a mapping but is not in the accepted statuses below, so it
    # never yields anything.
    status_ids = {2010: 589014, 2015: 589015, 2020: 589016, 2025: 589017}
    if status in (2010, 2020, 2025):
        yield status_ids[status]
def __source_news(db, mongo, today, yesterday):
    """Record companies mentioned in yesterday's news as extract source 67002.

    Looks at article.news documents created between `yesterday` and
    `today` (both exclusive) with processStatus 1, type 60001, without
    feature 578362 and not modified by user 139. Companies 449316 and
    416649 are always recorded. Any other company is recorded only when it
    is not in `bad_news`, its round is at most 1040 and its establish year
    is 2010 or later.

    NOTE(review): `bad_news` is collected with exactly the same filter as
    the main loop, so (unless the collection changes between the two
    queries) every company id seen in the loop is already in `bad_news`
    and the final update is never reached. The exclusion query was
    presumably meant to use a different filter - confirm.
    """
    def recent_news():
        # Builds a fresh cursor per call; the query is issued twice.
        return mongo.article.news.find({
            'createTime': {'$gt': yesterday, '$lt': today},
            'processStatus': 1,
            'type': 60001,
            'features': {'$ne': 578362},
            'modifyUser': {'$ne': 139},
        })

    bad_news = set(chain.from_iterable(
        r.get('companyIds', []) for r in recent_news()))

    for record in recent_news():
        for cid in record.get('companyIds', []):
            if cid in (449316, 416649):
                dbutil.update_extract_source_company(
                    db, 67002, record['source'], cid, record['_id'], False)
            # `or` short-circuits, so the database lookups only run for
            # companies that pass the earlier checks.
            if (cid in bad_news
                    or dbutil.get_company_round(db, cid) > 1040
                    or dbutil.get_company_establish_date(db, cid).year < 2010):
                continue
            dbutil.update_extract_source_company(
                db, 67002, record['source'], cid, record['_id'], False)
def init_filter(self):
    """Compute per-fund statistics over each fund's portfolio.

    Portfolios are loaded with dbutil.get_investor_portfilio for the date
    range ('2016-01-01', '2017-10-31'), one per key of self.funds.

    :return: a tuple of three dicts keyed by investor id:
        fund_rounds - np.mean of the portfolio companies' rounds,
        fund_activeness - number of portfolio entries,
        fund_locations - True when more than half of the portfolio
        companies have a location id below 371.
    """
    period = ('2016-01-01', '2017-10-31')
    portfolios = {}
    for iid in self.funds.keys():
        portfolios[iid] = dbutil.get_investor_portfilio(self.db, iid, period)

    fund_rounds = {}
    for iid, holdings in portfolios.items():
        rounds = [dbutil.get_company_round(self.db, p.cid) for p in holdings]
        fund_rounds[iid] = np.mean(rounds)

    fund_activeness = dict(
        (iid, len(holdings)) for iid, holdings in portfolios.items())

    fund_locations = {}
    for iid, holdings in portfolios.items():
        below_371 = sum(
            1 for p in holdings
            if dbutil.get_company_location(self.db, p.cid)[0] < 371)
        # strict majority of the portfolio
        fund_locations[iid] = below_371 * 2 > len(holdings)

    return fund_rounds, fund_activeness, fund_locations
def __source_module_71001(db, mongo, yesterday, day_seven):
    """Feed custom sourcing module 71001 from finished company tasks.

    Iterates task.company documents with finishTime >= `yesterday`,
    processStatus 1 and type 'company_job'. For each task whose company is
    active ('Y') and whose round is at most 1040, the company is passed to
    dbutil.update_custom_sourcing_company (module 71001, with `day_seven`).
    Companies whose source set is exactly {13050} are additionally recorded
    through dbutil.update_extract_source_company (67001, source 13050).

    The earlier corporate-decompose exclusion is disabled in this function,
    so the Mongo scan that collected those corporate ids was dropped: it
    loaded every recent corporate_decompose task on each call without the
    result ever being read.

    :param db: SQL connection handed to the dbutil helpers.
    :param mongo: Mongo client; only task.company is read.
    :param yesterday: lower bound (inclusive) for the task finishTime.
    :param day_seven: passed through to update_custom_sourcing_company.
    """
    finished_tasks = mongo.task.company.find({
        'finishTime': {'$gte': yesterday},
        'processStatus': 1,
        'types': 'company_job',
    })
    for tc in finished_tasks:
        cid = tc.get('companyId')
        if dbutil.get_company_active(db, cid) == 'Y':
            if dbutil.get_company_round(db, cid) > 1040:
                continue
            # Disabled filters kept for reference:
            # if dbutil.get_company_establish_date(db, cid).year < 2000:
            #     continue
            # if dbutil.get_company_corporate_id(db, cid) in aggregates:
            #     continue
            if dbutil.get_company_source(db, cid) == {13050}:
                dbutil.update_extract_source_company(
                    db, 67001, 13050, cid, only_insert=False)
            # NOTE(review): assumed to run for every active company that
            # passes the round check, not only for source {13050} - confirm
            # the intended nesting.
            dbutil.update_custom_sourcing_company(db, cid, 71001, day_seven)
def create_single(self, db, cid):
    """Build the completion-index fields for one company.

    Completion ids consist of a type prefix and the original id: cxxxx,
    fxxx, axxxx, pxxxx, nxxxx and kxxxx stand for company, full, artifact,
    product, nick and keyword.

    Returns early (after logging) when dbutil.get_company_index_type says
    the company should not be indexed. Otherwise fills two dicts:
    `completion` (id, name, code, prompt and the list of completion names)
    and `company` (ranking score, code, names/aliases, team, investor flag,
    tags, description, round, investors, members, location and establish
    month).

    NOTE(review): the visible part of this method ends after the
    establish-date step without using `completion` or `company` - confirm
    what follows.

    :param db: SQL connection handed to the dbutil helpers.
    :param cid: company id.
    """
    # check whether to index this cid
    if not dbutil.get_company_index_type(db, cid):
        self.logger.info('should not index %s' % cid)
        return
    company = {}
    alias = set()
    company_score = dbutil.get_company_score(db, cid, 37020)
    company['ranking_score'] = company_score
    # the company name is lower-cased and stripped of spaces before use
    name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
    code = dbutil.get_company_code(db, cid)
    company['cid'] = code
    completion = {
        'id': cid,
        '_name': name,
        '_code': code,
        '_prompt': 'name',
    }

    # First, Names
    # short name, plus its pinyin transliteration joined without separators
    alias.add(name.lower())
    alias.add(''.join(lazy_pinyin(name.lower())))
    # full name
    full = dbutil.get_company_corporate_name(db, cid, False)
    if full and full.strip():
        alias.add(full.lower())
        # TODO temp solution: also index the full name with the city names
        # Beijing / Shanghai / Shenzhen removed
        alias.add(full.lower().replace(u'北京', '').replace(u'上海', '').replace(u'深圳', ''))
    # artifact name (only names accepted by self.valid_name are kept)
    aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
    if aresults:
        alias.update([
            self.valid_name(aname) for _, aname in aresults
            if self.valid_name(aname)
        ])
    # alias (ignored entirely when there are 20 or more)
    aliass = dbutil.get_alias_idname(db, cid)
    if aliass and len(aliass) < 20:
        alias.update([
            self.valid_name(aname) for _, aname in aliass
            if self.valid_name(aname)
        ])
    # corporate name
    corporate = dbutil.get_company_corporate_name(db, cid)
    if corporate and corporate.strip():
        alias.add(corporate.lower())
    # corporate full name (same lookup as `full` above)
    corporate_full = dbutil.get_company_corporate_name(db, cid, False)
    if corporate_full and corporate_full.strip():
        alias.add(corporate_full.lower())
    # corporate alias (ignored entirely when there are 20 or more)
    corporate_alias = dbutil.get_corporate_alias(db, cid)
    if corporate_alias and len(corporate_alias) < 20:
        alias.update([
            self.valid_name(aname) for aname in corporate_alias
            if self.valid_name(aname)
        ])
    # check if there is a relevant digital coin
    dt = dbutil.get_company_digital_coin_info(db, cid)
    if dt:
        alias.add(dt.symbol.lower())
        # short name
        if dt.name:
            alias.add(dt.name.lower().replace(' ', ''))
        # english name
        if dt.enname:
            alias.add(dt.enname.lower())
    # create indice names
    completion['completionName'] = list(alias)
    company['name'] = name.lower()
    company['alias'] = self.analyze_names(alias)

    # Second, team identify, investor identify
    team = self.identifier.identify(cid)
    if team and len(team) > 0:
        company['team'] = team
    if dbutil.exist_company_tag(db, cid, 309129):
        company['investor'] = 44010

    # Third, keywords
    # regular tag
    tags_info = dbutil.get_company_tags_idname(db, cid, tag_out_type=(11000, 11001, 11002))
    if tags_info:
        for tid, tname, weight in tags_info:
            company.setdefault('tags', []).append(tname.lower())
    # yellows, --> forget y take this out
    yellows = dbutil.get_company_tags_yellow(db, cid)
    if yellows:
        company['yellows'] = [yellow.lower() for yellow in yellows]

    # Forth, description: segmented, with stopwords and one-character
    # tokens removed, then re-joined with spaces
    desc = dbutil.get_company_solid_description(db, cid)
    if desc and desc.strip():
        desc = filter(lambda x: (x not in self.stopwords) and len(x) > 1, list(self.seg.cut4search(desc)))
        company['description'] = (' '.join(desc)).lower()

    # Fifth, round and investors and members
    company['round'] = dbutil.get_company_round(db, cid)
    company['investors'] = dbutil.get_company_investor_names(db, cid)
    # NOTE(review): on Python 2 this list comprehension rebinds the local
    # `name` used above; it is not read again in the visible code.
    company['members'] = [
        name for _, name in dbutil.get_member_idname(db, cid)
    ]

    # Sixth, location
    lid, lname = dbutil.get_company_location(db, cid)
    company['location'] = lid

    # Seventh, establish date, create date, count of company message
    establish_date = dbutil.get_company_establish_date(db, cid)
    try:
        # stored as an integer of the form YYYYMM
        company['established'] = int(establish_date.strftime('%Y%m'))
    except Exception, e:
        # best effort: any failure here leaves 'established' unset
        pass
def __process_step1(self, record):
    """Run step 1 of news processing for one news task record.

    In order, as far as the code shows:
      1. If the record's processStatus is not 1, mark the article with
         processStatus -1 and stop. Records with source 'gongshang' are
         ignored.
      2. Copy category, company ids, investor ids, modifyUser and
         categories onto the article.
      3. Collect `features` from categories, sentiment, optionally
         newsTags, and the verified tags of the mentioned companies.
      4. Unless sectors could be derived from the company tags, queue a
         'step2' task (once per task record).
      5. Mark the article processed, store features, re-tag the mentioned
         companies, apply tag 599843 to companies with three or more
         articles carrying feature 578354, and dispatch tracker messages.
      6. Publish a message to the "task_company" Kafka topic, re-creating
         the producer once via init_kafka() on FailedPayloadsError.

    NOTE(review): statement nesting in this method was reconstructed from
    a flattened source; confirm the extent of the `len(news_sectors) <= 3`
    and `not sector_update_flag` branches.

    :param record: news task document; reads 'news_id', '_id',
        'modifyUser' and the optional keys fetched with .get() below.
    """
    global producer_news_task

    # not news
    if record.get('processStatus', 0) != 1:
        self.mongo.article.news.update(
            {'_id': ObjectId(record['news_id'])},
            {'$set': {
                'processStatus': -1
            }})
        return
    if record.get('source', 0) == 'gongshang':
        return

    # update article news
    category = self.__map_category(record.get('categories', []))
    cids = record.get('companyIds', [])
    iids = record.get('investorIds', [])
    if category:
        self.mongo.article.news.update(
            {'_id': ObjectId(record['news_id'])},
            {'$set': {
                'category': category
            }})
    self.mongo.article.news.update({'_id': ObjectId(record['news_id'])}, {
        '$set': {
            'companyIds': cids,
            'investorIds': iids,
            'modifyUser': record['modifyUser'],
            'categories': record.get('categories', [])
        }
    })

    # prepare features
    features = set()
    features.update(record.get('categories', []))
    features.add(record.get('sentiment'))
    # sector relevant features
    # orginal_features = [int(tid) for tid in
    #                     self.mongo.article.news.find_one({'_id': ObjectId(record['news_id'])}).get('features', [])]
    # industry_tags = [tid for tid in orginal_features if dbutil.get_tag_info(self.db, tid).type < 11050]
    # features.update(industry_tags)
    # newsTags are added only when one of these categories is present
    # (578351 is listed twice; harmless inside a set literal)
    if {128, 578353, 578349, 578351, 578356, 578351} & set(
            record.get('categories', [])):
        features.update(record.get('newsTags', []))

    # generate step 2 task
    sector_update_flag = False
    if cids:
        # companies whose round is below 1041; on Python 2 filter() returns
        # a list, so the truth test below means "at least one such company"
        startsups = filter(
            lambda cid: dbutil.get_company_round(self.db, cid) < 1041,
            cids)
        if startsups:
            # verified ("Y") tag ids of ALL mentioned companies, flattened
            news_tags = list(
                chain(*[[
                    t.tid
                    for t in dbutil.get_company_tags_info(self.db, cid)
                    if t.verify and t.verify == "Y"
                ] for cid in cids]))
            news_sectors = self.__map_sectors(news_tags)
            if len(news_sectors) <= 3:
                sector_update_flag = True
                features.update(news_tags)
                self.mongo.article.news.update(
                    {'_id': ObjectId(record['news_id'])},
                    {'$set': {
                        'sectors': news_sectors
                    }})
    if not sector_update_flag:
        task2 = {
            'news_id': str(record['news_id']),
            'taskNewsId': str(record['_id']),
            'createTime': datetime.utcnow(),
            'newsTags': record.get('newsTags', []),
            'companyIds': cids,
            'processStatus': int(0),
            'section': 'step2'
        }
        # insert only when no task exists yet for this task record
        if self.mongo.task.news.find({
                'taskNewsId': str(record['_id'])
        }).count() == 0:
            self.mongo.task.news.insert_one(task2)

    # update article news
    self.mongo.article.news.update(
        {'_id': ObjectId(record['news_id'])},
        {'$set': {
            'processStatus': 1,
            'modifyTime': datetime.utcnow()
        }})
    self.__update_news_features(record['news_id'], features, 'skip')

    # re produce tags for mentioned companies
    for cid in cids:
        self.company_tagger.extract(cid, fast=True, update_only=True)

    # tag big companies with the big-company tag
    if 578354 in features:
        for cid in cids:
            if self.mongo.article.news.find({
                    'processStatus': 1,
                    'companyIds': cid,
                    'features': 578354
            }).count() >= 3:
                dbutil.update_company_tag(self.db, cid, 599843, 0, active='H')

    # track for company message and investor message
    for (feed_back, feed_type) in self.news_tracker.feed_1001_4tasks([record]):
        if feed_back:
            if feed_type == 'cm':
                self.news_tracker.send_company_message_msg(feed_back)
            elif feed_type == 'im':
                self.news_tracker.send_investor_message_msg(feed_back)

    # track for topic 30, first media coverage
    self.track_topic_30(record)

    # send message to task company
    source = 'news_funding' if category == 60101 else 'news_regular'
    try:
        producer_news_task.send_messages(
            "task_company",
            json.dumps({
                'source': source,
                'id': record['news_id'],
                'posting_time': datetime.now().strftime('%Y-%m-%d:%H:%M:%S')
            }))
    except FailedPayloadsError, fpe:
        # re-initialise the Kafka producer and retry the send once
        init_kafka()
        producer_news_task.send_messages(
            "task_company",
            json.dumps({
                'source': source,
                'id': record['news_id'],
                'posting_time': datetime.now().strftime('%Y-%m-%d:%H:%M:%S')
            }))
def create_single(self, db, cid):
    """Build the universal-index fields for one company.

    Returns early (after logging) when dbutil.get_company_index_type says
    the company should not be indexed. Otherwise fills the `company` dict
    with ranking score, code, names/aliases, tags and features, nested
    industry/topic tags, sector, description, round and sort round,
    status, investors, members, location and establish month.

    NOTE(review): the visible part of this method ends after the
    establish-date step without using `company` - confirm what follows.

    :param db: SQL connection handed to the dbutil helpers.
    :param cid: company id.
    """
    global logger_universal_index

    # check whether to index this cid
    if not dbutil.get_company_index_type(db, cid):
        logger_universal_index.info('should not index %s' % cid)
        return
    company = {}
    # NOTE(review): `artifacts` is not used in the visible code
    alias, artifacts = set(), set()
    company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)
    # the company name is lower-cased and stripped of spaces before use
    name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
    code = dbutil.get_company_code(db, cid)
    company['id'] = code

    # short name, plus its pinyin transliteration joined without separators
    alias.add(name.lower())
    alias.add(''.join(lazy_pinyin(name.lower())))
    # full name, also indexed with the city names Beijing / Shanghai /
    # Shenzhen / Chengdu removed
    full = dbutil.get_company_corporate_name(db, cid, False)
    if full and full.strip():
        alias.add(full.lower())
        alias.add(full.lower().replace(u'北京', '').replace(u'上海', '').replace(
            u'深圳', '').replace(u'成都', ''))
    # artifact name (only names accepted by self.valid_name are kept)
    aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
    if aresults:
        alias.update([
            self.valid_name(aname) for _, aname in aresults
            if self.valid_name(aname)
        ])
    # alias (ignored entirely when there are 20 or more)
    aliass = dbutil.get_alias_idname(db, cid)
    if aliass and len(aliass) < 20:
        alias.update([
            self.valid_name(aname) for _, aname in aliass
            if self.valid_name(aname)
        ])
    # corporate name
    corporate = dbutil.get_company_corporate_name(db, cid)
    if corporate and corporate.strip():
        alias.add(corporate.lower())
    # corporate full name (same lookup as `full` above)
    corporate_full = dbutil.get_company_corporate_name(db, cid, False)
    if corporate_full and corporate_full.strip():
        alias.add(corporate_full.lower())
    # corporate alias (ignored entirely when there are 20 or more)
    corporate_alias = dbutil.get_corporate_alias(db, cid)
    if corporate_alias and len(corporate_alias) < 20:
        alias.update([
            self.valid_name(aname) for aname in corporate_alias
            if self.valid_name(aname)
        ])
    # check if there is a relevant digital coin
    dt = dbutil.get_company_digital_coin_info(db, cid)
    if dt:
        alias.add(dt.symbol.lower())
        # short name
        if dt.name:
            alias.add(dt.name.lower().replace(' ', ''))
        # english name
        if dt.enname:
            alias.add(dt.enname.lower())
    # create indice names
    company['name'] = name.lower()
    company['alias'] = self.analyze_names(alias)

    # tag: tag names go to 'tags', tag ids go to 'features'
    tags_info = dbutil.get_company_tags_idname(db, cid, tag_out_type=(11000, 11001, 11002))
    if tags_info:
        for tid, tname, weight in tags_info:
            company.setdefault('tags', []).append(tname.lower())
            company.setdefault('features', []).append(tid)
    # nested tags: one entry per industry and per topic of the company
    company['nested_tag'] = []
    for industry in dbutil.get_company_industries(db, cid):
        company.setdefault('nested_tag', []).append({
            'id': industry.industryId,
            'published': industry.publishTime,
            "category": "industry"
        })
    for topic in dbutil.get_company_topics(db, cid):
        msg_publish = dbutil.get_topic_message_company_publish(db, topic)
        company.setdefault('nested_tag', []).append({
            'id': topic.topicId,
            'published': msg_publish,
            "category": "topic"
        })
        # topics that have an entry in self.topic_tags also add a plain tag
        topic_tag = self.topic_tags.get(topic.topicId)
        if topic_tag:
            company.setdefault('tags', []).append(topic_tag.lower())
    sectors = dbutil.get_company_sector_tag(db, cid)
    company['sector'] = sectors

    # description: segmented, with stopwords and one-character tokens
    # removed, then re-joined with spaces
    desc = dbutil.get_company_solid_description(db, cid)
    if desc and desc.strip():
        desc = filter(lambda x: (x not in self.stopwords) and len(x) > 1, list(self.seg.cut4search(desc)))
        company['description'] = (' '.join(desc)).lower()

    # round and investors and members
    # (the local `round` shadows the builtin inside this method)
    round = dbutil.get_company_round(db, cid)
    # a round of 0 is indexed as 1000
    company['round'] = 1000 if round == 0 else round
    company['sort_round'] = dbutil.get_round_sort(db, company.get('round'))
    # status: 2020/2025 are kept as-is; otherwise -1 when
    # get_company_ipo_status is truthy, else -2
    status = dbutil.get_company_status(db, cid)
    if status in {2020, 2025}:
        company['status'] = status
    elif dbutil.get_company_ipo_status(db, cid):
        company['status'] = -1
    else:
        company['status'] = -2
    company['investors'] = dbutil.get_company_investor_names(db, cid)
    company['investorId'] = dbutil.get_company_investors(db, cid)
    # NOTE(review): on Python 2 this list comprehension rebinds the local
    # `name` used above; it is not read again in the visible code.
    company['members'] = [
        name for _, name in dbutil.get_member_idname(db, cid)
    ]

    # location
    lid, lname = dbutil.get_company_location(db, cid)
    company['location'] = lid

    # establish date, create date, count of company message
    establish_date = dbutil.get_company_establish_date(db, cid)
    try:
        # stored as an integer of the form YYYYMM
        company['established'] = int(establish_date.strftime('%Y%m'))
    except Exception, e:
        # best effort: any failure here leaves 'established' unset
        pass