def __init__(self, method='default'):
    """Open the data connections and load the candidate company pool.

    `method` selects where the candidates come from:

    * 'controlled' -- dbutil.get_all_push_pool(self.db)
    * 'default'    -- PushScorer.promote_general(self.db)

    Any other value leaves `self.candidates` unset, so the round and
    location lookups at the end fail with AttributeError.
    """
    self.db = dbconn.connect_torndb()
    self.mongo = dbconn.connect_mongo()
    # alternative scorer kept for reference: CompanyTagsRelevance()
    self.scorer = CompanyUserRelevance()
    self.daily_recommendation_size = 2
    self.pool_size = 100

    if method in ('controlled', 'default'):
        self.general_pusher = PushScorer()
        if method == 'controlled':
            self.candidates = dbutil.get_all_push_pool(self.db)
        else:
            self.candidates = self.general_pusher.promote_general(self.db)
        self.__update_push_pool()

    # rounds and locations, both keyed by candidate company id
    self.rounds = dict(
        (cid, dbutil.get_company_round(self.db, cid))
        for cid in self.candidates
    )
    self.locations = dict(
        (cid, dbutil.get_company_location(self.db, cid)[0])
        for cid in self.candidates
    )
def get_investor_portfolio_companies(db, mongo, iid):
    """Summarise the portfolio of investor `iid`.

    The portfolio is every distinct company reached through an active
    funding / investor relation dated 2013-01-01 .. 2018-06-01 (see the
    SQL below).

    Returns a dict with three keys:

    * 'tags'     -- up to 20 (tag name, share) pairs, largest share first;
                    shares are tag counts over the portfolio, normalised
                    to sum to 1 and rounded to 4 digits. Only tags
                    returned by get_company_tags_info for types
                    [11012, 11013] are counted.
    * 'news'     -- number of documents in mongo.article.news that list
                    this investor, have processStatus 1 and are dated
                    2017-01-01 or later.
    * 'location' -- dict mapping the second element returned by
                    dbutil.get_company_location to the share of portfolio
                    companies located there (rounded to 4 digits).

    An empty portfolio yields empty 'tags' and 'location' values.
    """
    from collections import Counter

    companies = {}
    pfls = db.query(
        'select distinct company.id cid '
        'from company, funding, funding_investor_rel rel, corporate cp '
        'where rel.investorId=%s and funding.corporateId = company.corporateId '
        'and (company.active is null or company.active="Y") '
        'and company.corporateId=cp.id and (cp.active is null or cp.active="Y") '
        'and rel.fundingId=funding.id and (funding.active is null or funding.active="Y") '
        'and (rel.active is null or rel.active="Y") '
        'and funding.fundingDate>="2013-01-01" and funding.fundingDate<="2018-06-01" '
        'order by fundingDate asc;', iid)
    cids = [pfl.cid for pfl in pfls]

    # tags
    tags = Counter()
    for cid in cids:
        for t in dbutil.get_company_tags_info(db, cid, [11012, 11013]):
            tags[t.tid] += 1
    normalizer = sum(tags.values())
    # No division happens when `tags` is empty, so normalizer == 0 is safe.
    ntags = {
        dbutil.get_tag_name(db, tid): round(count * 1.0 / normalizer, 4)
        for tid, count in tags.items()
    }
    companies['tags'] = sorted(ntags.items(), key=lambda x: -x[1])[:20]

    # count of news; stream the cursor instead of building a throwaway list
    y2017 = datetime.strptime('2017-01-01', '%Y-%m-%d')
    news_cursor = mongo.article.news.find({
        'investorIds': iid,
        'processStatus': 1,
        'date': {
            '$gte': y2017
        }
    })
    companies['news'] = sum(1 for _ in news_cursor)

    # locations: one counting pass instead of list.count() per distinct value
    location_counts = Counter(
        dbutil.get_company_location(db, cid)[1] for cid in cids)
    total = len(cids)
    companies['location'] = {
        loc: round(count * 1.0 / total, 4)
        for loc, count in location_counts.items()
    }
    return companies
def init_filter(self):
    """Derive per-fund filter features from each fund's recent portfolio.

    For every investor id in `self.funds` the portfolio between
    2016-01-01 and 2017-10-31 is loaded, then three dicts keyed by
    investor id are returned as a tuple:

    * fund_rounds     -- mean of dbutil.get_company_round over the
                         portfolio (numpy yields nan for an empty one)
    * fund_activeness -- number of portfolio entries
    * fund_locations  -- True when strictly more than half of the entries
                         have a location id below 371

    Written so it runs on both Python 2 and Python 3: no `iteritems`, no
    `len(filter(...))`, and no reliance on `None < int`.
    """
    portfolios = {
        iid: dbutil.get_investor_portfilio(self.db, iid,
                                           ('2016-01-01', '2017-10-31'))
        for iid in self.funds.keys()
    }

    def below_location_cutoff(item):
        # NOTE(review): ids below 371 presumably denote one region
        # (e.g. domestic locations) -- confirm against the location table.
        lid = dbutil.get_company_location(self.db, item.cid)[0]
        # A missing id counts as "below", matching Python 2's None < 371.
        return lid is None or lid < 371

    fund_rounds = {
        iid: np.mean([dbutil.get_company_round(self.db, p.cid) for p in ps])
        for iid, ps in portfolios.items()
    }
    fund_activeness = {iid: len(ps) for iid, ps in portfolios.items()}
    fund_locations = {
        iid: 2 * sum(1 for p in ps if below_location_cutoff(p)) > len(ps)
        for iid, ps in portfolios.items()
    }
    return fund_rounds, fund_activeness, fund_locations
def create_single(self, db, cid):
    """
    Assemble the completion-index data for a particular company.

    The completion id consists of its type and original id, i.e.
    cxxxx, fxxx, axxxx, pxxxx, nxxxx and kxxxx, standing for company,
    full, artifact, product, nick and keyword.

    Two dicts are filled in: `completion` (id, name, code, prompt and the
    list of completion names) and `company` (ranking score, names, team,
    tags, description, round, investors, members, location, established).
    Companies for which dbutil.get_company_index_type is falsy are logged
    and skipped.

    NOTE(review): the visible code only builds `company` and `completion`
    and does not write them anywhere -- confirm where they are consumed.
    All local names are therefore kept bound exactly as before.
    """
    # check whether to index this cid
    if not dbutil.get_company_index_type(db, cid):
        self.logger.info('should not index %s' % cid)
        return

    company = {}
    alias = set()

    def add_valid_names(names):
        # Run valid_name once per candidate and keep only truthy results.
        alias.update(
            valid for valid in (self.valid_name(n) for n in names) if valid)

    company_score = dbutil.get_company_score(db, cid, 37020)
    company['ranking_score'] = company_score
    # `name` is lower-cased and stripped of spaces here, once.
    name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
    code = dbutil.get_company_code(db, cid)
    company['cid'] = code
    completion = {
        'id': cid,
        '_name': name,
        '_code': code,
        '_prompt': 'name',
    }

    # First, Names
    # short name, plus its pinyin transliteration
    alias.add(name)
    alias.add(''.join(lazy_pinyin(name)))
    # full name
    full = dbutil.get_company_corporate_name(db, cid, False)
    if full and full.strip():
        full_lower = full.lower()
        alias.add(full_lower)
        # TODO temp solution: also index the name without a city prefix
        alias.add(full_lower.replace(u'北京', '').replace(u'上海', '')
                  .replace(u'深圳', ''))
    # artifact name
    aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
    if aresults:
        add_valid_names(aname for _, aname in aresults)
    # alias (skipped when there are 20 or more)
    aliass = dbutil.get_alias_idname(db, cid)
    if aliass and len(aliass) < 20:
        add_valid_names(aname for _, aname in aliass)
    # corporate name
    corporate = dbutil.get_company_corporate_name(db, cid)
    if corporate and corporate.strip():
        alias.add(corporate.lower())
    # corporate full name: the same lookup as `full` above, whose
    # lower-cased form is already in `alias`, so it is not queried again.
    corporate_full = full
    # corporate alias (skipped when there are 20 or more)
    corporate_alias = dbutil.get_corporate_alias(db, cid)
    if corporate_alias and len(corporate_alias) < 20:
        add_valid_names(corporate_alias)
    # check if there is a relevant digital coin
    dt = dbutil.get_company_digital_coin_info(db, cid)
    if dt:
        alias.add(dt.symbol.lower())
        # short name
        if dt.name:
            alias.add(dt.name.lower().replace(' ', ''))
        # english name
        if dt.enname:
            alias.add(dt.enname.lower())

    # create indice names
    completion['completionName'] = list(alias)
    company['name'] = name
    company['alias'] = self.analyze_names(alias)

    # Second, team identify, investor identify
    team = self.identifier.identify(cid)
    if team:
        company['team'] = team
    if dbutil.exist_company_tag(db, cid, 309129):
        company['investor'] = 44010

    # Third, keywords
    # regular tag
    tags_info = dbutil.get_company_tags_idname(
        db, cid, tag_out_type=(11000, 11001, 11002))
    if tags_info:
        for tid, tname, weight in tags_info:
            company.setdefault('tags', []).append(tname.lower())
    # yellows
    yellows = dbutil.get_company_tags_yellow(db, cid)
    if yellows:
        company['yellows'] = [yellow.lower() for yellow in yellows]

    # Forth, description: segmented, stop words and 1-char tokens removed
    desc = dbutil.get_company_solid_description(db, cid)
    if desc and desc.strip():
        desc = [
            word for word in self.seg.cut4search(desc)
            if word not in self.stopwords and len(word) > 1
        ]
        company['description'] = (' '.join(desc)).lower()

    # Fifth, round and investors and members
    company['round'] = dbutil.get_company_round(db, cid)
    company['investors'] = dbutil.get_company_investor_names(db, cid)
    # A dedicated loop name is used: in Python 2 a list comprehension
    # leaks its variable, which previously overwrote the company `name`.
    company['members'] = [
        member_name for _, member_name in dbutil.get_member_idname(db, cid)
    ]

    # Sixth, location
    lid, lname = dbutil.get_company_location(db, cid)
    company['location'] = lid

    # Seventh, establish date as an integer YYYYMM
    establish_date = dbutil.get_company_establish_date(db, cid)
    try:
        company['established'] = int(establish_date.strftime('%Y%m'))
    except Exception:
        # Best effort: a missing or unformattable date leaves the field unset.
        pass
def create_single(self, db, funding):
    """Index one funding record as an 'event' document in Elasticsearch.

    Nothing is indexed when dbutil.get_funding_index_type is falsy for
    the funding. Otherwise the document holds the funding's investors,
    the investors of earlier fundings of the same company (only when the
    funding has a date), company location, sector and tags, round, amount
    and the sort helper fields. It is written to index "xiniudata2" with
    doc_type 'event' and the funding id as document id.

    :param db: database connection handed through to dbutil
    :param funding: funding row; reads id, companyId, fundingDate, round,
        investment, precise, currency, publishDate and source
    """
    # funding that is not active
    if not dbutil.get_funding_index_type(db, funding.id):
        return

    company_id = funding.companyId
    funding_date = funding.fundingDate

    investor_ids = dbutil.get_funding_investor_ids(db, funding.id)
    event = {'fid': funding.id, 'investorId': investor_ids}
    # `or []` keeps a missing investor list from breaking the iteration
    event['investor'] = [
        dbutil.get_investor_name(db, iid) for iid in investor_ids or []
    ]

    # previous investors: everyone who invested strictly before this funding
    if funding_date:
        previous_fundings = [
            f.id for f in dbutil.get_company_funding(db, company_id)
            if f.fundingDate and f.fundingDate < funding_date
        ]
        previous_iids = set(chain.from_iterable(
            dbutil.get_funding_investor_ids(db, fid)
            for fid in previous_fundings))
        event['previous_investor'] = [
            dbutil.get_investor_name(db, iid) for iid in previous_iids if iid
        ]

    event['location'] = dbutil.get_company_location(db, company_id)[0]
    sectors = dbutil.get_company_sector_tag(db, company_id)
    event['sector'] = sectors[0] if sectors else 0

    tags_info = dbutil.get_company_tags_idname(
        db, company_id, tag_out_type=(11000, 11001, 11002))
    if tags_info:
        for tid, tname, weight in tags_info:
            event.setdefault('tags', []).append(tname.lower())

    event['round'] = funding.round
    event['sort_round'] = dbutil.get_round_sort(db, funding.round)

    if funding.investment:
        # Amounts flagged 'N' in `precise` are multiplied by 5; anything
        # else by 1. The result is converted with the currency rate and
        # divided by 10000.
        precise = {'Y': 1, 'N': 5}.get(funding.precise, 1)
        investment = funding.investment * precise * dbutil.get_currency_rate(
            db, funding.currency) / 10000
        event['last_funding_amount'] = investment
    else:
        event['last_funding_amount'] = None

    event['last_funding_date'] = funding_date
    event['funding_year'] = funding_date.year if funding_date else None
    event['publish_date'] = funding.publishDate
    event['source'] = funding.source or 0

    # sort values
    event['sort_sector'] = (
        dbutil.get_tag_novelty(db, sectors[0]) if sectors else None)
    event['sort_location'] = dbutil.get_company_location(
        db, company_id, True)[1]

    self.es.index(index="xiniudata2", doc_type='event', id=funding.id,
                  body=event)
def __valid_company(self, cid):
    """Return False only when the company's location id is above 370.

    A falsy location id (None, 0) counts as valid.
    """
    location_id = dbutil.get_company_location(self.db, cid)[0]
    out_of_range = bool(location_id) and location_id > 370
    return not out_of_range
def create_single(self, db, cid):
    """Assemble the 'universal' index document for one company.

    Companies for which dbutil.get_company_index_type is falsy are logged
    and skipped. Otherwise `company` is filled with ranking score, code,
    names and aliases, tags / features / nested industry and topic tags,
    sectors, description, round, status, investors, members, location and
    establish date.

    NOTE(review): this part only builds the dict; the code that writes it
    out is not visible here -- confirm against the rest of the function.
    Every local name (including `round`, `sectors`, `code`, `artifacts`)
    is kept bound as before for that reason.
    """
    global logger_universal_index

    # check whether to index this cid
    if not dbutil.get_company_index_type(db, cid):
        logger_universal_index.info('should not index %s' % cid)
        return

    company = {}
    alias, artifacts = set(), set()

    def add_valid_names(names):
        # Run valid_name once per candidate and keep only truthy results.
        alias.update(
            valid for valid in (self.valid_name(n) for n in names) if valid)

    company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)
    # `name` is lower-cased and stripped of spaces here, once.
    name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
    code = dbutil.get_company_code(db, cid)
    company['id'] = code

    # short name, plus its pinyin transliteration
    alias.add(name)
    alias.add(''.join(lazy_pinyin(name)))
    # full name, also indexed without a leading city name
    full = dbutil.get_company_corporate_name(db, cid, False)
    if full and full.strip():
        full_lower = full.lower()
        alias.add(full_lower)
        alias.add(full_lower.replace(u'北京', '').replace(u'上海', '').replace(
            u'深圳', '').replace(u'成都', ''))
    # artifact name
    aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
    if aresults:
        add_valid_names(aname for _, aname in aresults)
    # alias (skipped when there are 20 or more)
    aliass = dbutil.get_alias_idname(db, cid)
    if aliass and len(aliass) < 20:
        add_valid_names(aname for _, aname in aliass)
    # corporate name
    corporate = dbutil.get_company_corporate_name(db, cid)
    if corporate and corporate.strip():
        alias.add(corporate.lower())
    # corporate full name: the same lookup as `full` above, whose
    # lower-cased form is already in `alias`, so it is not queried again.
    corporate_full = full
    # corporate alias (skipped when there are 20 or more)
    corporate_alias = dbutil.get_corporate_alias(db, cid)
    if corporate_alias and len(corporate_alias) < 20:
        add_valid_names(corporate_alias)
    # check if there is a relevant digital coin
    dt = dbutil.get_company_digital_coin_info(db, cid)
    if dt:
        alias.add(dt.symbol.lower())
        # short name
        if dt.name:
            alias.add(dt.name.lower().replace(' ', ''))
        # english name
        if dt.enname:
            alias.add(dt.enname.lower())

    # create indice names
    company['name'] = name
    company['alias'] = self.analyze_names(alias)

    # tag
    tags_info = dbutil.get_company_tags_idname(
        db, cid, tag_out_type=(11000, 11001, 11002))
    if tags_info:
        for tid, tname, weight in tags_info:
            company.setdefault('tags', []).append(tname.lower())
            company.setdefault('features', []).append(tid)

    # nested tags: industries first, then topics
    nested_tags = company['nested_tag'] = []
    for industry in dbutil.get_company_industries(db, cid):
        nested_tags.append({
            'id': industry.industryId,
            'published': industry.publishTime,
            "category": "industry"
        })
    for topic in dbutil.get_company_topics(db, cid):
        msg_publish = dbutil.get_topic_message_company_publish(db, topic)
        nested_tags.append({
            'id': topic.topicId,
            'published': msg_publish,
            "category": "topic"
        })
        # a topic that maps to a tag name also contributes a plain tag
        topic_tag = self.topic_tags.get(topic.topicId)
        if topic_tag:
            company.setdefault('tags', []).append(topic_tag.lower())
    sectors = dbutil.get_company_sector_tag(db, cid)
    company['sector'] = sectors

    # description: segmented, stop words and 1-char tokens removed
    desc = dbutil.get_company_solid_description(db, cid)
    if desc and desc.strip():
        desc = [
            word for word in self.seg.cut4search(desc)
            if word not in self.stopwords and len(word) > 1
        ]
        company['description'] = (' '.join(desc)).lower()

    # round and investors and members
    # NOTE: `round` shadows the builtin; the name is kept because later
    # code in this function may read it.
    round = dbutil.get_company_round(db, cid)
    company['round'] = 1000 if round == 0 else round
    company['sort_round'] = dbutil.get_round_sort(db, company['round'])

    # status: 2020 / 2025 are stored as-is, otherwise -1 when
    # get_company_ipo_status is truthy and -2 when it is not
    status = dbutil.get_company_status(db, cid)
    if status in {2020, 2025}:
        company['status'] = status
    elif dbutil.get_company_ipo_status(db, cid):
        company['status'] = -1
    else:
        company['status'] = -2

    company['investors'] = dbutil.get_company_investor_names(db, cid)
    company['investorId'] = dbutil.get_company_investors(db, cid)
    # A dedicated loop name is used: in Python 2 a list comprehension
    # leaks its variable, which previously overwrote the company `name`.
    company['members'] = [
        member_name for _, member_name in dbutil.get_member_idname(db, cid)
    ]

    # location
    lid, lname = dbutil.get_company_location(db, cid)
    company['location'] = lid

    # establish date as an integer YYYYMM
    establish_date = dbutil.get_company_establish_date(db, cid)
    try:
        company['established'] = int(establish_date.strftime('%Y%m'))
    except Exception:
        # Best effort: a missing or unformattable date leaves the field unset.
        pass
if lfd: if lfd.fundingDate: company['last_funding_date'] = lfd.fundingDate if lfd.investment: company['last_funding_amount'] = (lfd.investment * { 'Y': 1, 'N': 5 }.get(lfd.precise, 1)) / 10000 company['fa_date'] = dbutil.get_company_latest_fa_date(db, cid) company['num_cm'] = len(list(dbutil.get_company_messages(db, cid, "Y"))) # sort value company['sort_sector'] = dbutil.get_tag_novelty( db, sectors[0]) if len(sectors) > 0 else None company['sort_location'] = dbutil.get_company_location(db, cid, True)[1] # create index # print company self.create_index(company, 'universal', code) def create_index(self, item, doc, iid=None): iid = iid if iid else item.get('id') if iid: self.es.index(index="xiniudata2", doc_type=doc, id=iid, body=item) def valid_name(self, name): name = name.replace(u'・', u'-').replace(u'-', u'-').split(u'-')[0] if len(name) < 20: