Ejemplo n.º 1
0
    def __init__(self, method='default'):
        """Open connections, create the scorer and load the candidate pool.

        Args:
            method: how the candidate pool is seeded.
                ``'controlled'`` reads it with ``dbutil.get_all_push_pool``;
                ``'default'`` asks ``PushScorer.promote_general`` for it.

        Raises:
            ValueError: if ``method`` is not one of the two supported values.
                Previously such a value left ``self.candidates`` unset and
                failed further down with an AttributeError.
        """
        # Validate first so a bad argument does not open any connection.
        if method not in ('controlled', 'default'):
            raise ValueError('unknown method: %r' % (method,))

        self.db = dbconn.connect_torndb()
        self.mongo = dbconn.connect_mongo()
        # self.scorer = CompanyTagsRelevance()
        self.scorer = CompanyUserRelevance()

        self.daily_recommendation_size = 2
        self.pool_size = 100

        # Both modes use the same pusher; only the candidate source differs.
        self.general_pusher = PushScorer()
        if method == 'controlled':
            self.candidates = dbutil.get_all_push_pool(self.db)
        else:
            self.candidates = self.general_pusher.promote_general(self.db)
        self.__update_push_pool()

        # rounds and locations, keyed by candidate company id
        self.rounds = {
            cid: dbutil.get_company_round(self.db, cid)
            for cid in self.candidates
        }
        # only the first element of the location lookup is kept
        self.locations = {
            cid: dbutil.get_company_location(self.db, cid)[0]
            for cid in self.candidates
        }
Ejemplo n.º 2
0
def get_investor_portfolio_companies(db, mongo, iid,
                                     start_date='2013-01-01',
                                     end_date='2018-06-01',
                                     news_since='2017-01-01'):
    """Summarise an investor's portfolio: tag mix, news volume, locations.

    Args:
        db: SQL connection exposing ``query(sql, *params)``.
        mongo: Mongo handle; ``mongo.article.news`` is queried.
        iid: investor id.
        start_date: inclusive lower bound on ``funding.fundingDate``
            (``'YYYY-MM-DD'``). Defaults to the previously hard-coded value.
        end_date: inclusive upper bound on ``funding.fundingDate``
            (``'YYYY-MM-DD'``). Defaults to the previously hard-coded value.
        news_since: only news dated on or after this day
            (``'YYYY-MM-DD'``) are counted.

    Returns:
        dict with three keys:
            ``'tags'``: up to 20 ``(tag name, share)`` pairs, largest share
                first, shares rounded to 4 decimals;
            ``'news'``: number of matching news documents;
            ``'location'``: mapping of the second element returned by
                ``dbutil.get_company_location`` to its share among the
                portfolio companies, rounded to 4 decimals.
    """
    from collections import Counter

    companies = {}
    pfls = db.query(
        'select distinct company.id cid '
        'from company, funding, funding_investor_rel rel, corporate cp '
        'where rel.investorId=%s and funding.corporateId = company.corporateId '
        'and (company.active is null or company.active="Y") '
        'and company.corporateId=cp.id and (cp.active is null or cp.active="Y") '
        'and rel.fundingId=funding.id and (funding.active is null or funding.active="Y") '
        'and (rel.active is null or rel.active="Y") '
        'and funding.fundingDate>=%s and funding.fundingDate<=%s '
        'order by fundingDate asc;', iid, start_date, end_date)
    cids = [pfl.cid for pfl in pfls]

    # tags: how often each tag id occurs across the portfolio
    tag_counts = Counter()
    for cid in cids:
        for t in dbutil.get_company_tags_info(db, cid, [11012, 11013]):
            tag_counts[t.tid] += 1
    normalizer = sum(tag_counts.values())
    # With no tags the comprehension is empty, so normalizer == 0 never
    # reaches the division.
    ntags = {
        dbutil.get_tag_name(db, tid): round(count * 1.0 / normalizer, 4)
        for tid, count in tag_counts.items()
    }
    companies['tags'] = sorted(ntags.items(), key=lambda x: -x[1])[:20]

    # count of news; iterate the cursor instead of building a throwaway list
    since = datetime.strptime(news_since, '%Y-%m-%d')
    matching_news = mongo.article.news.find({
        'investorIds': iid,
        'processStatus': 1,
        'date': {
            '$gte': since
        }
    })
    companies['news'] = sum(1 for _ in matching_news)

    # locations: one counting pass instead of list.count() per distinct value
    location_counts = Counter(
        dbutil.get_company_location(db, cid)[1] for cid in cids)
    total = len(cids)
    companies['location'] = {
        location: round(count * 1.0 / total, 4)
        for location, count in location_counts.items()
    }
    return companies
Ejemplo n.º 3
0
    def init_filter(self):
        """Compute per-fund statistics from each fund's portfolio.

        The portfolio of every investor id in ``self.funds`` is loaded with
        ``dbutil.get_investor_portfilio`` for the fixed window
        2016-01-01 .. 2017-10-31.

        Returns:
            A tuple ``(fund_rounds, fund_activeness, fund_locations)`` of
            dicts keyed by investor id:
                fund_rounds: ``np.mean`` of ``dbutil.get_company_round`` over
                    the portfolio companies;
                fund_activeness: number of portfolio entries;
                fund_locations: True when more than half of the portfolio
                    companies have a location id below 371 (a missing
                    location id counts as below).
        """
        fund_rounds, fund_activeness, fund_locations = {}, {}, {}
        for iid in self.funds.keys():
            portfolio = dbutil.get_investor_portfilio(
                self.db, iid, ('2016-01-01', '2017-10-31'))

            # NOTE(review): np.mean of an empty portfolio yields nan (with a
            # RuntimeWarning) -- confirm callers expect that.
            fund_rounds[iid] = np.mean(
                [dbutil.get_company_round(self.db, p.cid) for p in portfolio])
            fund_activeness[iid] = len(portfolio)

            below_threshold = 0
            for entry in portfolio:
                lid = dbutil.get_company_location(self.db, entry.cid)[0]
                # Python 2 ordered None below every int, so a missing id
                # used to pass "lid < 371"; keep that outcome explicitly.
                if lid is None or lid < 371:
                    below_threshold += 1
            fund_locations[iid] = below_threshold * 2 > len(portfolio)
        return fund_rounds, fund_activeness, fund_locations
Ejemplo n.º 4
0
    def create_single(self, db, cid):
        """
        Assemble the search document and completion entry for one company.

        completion id consists of its type and original id, including
            cxxxx, fxxx, axxxx, pxxxx, nxxxx, standing for company, full, artifact, product, nick
            kxxxx, keyword

        Args:
            db: database handle forwarded to every ``dbutil`` lookup.
            cid: id of the company to process.

        Returns:
            None. Returns early, after logging, when
            ``dbutil.get_company_index_type`` is falsy for ``cid``.

        NOTE(review): the statements below only fill the local ``company``
        and ``completion`` dicts -- confirm where they get written out.
        """

        # check whether to index this cid
        if not dbutil.get_company_index_type(db, cid):
            self.logger.info('should not index %s' % cid)
            return

        def valid_names(candidates):
            # run each candidate through valid_name once, drop falsy results
            checked = (self.valid_name(candidate) for candidate in candidates)
            return [accepted for accepted in checked if accepted]

        company = {}
        alias = set()
        company_score = dbutil.get_company_score(db, cid, 37020)
        company['ranking_score'] = company_score

        # name is lower-cased and stripped of spaces once, here
        name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
        code = dbutil.get_company_code(db, cid)
        company['cid'] = code
        completion = {
            'id': cid,
            '_name': name,
            '_code': code,
            '_prompt': 'name',
        }

        # First, Names
        # short name, plus its lazy_pinyin() rendering
        alias.add(name)
        alias.add(''.join(lazy_pinyin(name)))
        # full name
        full = dbutil.get_company_corporate_name(db, cid, False)
        if full and full.strip():
            full_lower = full.lower()
            alias.add(full_lower)
            # TODO temp solution: also keep the name with these cities removed
            without_city = full_lower
            for city in (u'北京', u'上海', u'深圳'):
                without_city = without_city.replace(city, '')
            alias.add(without_city)
        # artifact name
        aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
        if aresults:
            alias.update(valid_names(aname for _, aname in aresults))
        # alias (skipped when there are 20 or more of them)
        aliass = dbutil.get_alias_idname(db, cid)
        if aliass and len(aliass) < 20:
            alias.update(valid_names(aname for _, aname in aliass))
        # corporate name
        corporate = dbutil.get_company_corporate_name(db, cid)
        if corporate and corporate.strip():
            alias.add(corporate.lower())
        # corporate full name: the same lookup as ``full`` above, which has
        # already been added to ``alias``, so the query is not repeated
        corporate_full = full
        # corporate alias (skipped when there are 20 or more of them)
        corporate_alias = dbutil.get_corporate_alias(db, cid)
        if corporate_alias and len(corporate_alias) < 20:
            alias.update(valid_names(corporate_alias))
        # check if there is a relevant digital coin
        dt = dbutil.get_company_digital_coin_info(db, cid)
        if dt:
            alias.add(dt.symbol.lower())
            # short name
            if dt.name:
                alias.add(dt.name.lower().replace(' ', ''))
            # english name
            if dt.enname:
                alias.add(dt.enname.lower())

        # create index names
        completion['completionName'] = list(alias)
        company['name'] = name
        company['alias'] = self.analyze_names(alias)

        # Second, team identify, investor identify
        team = self.identifier.identify(cid)
        if team:
            company['team'] = team
        if dbutil.exist_company_tag(db, cid, 309129):
            company['investor'] = 44010

        # Third, keywords
        # regular tag
        tags_info = dbutil.get_company_tags_idname(db,
                                                   cid,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            for tid, tname, weight in tags_info:
                company.setdefault('tags', []).append(tname.lower())
        # yellow tags
        yellows = dbutil.get_company_tags_yellow(db, cid)
        if yellows:
            company['yellows'] = [yellow.lower() for yellow in yellows]

        # Fourth, description: segmented, minus stopwords and 1-char tokens
        desc = dbutil.get_company_solid_description(db, cid)
        if desc and desc.strip():
            desc = [
                token for token in self.seg.cut4search(desc)
                if token not in self.stopwords and len(token) > 1
            ]
            company['description'] = ' '.join(desc).lower()

        # Fifth, round and investors and members
        company['round'] = dbutil.get_company_round(db, cid)
        company['investors'] = dbutil.get_company_investor_names(db, cid)
        # The loop variable must not be called ``name``: Python 2 list
        # comprehensions leak it and would overwrite the company name.
        company['members'] = [
            member_name
            for _, member_name in dbutil.get_member_idname(db, cid)
        ]

        # Sixth, location
        lid, lname = dbutil.get_company_location(db, cid)
        company['location'] = lid

        # Seventh, establish date, create date, count of company message
        establish_date = dbutil.get_company_establish_date(db, cid)
        try:
            company['established'] = int(establish_date.strftime('%Y%m'))
        except (AttributeError, ValueError):
            # best effort: no usable date (e.g. None) or one strftime rejects
            pass
Ejemplo n.º 5
0
    def create_single(self, db, funding):
        """Index one funding record as an ``event`` document.

        Args:
            db: database handle forwarded to every ``dbutil`` lookup.
            funding: funding row; the attributes read here are ``id``,
                ``companyId``, ``fundingDate``, ``round``, ``investment``,
                ``precise``, ``currency``, ``publishDate`` and ``source``.

        Returns:
            None. Nothing is indexed when
            ``dbutil.get_funding_index_type`` is falsy for the funding.
        """

        # funding that is not active
        if not dbutil.get_funding_index_type(db, funding.id):
            return

        company_id = funding.companyId
        funding_date = funding.fundingDate

        investor_ids = dbutil.get_funding_investor_ids(db, funding.id)
        event = {'fid': funding.id}
        event['investorId'] = investor_ids
        # ``or []`` keeps a missing id list from breaking the iteration
        event['investor'] = [
            dbutil.get_investor_name(db, iid) for iid in investor_ids or []
        ]

        # previous investors: everyone on strictly earlier rounds
        if funding_date:
            previous_fundings = [
                f.id for f in dbutil.get_company_funding(db, company_id)
                if f.fundingDate and f.fundingDate < funding_date
            ]
            previous_iids = set(
                chain.from_iterable(
                    dbutil.get_funding_investor_ids(db, fid)
                    for fid in previous_fundings))
            event['previous_investor'] = [
                dbutil.get_investor_name(db, iid) for iid in previous_iids
                if iid
            ]

        event['location'] = dbutil.get_company_location(db, company_id)[0]

        sectors = dbutil.get_company_sector_tag(db, company_id)
        has_sector = bool(sectors)
        event['sector'] = sectors[0] if has_sector else 0

        tags_info = dbutil.get_company_tags_idname(db,
                                                   company_id,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            event['tags'] = [tname.lower() for _, tname, _ in tags_info]

        event['round'] = funding.round
        event['sort_round'] = dbutil.get_round_sort(db, funding.round)

        if funding.investment:
            # amounts flagged 'N' are multiplied by 5, 'Y'/unknown by 1
            precise = {'Y': 1, 'N': 5}.get(funding.precise, 1)
            investment = funding.investment * precise * dbutil.get_currency_rate(
                db, funding.currency) / 10000
            event['last_funding_amount'] = investment
        else:
            event['last_funding_amount'] = None

        event['last_funding_date'] = funding_date
        event['funding_year'] = funding_date.year if funding_date else None
        event['publish_date'] = funding.publishDate
        event['source'] = funding.source or 0
        event['sort_sector'] = dbutil.get_tag_novelty(
            db, sectors[0]) if has_sector else None
        # called again with the extra True flag; the second element is used
        event['sort_location'] = dbutil.get_company_location(
            db, company_id, True)[1]

        self.es.index(index="xiniudata2",
                      doc_type='event',
                      id=funding.id,
                      body=event)
Ejemplo n.º 6
0
    def __valid_company(self, cid):
        """Return False only when the company's location id exceeds 370.

        The location id is the first element returned by
        ``dbutil.get_company_location``. A falsy id (missing or 0) is
        accepted.
        """
        location_id = dbutil.get_company_location(self.db, cid)[0]
        if not location_id:
            return True
        return not (location_id > 370)
Ejemplo n.º 7
0
    def create_single(self, db, cid):
        """Assemble the ``universal`` search document for one company.

        Args:
            db: database handle forwarded to every ``dbutil`` lookup.
            cid: id of the company to process.

        Returns:
            None. Returns early, after logging, when
            ``dbutil.get_company_index_type`` is falsy for ``cid``.

        NOTE(review): the statements below only fill the local ``company``
        dict (plus ``sectors`` and ``code``) -- confirm where the document
        is written to the index.
        """

        global logger_universal_index
        # check whether to index this cid
        if not dbutil.get_company_index_type(db, cid):
            logger_universal_index.info('should not index %s' % cid)
            return

        def valid_names(candidates):
            # run each candidate through valid_name once, drop falsy results
            checked = (self.valid_name(candidate) for candidate in candidates)
            return [accepted for accepted in checked if accepted]

        company = {}
        alias, artifacts = set(), set()
        company['ranking_score'] = dbutil.get_company_score(db, cid, 37020)

        # name is lower-cased and stripped of spaces once, here
        name = dbutil.get_company_name(db, cid).lower().replace(' ', '')
        code = dbutil.get_company_code(db, cid)
        company['id'] = code

        # short name, plus its lazy_pinyin() rendering
        alias.add(name)
        alias.add(''.join(lazy_pinyin(name)))
        # full name
        full = dbutil.get_company_corporate_name(db, cid, False)
        if full and full.strip():
            full_lower = full.lower()
            alias.add(full_lower)
            # also keep the name with these city names removed
            without_city = full_lower
            for city in (u'北京', u'上海', u'深圳', u'成都'):
                without_city = without_city.replace(city, '')
            alias.add(without_city)
        # artifact name
        aresults = dbutil.get_artifact_idname_from_cid(db, cid, True)
        if aresults:
            alias.update(valid_names(aname for _, aname in aresults))
        # alias (skipped when there are 20 or more of them)
        aliass = dbutil.get_alias_idname(db, cid)
        if aliass and len(aliass) < 20:
            alias.update(valid_names(aname for _, aname in aliass))
        # corporate name
        corporate = dbutil.get_company_corporate_name(db, cid)
        if corporate and corporate.strip():
            alias.add(corporate.lower())
        # corporate full name: the same lookup as ``full`` above, which has
        # already been added to ``alias``, so the query is not repeated
        corporate_full = full
        # corporate alias (skipped when there are 20 or more of them)
        corporate_alias = dbutil.get_corporate_alias(db, cid)
        if corporate_alias and len(corporate_alias) < 20:
            alias.update(valid_names(corporate_alias))
        # check if there is a relevant digital coin
        dt = dbutil.get_company_digital_coin_info(db, cid)
        if dt:
            alias.add(dt.symbol.lower())
            # short name
            if dt.name:
                alias.add(dt.name.lower().replace(' ', ''))
            # english name
            if dt.enname:
                alias.add(dt.enname.lower())

        # create index names
        company['name'] = name
        company['alias'] = self.analyze_names(alias)

        # tag
        tags_info = dbutil.get_company_tags_idname(db,
                                                   cid,
                                                   tag_out_type=(11000, 11001,
                                                                 11002))
        if tags_info:
            for tid, tname, weight in tags_info:
                company.setdefault('tags', []).append(tname.lower())
                company.setdefault('features', []).append(tid)
        # nested_tag is always present, even when it stays empty
        nested_tags = company['nested_tag'] = []
        for industry in dbutil.get_company_industries(db, cid):
            nested_tags.append({
                'id': industry.industryId,
                'published': industry.publishTime,
                "category": "industry"
            })
        for topic in dbutil.get_company_topics(db, cid):
            msg_publish = dbutil.get_topic_message_company_publish(db, topic)
            nested_tags.append({
                'id': topic.topicId,
                'published': msg_publish,
                "category": "topic"
            })
            # topics that map to a tag name also feed the plain tag list
            topic_tag = self.topic_tags.get(topic.topicId)
            if topic_tag:
                company.setdefault('tags', []).append(topic_tag.lower())
        sectors = dbutil.get_company_sector_tag(db, cid)
        company['sector'] = sectors

        # description: segmented, minus stopwords and 1-char tokens
        desc = dbutil.get_company_solid_description(db, cid)
        if desc and desc.strip():
            desc = [
                token for token in self.seg.cut4search(desc)
                if token not in self.stopwords and len(token) > 1
            ]
            company['description'] = ' '.join(desc).lower()

        # round and investors and members
        # NOTE(review): ``round`` shadows the builtin; the name is kept in
        # case later code in this method reads it.
        round = dbutil.get_company_round(db, cid)
        company['round'] = 1000 if round == 0 else round
        company['sort_round'] = dbutil.get_round_sort(db, company['round'])
        status = dbutil.get_company_status(db, cid)
        if status in {2020, 2025}:
            company['status'] = status
        elif dbutil.get_company_ipo_status(db, cid):
            company['status'] = -1
        else:
            company['status'] = -2
        company['investors'] = dbutil.get_company_investor_names(db, cid)
        company['investorId'] = dbutil.get_company_investors(db, cid)
        # The loop variable must not be called ``name``: Python 2 list
        # comprehensions leak it and would overwrite the company name.
        company['members'] = [
            member_name
            for _, member_name in dbutil.get_member_idname(db, cid)
        ]

        # location
        lid, lname = dbutil.get_company_location(db, cid)
        company['location'] = lid

        # establish date, create date, count of company message
        establish_date = dbutil.get_company_establish_date(db, cid)
        try:
            company['established'] = int(establish_date.strftime('%Y%m'))
        except (AttributeError, ValueError):
            # best effort: no usable date (e.g. None) or one strftime rejects
            pass
Ejemplo n.º 8
0
        if lfd:
            if lfd.fundingDate:
                company['last_funding_date'] = lfd.fundingDate
            if lfd.investment:
                company['last_funding_amount'] = (lfd.investment * {
                    'Y': 1,
                    'N': 5
                }.get(lfd.precise, 1)) / 10000
        company['fa_date'] = dbutil.get_company_latest_fa_date(db, cid)
        company['num_cm'] = len(list(dbutil.get_company_messages(db, cid,
                                                                 "Y")))

        # sort value
        company['sort_sector'] = dbutil.get_tag_novelty(
            db, sectors[0]) if len(sectors) > 0 else None
        company['sort_location'] = dbutil.get_company_location(db, cid,
                                                               True)[1]

        # create index
        # print company
        self.create_index(company, 'universal', code)

    def create_index(self, item, doc, iid=None):
        """Write ``item`` to the ``xiniudata2`` index as a ``doc`` document.

        Args:
            item: document body; its ``'id'`` entry is the fallback id.
            doc: value passed as ``doc_type``.
            iid: explicit document id; when falsy, ``item.get('id')`` is
                used instead.

        Nothing is written when neither id is truthy.
        """
        doc_id = iid or item.get('id')
        if not doc_id:
            return
        self.es.index(index="xiniudata2", doc_type=doc, id=doc_id, body=item)

    def valid_name(self, name):

        name = name.replace(u'・', u'-').replace(u'-', u'-').split(u'-')[0]
        if len(name) < 20: