Ejemplo n.º 1
0
def dump():
    """Dump tags and the names of their related tags to ``dumps/tags.rel``.

    For every tag of type 11010, 11011, 12 or 13 in the 1101x range
    (11010, 11011, 11012, 11013) the tags linked to it through relation
    type 54020 are looked up and four tab-separated lines are written:

    * ``<type>\\t<name>\\t<related names of type 11011>``
    * ``<type>\\t\\t<related names of type 11013>``
    * ``<type>\\t\\t<related names of type 11010>``
    * ``<type>\\t\\t<related names of type 11000>``

    Related names are comma-joined in the order the relations are returned.
    The database connection is closed even when a lookup or write fails.
    """
    db = dbcon.connect_torndb()
    try:
        with codecs.open('dumps/tags.rel', 'w', 'utf-8') as fo:
            for t in dbutil.get_tags_by_type(db, [11010, 11011, 11012, 11013]):
                # Group related tag names by the related tag's type once,
                # instead of rescanning the relation list for every line.
                names_by_type = {}
                for r in dbutil.get_tags_rel(db, t.id, type=54020):
                    rel_name = dbutil.get_tag_info(db, r.tag2Id, 'name')
                    rel_type = dbutil.get_tag_info(db, r.tag2Id, 'type')
                    names_by_type.setdefault(rel_type, []).append(rel_name)
                # Only the first line of each group carries the tag's name.
                fo.write('%s\t%s\t%s\n' % (t.type, t.name, ','.join(names_by_type.get(11011, []))))
                for rel_type in (11013, 11010, 11000):
                    fo.write('%s\t\t%s\n' % (t.type, ','.join(names_by_type.get(rel_type, []))))
    finally:
        # Release the connection on every exit path.
        db.close()
Ejemplo n.º 2
0
Archivo: key.py Proyecto: yujiye/Codes
    def replace(self):
        """Tag companies that carry every replacement tag of an active rule.

        Each document in ``keywords.replacement`` with ``active == 'Y'``
        names a ``source`` tag and a list of ``replacement`` tags.  Rules
        with more than one replacement and a source tag whose type is at
        least 11010 are applied: every company returned for *all* of the
        replacement tags gets the source tag with weight 1.503 and
        verification ``'P'``.
        """
        active_rules = self.mongo.keywords.replacement.find({'active': 'Y'})
        for rule in active_rules:
            source = rule.get('source')
            replaces = rule.get('replacement')
            # A rule needs at least two replacement tags to be applied.
            if len(replaces) <= 1:
                continue
            if not (dbutil.get_tag_info(self.db, source, 'type') >= 11010):
                continue
            company_sets = [set(dbutil.get_company_from_tag(self.db, alias)) for alias in replaces]
            # Companies present under every one of the replacement tags.
            for c in set.intersection(*company_sets):
                dbutil.update_company_tag(self.db, c, source, 1.503, 'P')
Ejemplo n.º 3
0
    def update_relevant_tags(self, tid, t_type=None, t_name=None):
        """Link tag ``tid`` to its most similar tags via relation type 54020.

        :param tid: id of the tag whose relations are updated.
        :param t_type: type of the tag; looked up from the database when falsy.
        :param t_name: name of the tag; looked up from the database when falsy.

        For each target type in 11000, 11010, 11011 and 11013 the known tags
        of that type (``self.tags`` maps tag name -> (tag id, tag type)) are
        ranked by similarity to ``t_name``, cut to
        ``self.max_candidates[target_type]`` entries, and every entry scoring
        above ``self.similarity_threshold`` is stored as a relation weighted
        by its similarity.

        Unless the tag itself is of type 11012, it is additionally linked to
        the single most similar tag of type 11012, regardless of threshold.
        Nothing is written for that step when no type-11012 tag is known.
        """
        if not t_type:
            t_type = dbutil.get_tag_info(self.db, tid, 'type')
        if not t_name:
            t_name = dbutil.get_tag_info(self.db, tid, 'name')
        for target_type in [11000, 11010, 11011, 11013]:
            candidates = [tag for tag, (_, t) in self.tags.items() if t == target_type]
            # A missing max_candidates entry yields a ``None`` slice bound,
            # i.e. no cut-off at all.
            ranked = sorted([(tag, self.__get_similarity(t_name, tag)) for tag in candidates],
                            key=lambda x: -x[1])[:self.max_candidates.get(target_type)]
            # Filter with a comprehension: ``len(filter(...))`` only works
            # where ``filter`` returns a list (Python 2).
            similarities = [pair for pair in ranked if pair[1] > self.similarity_threshold]
            for tag, weight in similarities:
                dbutil.update_tags_rel(self.db, tid, self.tags.get(tag)[0], weight, 54020)
        if t_type != 11012:
            candidates = [tag for tag, (_, t) in self.tags.items() if t == 11012]
            # ``max`` on an empty sequence raises ValueError, so skip when
            # there is nothing to link to.
            if candidates:
                # Select by similarity; comparing the bare (tag, score)
                # tuples would order by tag name first.
                vip, weight = max([(tag, self.__get_similarity(t_name, tag)) for tag in candidates],
                                  key=lambda x: x[1])
                dbutil.update_tags_rel(self.db, tid, self.tags.get(vip)[0], weight, 54020)
Ejemplo n.º 4
0
    def __process_step2(self, record):
        """Apply a processed task ``record`` to its ``article.news`` document.

        * A record whose ``processStatus`` is not 1 marks the news document
          with ``processStatus = -1`` and stops.
        * A record whose ``source`` is ``'gongshang'`` is ignored.
        * Otherwise the document's sectors are recomputed from the record's
          ``newsTags``, and stored features whose tag type is below 11050
          are dropped unless they also appear in ``newsTags``.  The
          record's ``newsTags`` are then handed to
          ``__update_news_features`` as ``tn2_features``.
        """
        # Declared as in the original; this method never assigns it.
        global producer_news_task

        news_coll = self.mongo.article.news

        # not news
        if record.get('processStatus', 0) != 1:
            news_coll.update({'_id': ObjectId(record['news_id'])},
                             {'$set': {'processStatus': -1}})
            return
        if record.get('source', 0) == 'gongshang':
            return

        # update article news
        news_tags = [int(tid) for tid in record.get('newsTags', [])]
        news_sectors = self.__map_sectors(news_tags)
        news_oid = ObjectId(record['news_id'])
        stored_news = news_coll.find_one({'_id': news_oid})
        original_features = [int(tid) for tid in stored_news.get('features', [])]
        # Stored features with tag type < 11050 that the record's tags no
        # longer contain; the tag lookup runs for every stored feature.
        stale_features = set(
            tid for tid in original_features
            if dbutil.get_tag_info(self.db, tid).type < 11050 and tid not in news_tags
        )
        update_features = [tid for tid in original_features if tid not in stale_features]
        news_coll.update({'_id': news_oid}, {
            '$set': {
                'sectors': news_sectors,
                'processStatus': 1,
                'modifyTime': datetime.utcnow(),
                'features': update_features
            }
        })
        # '$addToSet': {'features': {'$each': news_tags}}})

        # update tags as features
        features = record.get('newsTags', [])
        news_coll.update({'_id': news_oid},
                         {'$set': {'processStatus': 1, 'modifyTime': datetime.utcnow()}})
        self.__update_news_features(record['news_id'], tn2_features=features)
Ejemplo n.º 5
0
    def identify(self, today=None):
        """Create or refresh 'trend' tasks for the fastest growing tags.

        :param today: reference datetime; defaults to ``datetime.today()``.

        Statistics dated from midnight two days before ``today`` are read
        from ``keywords.trend_statistc``.  For every (tag, subtype) pair the
        growth is ``(max - min + 1) / (min + 1)`` over its weights.  Within
        each subtype the pairs are ranked by growth, highest first, and
        pairs with rank below 5 and growth above 1 are selected.

        For each selected pair an unprocessed (``processStatus == 0``)
        'trend' task for the tag gets its ``modifyTime`` refreshed, or a new
        task is inserted if none exists.  Returns ``None``; does nothing
        when no statistics are found.
        """
        if not today:
            today = datetime.today()
        # growth yesterday: window opens at midnight two days before ``today``
        start = datetime.fromordinal((today - timedelta(days=2)).date().toordinal())
        records = list(self.mongo.keywords.trend_statistc.find({'date': {'$gte': start}}))
        if not records:
            # An empty frame has no 'tag'/'subtype' columns to group on.
            return
        # NOTE(review): with integer weights on Python 2 (and no
        # ``from __future__ import division``) this division truncates -
        # confirm that is intended.
        df = pd.DataFrame(records).groupby(['tag', 'subtype'])['weight'].agg(
            lambda weight: (max(weight) - min(weight) + 1) / (min(weight) + 1)
        ).reset_index(name='growth')
        df['rank'] = df.groupby('subtype')['growth'].rank(ascending=False, method='first')
        df = df.loc[(df['rank'] < 5) & (df['growth'] > 1)]

        tasks = self.mongo.task.tag
        for _, row in df.iterrows():
            row = dict(row)
            pending = {'type': 'trend', 'id': row.get('tag'), 'processStatus': 0}
            if tasks.find_one(pending) is not None:
                tasks.update(pending, {'$set': {'modifyTime': datetime.utcnow()}})
            else:
                # The tag name is only needed for newly created tasks.
                tasks.insert({
                    'type': 'trend',
                    'id': row.get('tag'),
                    'processStatus': 0,
                    'name': dbutil.get_tag_info(self.db, row.get('tag'), 'name'),
                    'createTime': datetime.utcnow(),
                    'modifyTime': datetime.utcnow(),
                    'reason': row.get('subtype')
                })
Ejemplo n.º 6
0
    def fit_tag(self):
        """Propagate this object's tag to the companies of its industry.

        When ``self.tag`` is set and its stored type is truthy and below
        11011, the tag's type is updated to 11011.  Then every company
        returned for industry ``self.idid`` is given ``self.tag`` with
        weight 1.502 and verification ``'P'``.  Without a tag the companies
        are still fetched but nothing is written.
        """
        # update tag type
        if self.tag:
            current_type = dbutil.get_tag_info(self.db, self.tag, 'type')
            needs_type_update = current_type and current_type < 11011
            if needs_type_update:
                dbutil.update_tag_type(self.db, self.tag, 11011, with_tag_id=True)

        # update company tag
        for tpc in dbutil.get_industry_companies(self.db, self.idid):
            if not self.tag:
                continue
            dbutil.update_company_tag(self.db, tpc.companyId, self.tag, 1.502, verify='P')
Ejemplo n.º 7
0
    def memorize(self, tid, today=None):
        """Store the daily trend statistics of tag ``tid``.

        :param tid: id of the tag to measure.
        :param today: reference datetime; defaults to ``datetime.today()``.

        Five weights are written to ``keywords.trend_statistc``, each keyed
        by (tag, midnight of ``today``, subtype):

        * ``company_visit`` - logged company-basic requests for the tag's
          companies,
        * ``company_subscribe`` - subscription details of those companies
          between yesterday and today,
        * ``news_relevant`` - news returned by a search for the tag name
          filtered to today's date (at most 500 requested),
        * ``news_read`` - logged news-read requests for those news,
        * ``search_precise`` - logged searches whose input equals the tag
          name and whose ``userId`` is not null.

        All log queries cover the window from 32 hours to 8 hours before
        ``today``.
        """
        global logger_tt
        if not today:
            today = datetime.today()
        yesterday = today - timedelta(days=1)
        today_int = int(today.strftime('%Y%m%d'))
        tag = dbutil.get_tag_info(self.db, tid, 'name')

        logger_tt.info('Start to process %s' % tid)

        day = datetime.fromordinal(today.date().toordinal())
        stats = self.mongo.keywords.trend_statistc

        def log_window():
            # NOTE(review): the 32h/8h offsets presumably map a UTC+8 day
            # onto the log timestamps - confirm.
            return {
                '$gt': today - timedelta(hours=32),
                '$lte': today - timedelta(hours=8)
            }

        def save_weight(subtype, category, weight):
            # Third positional argument is passed as in the original;
            # presumably the collection's upsert flag - confirm.
            stats.update(
                {'tag': tid, 'date': day, 'subtype': subtype},
                {'$set': {'type': category, 'weight': weight}},
                True)

        # relevant companies
        cids = dbutil.get_company_from_tags(self.db, [tid])
        codes = [dbutil.get_company_code(self.db, cid) for cid in cids]
        visits = self.mongo.log.user_log.find({
            'time': log_window(),
            'requestURL': "/xiniudata-api/api2/service/company/basic",
            'jsonRequest.payload.code': {'$in': codes}
        })
        save_weight('company_visit', 'company', len(list(visits)))

        subscriptions = dbutil.get_company_subscription_details(
            self.db, yesterday.strftime('%Y-%m-%d'),
            today.strftime('%Y-%m-%d'), *cids)
        save_weight('company_subscribe', 'company', len(subscriptions))

        # relevant news
        found = self.search_client.search('general',
                                          input=tag,
                                          filters={'date': today_int},
                                          size=500)
        news = list(found.get('news', {}).get('data', []))
        save_weight('news_relevant', 'news', len(news))

        news_read = self.mongo.log.user_log.find({
            'time': log_window(),
            'requestURL': self.news_read_url,
            'jsonRequest.payload.newsId': {'$in': news}
        })
        save_weight('news_read', 'news', len(list(news_read)))

        # search
        search = self.mongo.log.search.find({
            'time': log_window(),
            'query.input': tag,
            'userId': {'$ne': None}
        })
        save_weight('search_precise', 'search', len(list(search)))