예제 #1
0
def classify_android_black():
    """Tag companies whose Android products show a recent download surge.

    For every ``(company id, artifact id, score, source)`` row returned by
    ``dbutil.get_android_explosion`` the company receives tag 309126, the
    artifact is passed to ``dbutil.mark_android_explosion``, a tag comment and
    a company message are written, and a ``keyword_v2`` Kafka message carrying
    the company id is sent. Companies established before 2006 are skipped.

    A failure while handling one row is logged and the remaining rows are
    still processed. The database connection is closed when the run ends,
    including when an error escapes the loop.
    """
    global logger_yl, producer_tag
    init_kafka()
    db = dbcon.connect_torndb()
    try:
        for cid, aid, score, source in dbutil.get_android_explosion(db):
            if dbutil.get_company_establish_date(db, cid).year < 2006:
                continue
            try:
                # 309126: "download surge" tag
                dbutil.update_company_tag(db, cid, 309126, score, "Y")
                dbutil.mark_android_explosion(db, aid)
                dbutil.update_company_tag_comment(db, cid, 309126, 30, aid, source)
                msg = u'%s旗下Android产品近期下载量激增' % dbutil.get_company_name(db, cid)
                dbutil.update_continuous_company_message(db, cid, msg, 3201, 30,
                                                         aid, 14, source)
                producer_msg = {"id": cid}
                producer_tag.send_messages("keyword_v2", json.dumps(producer_msg))
                logger_yl.info(
                    'Android Explosion Artifact: company %s, artifact %s' %
                    (cid, aid))
            except Exception:
                # logger.exception already records the active traceback
                logger_yl.exception(
                    'Failed Android Explosion Artifact: company %s, artifact %s ' %
                    (cid, aid))
    finally:
        db.close()
예제 #2
0
def main():
    """Write tag 566646 for every company reported by ``Download_Optimization``.

    Each ``(company id, score)`` pair from ``get_nice_download_cids`` is
    logged and then stored with the score rounded to four decimals. The
    optimizer's database handle is closed once all pairs are written.
    """
    optimizer = Download_Optimization()
    # 566646: "excellent downloads" tag
    for company_id, raw_score in optimizer.get_nice_download_cids():
        logger_apkdownload.info("insert into company_tag_rel: companyid %s, tagid 566646", company_id)
        tag_weight = round(raw_score, 4)
        dbutil.update_company_tag(optimizer.db, company_id, 566646, tag_weight, "Y")
    optimizer.db.close()
예제 #3
0
파일: key.py 프로젝트: yujiye/Codes
    def redirect(self):
        """Move companies from each replaced tag onto its replacement tags.

        For every active document in ``keywords.replacement`` the source tag's
        type is set to 11003, and each company carrying the source tag has
        that tag written with weight 0 / ``active='N'`` and every replacement
        tag written with weight 1 / ``active='Y'``.
        """
        active_rules = self.mongo.keywords.replacement.find({'active': 'Y'})
        for rule in active_rules:
            old_tid = rule.get('source')
            new_tids = rule.get('replacement')
            dbutil.update_tag_type(self.db, old_tid, 11003, with_tag_id=True)
            for company_id in dbutil.get_company_from_tag(self.db, old_tid):
                # switch the old tag off first, then switch the replacements on
                dbutil.update_company_tag(self.db, company_id, old_tid, 0, active='N')
                for new_tid in new_tids:
                    dbutil.update_company_tag(self.db, company_id, new_tid, 1, active='Y')
예제 #4
0
파일: key.py 프로젝트: yujiye/Codes
    def replace(self):
        """Give a composite tag to companies that carry all of its parts.

        Only active replacement documents with more than one replacement tag
        and a source tag whose type is at least 11010 are handled. Companies
        found under every replacement tag receive the source tag with weight
        1.503 and ``'P'`` as the fifth argument of ``update_company_tag``.
        """
        for rule in self.mongo.keywords.replacement.find({'active': 'Y'}):
            target = rule.get('source')
            parts = rule.get('replacement')
            if len(parts) <= 1:
                continue
            # the type lookup only happens for rules with several parts
            if not dbutil.get_tag_info(self.db, target, 'type') >= 11010:
                continue
            company_sets = [set(dbutil.get_company_from_tag(self.db, part)) for part in parts]
            shared = reduce(lambda left, right: left & right, company_sets)
            for company_id in shared:
                dbutil.update_company_tag(self.db, company_id, target, 1.503, 'P')
예제 #5
0
def classify_recuit_black():
    """Write tag 309127 for every company returned by ``summary_recruit_all``.

    Each ``(company id, score)`` pair is stored with the given score. A
    failure for one company is logged and the loop continues. The database
    connection is closed when the run ends, including when an error escapes
    the loop.
    """
    global logger_yl
    db = dbcon.connect_torndb()
    try:
        for cid, score in summary_recruit_all():
            try:
                # 309127: "actively recruiting" tag
                dbutil.update_company_tag(db, cid, 309127, score, "Y")
                logger_yl.info('Black recruit: %s insert' % cid)
            except Exception as e:
                logger_yl.exception('Black recruit: %s failed # %s' % (cid, e))
    finally:
        db.close()
예제 #6
0
def classify_angel():
    """Rebuild tag 309129 from the list of known angel investors.

    The tag is cleared first, then written with weight 1 for every company
    returned by ``dbutil.get_companies_from_investors`` for those investors.
    The database connection is closed at the end.
    """
    global logger_yl
    db = dbcon.connect_torndb()
    angel_investors = dicts.get_known_angels()
    # 309129: "well-known venture investor" tag
    dbutil.clear_tag(db, 309129)
    backed_companies = dbutil.get_companies_from_investors(db, *angel_investors)
    for company_id in backed_companies:
        dbutil.update_company_tag(db, company_id, 309129, 1, "Y")
        logger_yl.info('Known angel: %s insert' % company_id)
    db.close()
예제 #7
0
    def update_tag_11110(self, todays):
        """Label companies whose iOS artifacts rank in the top 50 of ``todays``.

        Labels 579409 and 579410 are cleared first. For every item whose
        ``rank`` (default 10000) is below 51, each artifact found for the
        item's ``trackId`` has its company tagged with 579409 when the item
        has no genre and 579410 otherwise, a tag comment carrying
        ``"<genre>,<type>"`` is written, and a topic message is sent.

        :param todays: iterable of dict-like chart items; ``trackId`` and
            ``type`` are read with ``[]``, ``rank`` and ``genre`` with ``get``.
        """
        dbutil.clear_label(self.db, 579409, 579410)
        for item in todays:
            if not item.get('rank', 10000) < 51:
                continue
            # Read the genre once with .get(): the tag choice below already
            # tolerates a missing key, so the comment text must not raise
            # KeyError for the same item.
            genre = item.get('genre')
            tid = 579409 if genre is None else 579410
            for aid in dbutil.get_artifacts_from_iOS(self.db, item['trackId']):
                cid = dbutil.get_artifact_company(self.db, aid)
                dbutil.update_company_tag(self.db, cid, tid, 0, active='H')
                detailid = '%s,%s' % (genre, item['type'])
                dbutil.update_company_tag_comment(self.db, cid, tid, 30, aid, detailid)
                self.send_topic_company_msg(cid, False)
0
def classify_artifact_fast_iter():
    """Write tag 644378 for companies returned by ``get_fast_iter_artifact``.

    Each ``(company id, score)`` pair is stored with ``'P'`` and ``'N'`` as
    the trailing arguments of ``update_company_tag``. A failure for one
    company is logged and the loop continues. The database connection is
    closed when the run ends, including when an error escapes the loop.
    """
    # 644378: "fast product iteration" tag
    global logger_yl
    db = dbcon.connect_torndb()
    try:
        for (cid, score) in dbutil.get_fast_iter_artifact(db):
            try:
                dbutil.update_company_tag(db, cid, 644378, score, 'P', 'N')
                logger_yl.info('Artifact iter fast: company %s' % cid)
            except Exception:
                # logger.exception already records the active traceback
                logger_yl.exception('Failed Artifact fast iteration, company %s' %
                                    cid)
    finally:
        db.close()
예제 #9
0
파일: general.py 프로젝트: yujiye/Codes
    def label(self, cid):
        """Refresh the general tags of company ``cid`` and return their ids.

        Tag 579089 is cleared for the company, then every tag produced by
        ``__extract_11120`` is written with weight 0 and ``active='H'``. When
        tag 589015 is among them, a tag comment holding the result of
        ``dbutil.get_company_latest_fa`` is written as well.
        """
        dbutil.clear_company_tag(self.db, cid, 579089)
        assigned = []
        for tid in self.__extract_11120(cid):
            dbutil.update_company_tag(self.db, cid, tid, 0, active='H')
            assigned.append(tid)
        if 589015 not in assigned:
            return assigned
        latest_fa = dbutil.get_company_latest_fa(self.db, cid)
        dbutil.update_company_tag_comment(self.db, cid, 589015, 80, latest_fa)
        return assigned
예제 #10
0
파일: key.py 프로젝트: yujiye/Codes
    def update_vip_tags(self, cid, support_tags, source_tags):
        """
        Choose "vip" tags for company ``cid``, store them and return their ids.

        ``support_tags`` is read as a mapping of tag name to weight (it is
        accessed through ``.items()``); ``source_tags`` is iterated as tag
        names. On both sides only names present in ``self.vip_tags`` count.

        Three signals are combined: the support weight of a vip tag (raised by
        the number of its hyponyms that are also support tags), membership in
        ``source_tags``, and the probabilities of ``self.vip_classifier`` on
        the company's text. Tag 604330 is added when the traditional
        classifier answers label ``1`` with probability above 0.6.

        Tags that have an entry in ``self.replacements`` are written as their
        replacement ids instead of their own id.

        Returns ``vips.keys()``, i.e. the chosen ids before replacement.
        """

        vips = {}
        # ids of all support tags (first element of the get_tag_id result)
        support_tag_ids = set(dbutil.get_tag_id(self.db, tag)[0] for tag in support_tags)
        # vip id -> support weight, for support tags whose name is a vip name
        support_vips = {self.vip_tags.get(tag): weight
                        for tag, weight in support_tags.items() if tag in self.vip_tags.keys()}
        # raise each weight by the number of the vip's hyponyms that are also
        # support tags; only values of existing keys are reassigned here
        for support_vip, support_weight in support_vips.iteritems():
            hyponyms = dbutil.get_hyponym_tags(self.db, support_vip)
            support_vips[support_vip] = support_weight + len(set(hyponyms) & support_tag_ids)
        # vip ids named by the source tags
        source_vips = [self.vip_tags.get(tag) for tag in source_tags if tag in self.vip_tags.keys()]
        # desc = ' '.join(self.wfilter([x[0] for x in self.tagger.tag(self.feeder.feed_string(cid, 'with_tag'))]))
        # classifier input: filtered segmenter output joined with spaces
        desc = ' '.join(self.wfilter(self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            # fallback text ("other") when nothing survives the filter
            desc = u'其他'
        # print desc
        # vip id -> probability, keeping predictions above self.vip_lower;
        # the '__label__' prefix is stripped and the rest is cast to int
        # (presumably the 3 asks for the top three labels - TODO confirm)
        classifier_vips = {int(tag.replace(u'__label__', '')): weight for (tag, weight) in
                           self.vip_classifier.predict_proba([desc], 3)[0] if weight > self.vip_lower}
        # first prediction for desc; used below as a (label, probability) pair
        traditional = self.traditional_classifier.predict_proba([desc], 1)[0][0]
        if source_vips:
            # source vips confirmed by the classifier, best combined score
            # first; weights run 2.999, 2.899, ... by rank
            for rank, vip in enumerate(sorted([t for t in source_vips if t in classifier_vips],
                                              key=lambda x: support_vips.get(x, 0)+classifier_vips.get(x, 0),
                                              reverse=True)):
                vips[vip] = 2.999 - round(rank/10.0, 1)
        if traditional[0].replace('__label__', '') == '1' and traditional[1] > 0.6:
            vips[604330] = round(2 + traditional[1], 2)
        elif len(vips) > 1:
            # two or more source-backed vips already chosen: nothing to add
            pass
        else:
            # all support and classifier vips, largest support*classifier
            # product first (missing values default to 0.1 and 0.01)
            vip_candidates = sorted((set(support_vips.keys()) | set(classifier_vips.keys())),
                                    key=lambda x: -support_vips.get(x, 0.1)*0.1*classifier_vips.get(x, 0.01))
            if len(vip_candidates) == 0:
                pass
            elif len(vip_candidates) == 1:
                # a lone candidate is always kept, with a weight of at least 2.9
                vip = vip_candidates[0]
                vips[vip] = max(2.9, round(2 + support_vips.get(vip, 0.01) * classifier_vips.get(vip, 0.01), 2))
            else:
                for rank, vip in enumerate(vip_candidates):
                    # print rank, vip, support_vips.get(vip, 0.01), classifier_vips.get(vip, 0.01)
                    # the support weight counts fully for the first candidate,
                    # 0.3 for the second and 0.2 for all later ones
                    rank_discount = {0: 1,
                                     1: 0.3}.get(rank, 0.2)
                    if support_vips.get(vip, 0.01)*rank_discount + classifier_vips.get(vip, 0.01) > self.vip_threshold:
                        vips[vip] = round(max(2.9-rank*0.01,
                                              2+support_vips.get(vip, 0.01)*classifier_vips.get(vip, 0.01)), 2)
        # persist the chosen tags, substituting replacement ids where defined
        for tid, weight in vips.items():
            if self.replacements.get(tid):
                for rtid in self.replacements.get(tid, []):
                    dbutil.update_company_tag(self.db, cid, rtid, weight)
            else:
                dbutil.update_company_tag(self.db, cid, tid, weight)
            # print dbutil.get_tag_info(self.db, tid, 'name'), tid, weight
        return vips.keys()
예제 #11
0
파일: basic_track.py 프로젝트: yujiye/Codes
    def fit_tag(self):
        """Promote ``self.tag`` and attach it to the companies of the industry.

        When ``self.tag`` is set and its stored type is truthy but below
        11011, the type is raised to 11011. Every company returned by
        ``dbutil.get_industry_companies`` for ``self.idid`` then receives the
        tag with weight 1.502 and ``verify='P'``. The companies are fetched
        even when no tag is set, but nothing is written in that case.
        """
        # tag type
        if self.tag:
            current_type = dbutil.get_tag_info(self.db, self.tag, 'type')
            needs_promotion = current_type and current_type < 11011
            if needs_promotion:
                dbutil.update_tag_type(self.db, self.tag, 11011, with_tag_id=True)
        # company tags
        for member in dbutil.get_industry_companies(self.db, self.idid):
            if not self.tag:
                continue
            dbutil.update_company_tag(self.db, member.companyId, self.tag, 1.502, verify='P')
예제 #12
0
파일: key.py 프로젝트: yujiye/Codes
    def infer_hierarchically(self):
        """Infer sector-2 tags for companies from their related tags.

        For every tag returned by ``dbutil.get_sectored_tags(self.db, 2)``
        that has tags related through relation 54041, the companies found
        both under those related tags and under the tag's hypernyms (fetched
        with level argument 1) are collected. When more than 2500 companies
        match, nothing is written and the case is logged as an error;
        otherwise each company receives the tag with weight 1.504 and ``'P'``.
        In both cases ``dbutil.clear_company_common_tag`` is called with a
        check point two hours before the tag was processed.
        """
        global logger_tag
        for t2 in dbutil.get_sectored_tags(self.db, 2):
            t3s = dbutil.get_tags_by_relation(self.db, t2.id, 54041)
            if not t3s:
                continue
            check_point = datetime.now() - timedelta(hours=2)
            t1s = dbutil.get_hypernym_tags(self.db, t2.id, 1)
            hierachicals = (set(dbutil.get_company_from_tags(self.db, list(t3s))) &
                            set(dbutil.get_company_from_tags(self.db, t1s)))
            if len(hierachicals) > 2500:
                dbutil.clear_company_common_tag(self.db, t2.id, check_point)
                # No exception is active here, so log at ERROR level without
                # asking the logger to append a (non-existent) traceback.
                logger_tag.error('Hierachical cross threshold, %s, %s' % (t2.name, len(hierachicals)))
            else:
                for c in hierachicals:
                    dbutil.update_company_tag(self.db, c, t2.id, 1.504, 'P')
                dbutil.clear_company_common_tag(self.db, t2.id, check_point)
                logger_tag.info('Hierachically processed %s' % t2.name)
예제 #13
0
파일: key.py 프로젝트: yujiye/Codes
    def review(self, cid, contents):
        """
        Record the best supported tag chain of company ``cid`` and drop one
        outlier tag.

        Chains of two or three tag names are built from the relations (type
        54041) among the company's tags. Each chain gathers support from the
        words of ``contents`` (pairs of text and weight) through word2vec
        similarity; the best supported chain is upserted into
        ``keywords.majorchain``. Afterwards, when the company has more than
        five tags of type 11012/11013 and second-kind outliers exist, one of
        them is deactivated.

        Returns ``None``; returns early when no chain can be built.
        """

        global logger_tag
        # load active tags
        tags = {t.tid: t for t in dbutil.get_company_tags_info(self.db, cid)}
        chains, merged = [], set()
        # tag id -> ids of the tags it is related to
        chain_candidates = {}
        for rel in dbutil.analyze_tags_relations(self.db, tags.keys(), 54041):
            chain_candidates.setdefault(rel.tagId, []).append(rel.tag2Id)
        for t1, t2s in chain_candidates.iteritems():
            for t2 in t2s:
                # this pair was already consumed as the tail of a 3-tag chain
                if (t1, t2) in merged:
                    continue
                # a sector-1 head whose child has children of its own yields
                # three-level chains; anything else yields a two-level chain
                # NOTE(review): tags.get(t1) is None when t1 is not one of the
                # company's tags - confirm the relation query cannot return that
                if tags.get(t1).get('sector', False) == 1 and chain_candidates.get(t2, False):
                    for t3 in chain_candidates.get(t2, []):
                        chains.append([t1, t2, t3])
                        merged.add((t2, t3))
                else:
                    chains.append([t1, t2])
        # chain index -> list of tag names
        chains = {index: [tags.get(tid).get('name') for tid in chain] for index, chain in enumerate(chains)}
        if len(chains) == 0:
            return

        # support selection
        supports = {}
        for index, (content, dweight) in enumerate(contents):
            for word in self.wfilter([x[0] for x in self.tagger.tag(content)]):
                if word not in self.w2v:
                    continue
                # the inner loop rebinds `index`; the enumerate index of the
                # outer loop is not used anywhere
                for index, chain in chains.iteritems():
                    # NOTE(review): the first argument is an empty list when no
                    # name of the chain is in the model - confirm n_similarity
                    # accepts that
                    similarity = self.w2v.n_similarity([tag for tag in chain if tag in self.w2v], [word])
                    if similarity > self.chain_simi_threshold:
                        supports[index] = supports.get(index, 0) + dweight * similarity
        # chain with the largest accumulated support; when no chain gathered
        # any support `supports` is empty and this line raises IndexError, so
        # the outlier step below is not reached
        major = sorted(supports.iteritems(), key=lambda x: -x[1])[0][0]
        # third positional argument True: presumably upsert - TODO confirm
        self.mongo.keywords.majorchain.update({'company': cid}, {'company': cid, 'major': chains.get(major)}, True)

        # delete outliers
        # outliers1 is not used in this method
        (outliers1, outliers2) = self.__detect_outliers(tags.values())
        if len([t.tid for t in tags.itervalues() if t.type in (11012, 11013)]) > 5:
            if outliers2:
                # only one outlier is removed per call: the one with the
                # smallest thesaurus_ids value (4 when the id is unknown)
                delete2 = sorted(outliers2, key=lambda x: self.thesaurus_ids.get(x, 4))[0]
                logger_tag.info('delete outlier %s of %s' % (delete2, cid))
                dbutil.update_company_tag(self.db, cid, delete2, 0, active='N')
예제 #14
0
파일: key.py 프로젝트: yujiye/Codes
    def infer_rules(self):
        """Tag companies that match each tag's search rule.

        For every tag returned by ``dbutil.get_ruled_tags`` the rule text is
        normalised (full-width comma and parentheses become their ASCII
        forms, spaces are removed, the text is lower-cased), turned into a
        query by ``generate_rule_based_query`` and run against the ``topic``
        search. When at most 2000 company codes come back, each company that
        does not already have the tag receives it with weight 1.505; larger
        result sets are logged as an error and skipped. A failure for one tag
        is logged and the remaining tags are still processed.
        """
        global logger_tag
        # As written before, each punctuation replace() swapped a character
        # for itself and therefore did nothing; the full-width forms are
        # spelled as escapes so the mapping survives re-encoding of this file.
        normalisations = ((u'\uff0c', u','), (u'\uff08', u'('), (u'\uff09', u')'), (u' ', u''))
        for t in dbutil.get_ruled_tags(self.db):
            logger_tag.info('Processing rule for %s' % t.name)
            try:
                rule = t.rule
                for found, wanted in normalisations:
                    rule = rule.replace(found, wanted)
                rule = generate_rule_based_query(rule.lower())
                if not rule:
                    continue
                codes = self.client.search('topic', query=rule).get('company', {}).get('data', [])
                if len(codes) > 2000:
                    # no exception is active here: log at ERROR level without
                    # a traceback
                    logger_tag.error('To many results, %s, %s' % (t.name, len(codes)))
                    continue
                logger_tag.info('%s processed' % t.name)
                for code in codes:
                    cid = dbutil.get_id_from_code(self.db, code)
                    if not dbutil.exist_company_tag(self.db, cid, t.id):
                        dbutil.update_company_tag(self.db, cid, t.id, 1.505)
            except Exception as e:
                logger_tag.exception('Fail to process tag rules %s, due to %s' % (t.name, e))
예제 #15
0
def classify_founder():
    """Write tag 309128 for every company whose founder score is at least 0.5.

    All company ids are scored with ``FounderScorer.score``; qualifying
    companies get the tag with the score as weight. The database connection
    is closed at the end.
    """
    global logger_yl
    db = dbcon.connect_torndb()
    scorer = FounderScorer()
    for company_id in dbutil.get_all_company_id(db):
        team_score = scorer.score(company_id)
        if not team_score >= 0.5:
            continue
        # 309128: "outstanding team" tag
        dbutil.update_company_tag(db, company_id, 309128, team_score, "Y")
        logger_yl.info('Outstanding team: %s insert' % company_id)
        # Disabled in the original: extra log lines for fs.has_QBFJ (team from
        # Tsinghua/Peking/Fudan/Jiaotong), fs.has_overseas (overseas
        # returnees) and fs.has_serial_entrepreneur (serial entrepreneurs).
    db.close()
예제 #16
0
def label_blockchain():
    """Classify every company for tag 175747 and update the tag accordingly.

    The segmented text of each company is averaged into one word2vec vector
    and passed to the stored classifier. When the text contains the keyword
    ``u'区块链'`` a positive ``predict`` is enough; without the keyword the
    positive-class probability must exceed 0.9. Positive companies get the
    tag with weight 2.806; companies that already carry the tag but are no
    longer positive have it written with weight 0 and ``active='N'``.

    A failure for one company is printed and the loop continues. The
    database connection is closed when the run ends.
    """
    db = dbcon.connect_torndb()
    feeder = Feeder()
    w2v = Word2Vec.load(word2vec_model)
    model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
    clf = joblib.load(os.path.join(model_dir, '175747.20180311.model'))
    try:
        for cid in dbutil.get_all_company_id(db):
            print(cid)
            try:
                words = list(feeder.feed_seged(cid))
                # The keyword must be looked up in the word list. Previously
                # the list had already been replaced by the mean vector, so
                # the lookup compared the keyword against an array.
                has_keyword = u'区块链' in words
                vectors = [w2v[w] for w in words if w in w2v]
                if not vectors:
                    # nothing to average; leave any existing tag untouched,
                    # as the failing classifier call did before
                    continue
                features = [np.mean(vectors, axis=0)]
                if has_keyword:
                    positive = clf.predict(features)[0] == 1
                else:
                    positive = clf.predict_proba(features)[0][1] > 0.9
                if positive:
                    dbutil.update_company_tag(db, cid, 175747, 2.806, verify='N', active='Y')
                elif dbutil.exist_company_tag(db, cid, 175747):
                    dbutil.update_company_tag(db, cid, 175747, 0, verify='N', active='N')
            except Exception as e:
                # apply the format string instead of printing it next to e
                print('Fail to classify, due to %s' % e)
    finally:
        db.close()
예제 #17
0
def do():
    """Rebuild tag 573515 and collection 1822 from the last 30 days' hot list.

    Unverified rows of the tag and all rows of the collection are deleted,
    then every company returned by ``get_hot_list`` is tagged with weight
    9.0001. A company is inserted into the collection only when
    ``update_company_tag`` returns a truthy value.
    """
    window_end = datetime.today()
    window_start = window_end - timedelta(30)
    hot_companies = get_hot_list(window_start, window_end)

    # tag 573515: "hot in the media"
    db.execute("delete from company_tag_rel where tagId=573515 and (verify is null or verify = 'N');")
    # collection 1822: watch list for the "hot in the media" detection
    db.execute("delete from collection_company_rel where collectionid = 1822;")

    for company_id in hot_companies:
        tagged = dbutil.update_company_tag(db, company_id, 573515, 9.0001)
        logger_media.info('Hot_media: %s insert' % company_id)
        if not tagged:
            continue
        db.execute("insert into collection_company_rel(collectionid, companyid, active, createuser, createtime) values (1822, %s, 'Y', 139, now());", company_id)
예제 #18
0
파일: key.py 프로젝트: yujiye/Codes
    def update_contents_tags(self, cid, tags, source_tags, vips, topn):

        """
        normalize contents based tags and update mysql

        The ``topn`` heaviest entries of ``tags`` (tag name -> weight) are
        written for company ``cid``. Tags named in ``source_tags`` get 0.009
        added to their weight, tags whose id is a vip id are skipped, and
        tags with an entry in ``self.replacements`` are written as their
        replacement ids. When the company text contains ``u'区块链'`` the
        trained per-tag classifiers are applied as well and positive tags are
        written with weight 2.806. A classifier failure is logged, not raised.

        ``vips`` is not read in the visible body.
        """

        # NOTE(review): old_tags and new_tags are collected but never used or
        # returned in the visible body - confirm against the full file.
        old_tags = dbutil.get_company_tags_old(self.db, cid)
        new_tags = []
        vip_ids = self.vip_tags.values()
        for tag, weight in sorted(tags.items(), key=lambda x: -x[1])[:topn]:
            tid, active = dbutil.get_tag_id(self.db, tag)
            if tag in source_tags:
                weight += 0.009
            if tid in vip_ids:
                continue
            if active:
                new_tags.append(tid)
            if self.replacements.get(tid):
                # tid is only in new_tags when it was appended as active (or
                # by an earlier iteration); an unguarded remove() raised
                # ValueError for inactive tags and aborted the whole update
                if tid in new_tags:
                    new_tags.remove(tid)
                for rtid in self.replacements.get(tid, []):
                    if rtid in vip_ids:
                        continue
                    dbutil.update_company_tag(self.db, cid, rtid, weight, active=active)
                    new_tags.append(rtid)
            else:
                dbutil.update_company_tag(self.db, cid, tid, weight, active=active)
        # add classified tags
        try:
            content = list(self.feeder.feed_seged(cid))
            if u'区块链' in content:
                content = [np.mean([self.w2v[w] for w in content if w in self.w2v], axis=0)]
                for tid, clf in self.trained_tag_clfs.items():
                    if clf.predict(content)[0] == 1:
                        dbutil.update_company_tag(self.db, cid, tid, 2.806, verify='N', active='Y')
                        new_tags.append(tid)
        except Exception as e:
            logger_tag.exception('Fail to classify, due to %s', e)
예제 #19
0
파일: basic_track.py 프로젝트: yujiye/Codes
    def fit_tags(self):
        """Attach every tag in ``self.tags`` to each company of topic
        ``self.tpid`` with weight 1.502."""
        members = dbutil.get_topic_companies(self.db, self.tpid)
        for member in members:
            for tag_id in self.tags:
                dbutil.update_company_tag(self.db, member.companyId, tag_id, 1.502)
예제 #20
0
    def __process_step1(self, record):
        """
        Process one step-1 news task record.

        Records whose ``processStatus`` is not 1 mark the article with
        ``processStatus`` -1 and stop; records whose ``source`` is
        ``'gongshang'`` stop without any change. Otherwise the article in
        ``article.news`` is updated (category, companies, investors,
        categories), a feature set is built and stored, a step-2 task is
        created unless sectors could be derived from the mentioned companies,
        tags of the mentioned companies are re-extracted, tracker messages
        are sent and a ``task_company`` Kafka message is produced.

        :param record: dict-like task document; ``news_id``, ``_id`` and
            ``modifyUser`` are read with ``[]``, the other fields with ``get``.
        """

        global producer_news_task

        # not news
        if record.get('processStatus', 0) != 1:
            self.mongo.article.news.update(
                {'_id': ObjectId(record['news_id'])},
                {'$set': {
                    'processStatus': -1
                }})
            return
        if record.get('source', 0) == 'gongshang':
            return

        # update article news
        category = self.__map_category(record.get('categories', []))
        cids = record.get('companyIds', [])
        iids = record.get('investorIds', [])
        # the category is only written when the mapping produced one
        if category:
            self.mongo.article.news.update(
                {'_id': ObjectId(record['news_id'])},
                {'$set': {
                    'category': category
                }})
        self.mongo.article.news.update({'_id': ObjectId(record['news_id'])}, {
            '$set': {
                'companyIds': cids,
                'investorIds': iids,
                'modifyUser': record['modifyUser'],
                'categories': record.get('categories', [])
            }
        })

        # prepare features
        features = set()
        features.update(record.get('categories', []))
        # added even when the record has no sentiment (None is added then)
        features.add(record.get('sentiment'))
        # sector relevant features
        # orginal_features = [int(tid) for tid in
        #                     self.mongo.article.news.find_one({'_id': ObjectId(record['news_id'])}).get('features', [])]
        # industry_tags = [tid for tid in orginal_features if dbutil.get_tag_info(self.db, tid).type < 11050]
        # features.update(industry_tags)
        # news tags become features only for these categories
        # NOTE(review): 578351 is listed twice in the set literal - possibly a
        # typo for another id
        if {128, 578353, 578349, 578351, 578356, 578351} & set(
                record.get('categories', [])):
            features.update(record.get('newsTags', []))

        # generate step 2 task
        sector_update_flag = False
        if cids:
            # companies whose round is below 1041
            startsups = filter(
                lambda cid: dbutil.get_company_round(self.db, cid) < 1041,
                cids)
            if startsups:
                # tags with verify == "Y" of ALL mentioned companies, not
                # only of the ones kept in startsups
                news_tags = list(
                    chain(*[[
                        t.tid
                        for t in dbutil.get_company_tags_info(self.db, cid)
                        if t.verify and t.verify == "Y"
                    ] for cid in cids]))
                news_sectors = self.__map_sectors(news_tags)
                # sectors are trusted only when at most three were found
                if len(news_sectors) <= 3:
                    sector_update_flag = True
                    features.update(news_tags)
                    self.mongo.article.news.update(
                        {'_id': ObjectId(record['news_id'])},
                        {'$set': {
                            'sectors': news_sectors
                        }})
        # no sectors stored: queue a step-2 task, at most one per task record
        if not sector_update_flag:
            task2 = {
                'news_id': str(record['news_id']),
                'taskNewsId': str(record['_id']),
                'createTime': datetime.utcnow(),
                'newsTags': record.get('newsTags', []),
                'companyIds': cids,
                'processStatus': int(0),
                'section': 'step2'
            }
            if self.mongo.task.news.find({
                    'taskNewsId': str(record['_id'])
            }).count() == 0:
                self.mongo.task.news.insert_one(task2)

        # update article news
        self.mongo.article.news.update(
            {'_id': ObjectId(record['news_id'])},
            {'$set': {
                'processStatus': 1,
                'modifyTime': datetime.utcnow()
            }})
        self.__update_news_features(record['news_id'], features, 'skip')
        # re produce tags for mentioned companies
        for cid in cids:
            self.company_tagger.extract(cid, fast=True, update_only=True)
        # give the big-company tag (599843) to companies that appear in at
        # least three processed articles carrying feature 578354
        if 578354 in features:
            for cid in cids:
                if self.mongo.article.news.find({
                        'processStatus': 1,
                        'companyIds': cid,
                        'features': 578354
                }).count() >= 3:
                    dbutil.update_company_tag(self.db,
                                              cid,
                                              599843,
                                              0,
                                              active='H')

        # track for company message and investor message
        for (feed_back,
             feed_type) in self.news_tracker.feed_1001_4tasks([record]):
            if feed_back:
                if feed_type == 'cm':
                    self.news_tracker.send_company_message_msg(feed_back)
                elif feed_type == 'im':
                    self.news_tracker.send_investor_message_msg(feed_back)

        # track for topic 30, first media coverage
        self.track_topic_30(record)

        # send message to task company
        source = 'news_funding' if category == 60101 else 'news_regular'
        try:
            producer_news_task.send_messages(
                "task_company",
                json.dumps({
                    'source':
                    source,
                    'id':
                    record['news_id'],
                    'posting_time':
                    datetime.now().strftime('%Y-%m-%d:%H:%M:%S')
                }))
        # on a failed payload, re-initialise Kafka and send the message once
        # more; a second failure propagates to the caller
        except FailedPayloadsError, fpe:
            init_kafka()
            producer_news_task.send_messages(
                "task_company",
                json.dumps({
                    'source':
                    source,
                    'id':
                    record['news_id'],
                    'posting_time':
                    datetime.now().strftime('%Y-%m-%d:%H:%M:%S')
                }))
예제 #21
0
파일: key.py 프로젝트: yujiye/Codes
class Extractor(object):

    def __init__(self):
        """
        Open the database connections and load every model, threshold and
        lookup table the extractor uses.

        Takes no arguments; model locations are read from the module-level
        names listed in the ``global`` statement.
        """

        global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
        logger_tag.info('Extractor model initing')

        # storage handles; self.db is needed by several loaders below
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        # text access and tokenisation helpers
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(itags=True)
        self.wfilter = word_filter.get_default_filter()

        self.gang = GangTag()

        # word vectors and the similarity cut-offs applied to them
        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.chain_simi_threshold = 0.25

        # vip tags: name -> id of the tags returned for sector argument 1
        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
        self.trained_tag_clfs = self.__load_trained_clfs()

        # thresholds and size limits
        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.relevant_threshold = 0.4
        self.vip_lower = 0.3
        self.vip_threshold = 0.25
        self.important_max_num = 5
        self.max_contents_length = 20

        # tag lookup tables
        self.yellows = dbutil.get_yellow_tags(self.db)
        # lower-cased names of the tags of type 11011 and 11013
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_weighted_tags()
        self.thesaurus_ids = self.__load_weighted_tags(tid=True)
        self.junk_terms = self.__load_junk_tags()
        # source tag -> replacement list; built from every replacement
        # document, with no filter on the 'active' field
        self.replacements = {r['source']: r['replacement'] for r in self.mongo.keywords.replacement.find()}

        self.trusted_sources = dicts.get_known_company_source()

        self.general_tagger = GeneralTagger()

        logger_tag.info('Extractor model inited')

    def __load_trained_clfs(self):
        """Load the stored per-tag classifiers from the ``models`` directory
        that sits next to this file.

        Returns a dict mapping tag id to the object returned by
        ``joblib.load``; only tag 175747 is loaded.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        model_path = os.path.join(here, 'models', '175747.20180311.model')
        return {175747: joblib.load(model_path)}

    def __extract_source_tag(self, cid):
        """Return the set of tags derived from the trusted-source tags of
        company ``cid``.

        Every non-blank source tag name is passed through
        ``dbutil.analyze_source_tag`` and the results are merged into one set;
        an empty set is returned when the company has no source tags.
        """
        names = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        merged = set()
        if not names:
            return merged
        # analyse every usable name first, then merge the results
        analysed = [dbutil.analyze_source_tag(self.db, name, self.replacements)
                    for name in names if name and name.strip()]
        for pieces in analysed:
            merged.update(pieces)
        return merged

    def __extract_important(self, contents, source_tags=None):
        """Score "important" tag candidates against the given contents.

        ``contents`` is a sequence of ``(text, weight)`` pairs. Candidates are
        the ``source_tags`` plus every token the tagger marks as ``'itag'`` or
        whose text is in ``self.importants``. Each candidate collects support
        from content words whose word2vec similarity exceeds
        ``self.similarity_threshold``; the supports are weighted by
        ``self.thesaurus``, divided by the largest value, and trimmed when
        four or more results remain.

        Returns a dict of candidate -> normalised score (possibly empty).
        """
        # candidates generation
        scores = dict.fromkeys(source_tags, 1) if source_tags else {}
        for text, text_weight in contents:
            for token in self.tagger.tag(text):
                if token[1] == 'itag' or token[0] in self.importants:
                    scores[token[0]] = scores.get(token[0], 0) + text_weight
        if not scores:
            return {}

        # support assignment
        matched_words = 0
        evidence = {}
        for position, (text, text_weight) in enumerate(contents):
            terms = [token[0] for token in self.tagger.tag(text)]
            for word in self.wfilter(terms):
                if word not in self.w2v:
                    continue
                matched_words += 1
                for name in scores:
                    if name not in self.w2v:
                        continue
                    closeness = self.w2v.similarity(name, word)
                    if closeness > self.similarity_threshold:
                        evidence.setdefault(name, []).append((position, text_weight, closeness))

        # support selection
        picked = {}
        several_candidates = len(scores) >= 2
        for name, hits in evidence.items():
            weighted = sum([round(hit[1]*hit[2], 2) for hit in hits])
            # with several candidates, drop those whose summed similarity is
            # small relative to the number of matched content words
            if several_candidates and sum([round(hit[2], 2) for hit in hits]) < matched_words / 20:
                continue
            picked[name] = weighted * self.thesaurus.get(name, 1)
        if not picked:
            return picked

        # normalization
        top = max(picked.values())
        for name in list(picked):
            picked[name] = round(picked[name]/top, 2)

        # narrow down results size
        if len(picked) >= 4:
            picked = dict((name, value) for name, value in picked.items()
                          if value > self.important_threshold)
            if len(picked) > self.important_max_num:
                keep = min(10, max(int(ceil(len(picked)/2.0)), self.important_max_num))
                ordered = sorted(picked.items(), key=lambda pair: -pair[1])
                picked = dict(ordered[:keep])

        return picked

    def __extract_vectorrank(self, contents):
        """Placeholder for a vector-based ranking; does nothing and returns
        ``None``."""
        return None

    def __extract_textrank(self, contents, topn=15):

        """
        Weighted textrank over the tagger tokens of ``contents``.

        Tokens within ``textrank_window_size`` positions of each other are
        linked, with the number of co-occurrences as edge weight, and the
        graph is ranked with ``self.thesaurus`` passed to ``rank``. Among the
        ``topn`` best nodes, junk terms are skipped; the first two remaining
        tags are always yielded, later ones only when their weight reaches
        ``textrank_threshold``.

        Generator of ``(tag, weight rounded to 2 decimals)``; yields nothing
        when fewer than five tokens survive the word filter.
        """

        global textrank_window_size, textrank_threshold

        terms = []
        for text, _ in contents:
            terms.extend([token[0] for token in self.tagger.tag(text)])
        # filter
        terms = self.wfilter(terms)
        total = len(terms)
        if total < 5:
            return

        graph = UndirectWeightedGraph()
        pair_counts = collections.defaultdict(int)
        for left in xrange(total):
            # the window never reaches past the last token
            window_end = min(left + textrank_window_size, total)
            for right in xrange(left + 1, window_end):
                pair_counts[(terms[left], terms[right])] += 1
        for (first, second), count in pair_counts.items():
            graph.add_edge(first, second, count)

        ranking = graph.rank(self.thesaurus)
        limit = min(topn, total)
        best_first = sorted(ranking.items(), key=lambda pair: -pair[1])
        considered = 0
        for tag, weight in best_first[:limit]:
            if tag in self.junk_terms:
                continue
            if considered < 2 or weight >= textrank_threshold:
                yield tag, round(weight, 2)
            considered += 1
    def extract(self, cid, topn=15, fast=False, update_only=False):
        """
        Recompute and store the tags of company ``cid``.

        Candidate tags from several extractors are merged, normalised and
        written through ``update_vip_tags`` and ``update_contents_tags``; gang
        tags are added afterwards and ``review`` is run last.

        :param cid: company id
        :param topn: passed to the textrank extractor and to
            ``update_contents_tags``
        :param fast: not read in the visible body of this method
        :param update_only: when true, tags reported for removal are left
            untouched instead of being deactivated
        """

        # general tag
        new_general = self.general_tagger.label(cid)
        if new_general:
            logger_tag.info('General Tag of %s, %s' % (cid, ','.join([str(tid) for tid in new_general])))

        # NOTE(review): the keyword is spelled 'quanlity'; presumably it
        # matches the feeder's parameter name - confirm before renaming
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        results = {}
        # keep only the highest-weighted contents
        if len(contents) > self.max_contents_length:
            contents = sorted(contents, key=lambda x: -x[1])[:self.max_contents_length]
        # source tags
        source_tags = self.__extract_source_tag(cid)
        # print ','.join(source_tags)
        # results = self.merge(results, {}.fromkeys(source_tags, 0.5))
        # important tag
        results = self.merge(results, self.__extract_important(contents, source_tags), 1)
        # regular tag
        results = self.merge(results, dict(self.__extract_textrank(contents, topn)))
        # verified tag
        results = self.merge(results, dict.fromkeys(dbutil.get_company_tags_verified(self.db, cid), 1))
        # topic tag
        results = self.merge(results, dict.fromkeys(dbutil.get_company_topics_tags(self.db, cid), 1.5))
        # normalize
        results = self.__normalize(results)
        # vip tags
        vips = self.update_vip_tags(cid, results, source_tags)
        # update contents based tags
        results = self.__normalize_replacement(results)
        # update_contents_tags is expected to return a (new, removed) pair;
        # any failure, including a result that cannot be unpacked, is logged
        # at INFO level and treated as "nothing new, nothing removed"
        try:
            new_tags, remove_tags = self.update_contents_tags(cid, results, source_tags, vips, topn)
        except Exception, e:
            new_tags, remove_tags = [], []
            logger_tag.info('Fail to update contents tags, %s, %s' % (cid, e))
        if not update_only:
            for remove_tag in remove_tags:
                dbutil.update_company_tag(self.db, cid, remove_tag, 0, active="N")
        logger_tag.info('Processed %s, new tags %s, removed %s' % (cid, ','.join([str(tid) for tid in new_tags]),
                                                                   ','.join([str(tid) for tid in remove_tags])))

        # process gang tag (faction tags)
        gangtag_ids = self.gang.predict(cid)
        for gangtagid in gangtag_ids:
            dbutil.update_company_tag(self.db, cid, gangtagid, 1.001)

        # the review step is best-effort: failures are logged with traceback
        try:
            self.review(cid, contents)
        except Exception, e:
            logger_tag.exception('Review failed, %s, due to %s' % (cid, e))