Example #1
    def __init__(self, model='default'):

        # self.segmenter = Segmenter()
        self.tagger = Tagger(cname=True)
        # self.extractor = Extractor(record=False)
        if model == 'easy':
            self.clf = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/backup/news.score.lrmodel'))
            self.vec = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/backup/news.featurizer'))
        else:
            self.clf = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/news.score.lrmodel'))
            self.vec = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/news.featurizer'))
        self.nf = NewsFeatures()
        self.db = dbcon.connect_torndb()
Example #2
    def __init__(self):

        self.db = dbcon.connect_torndb()
        self.tagger = Tagger()
        self.stopwords = stopword.get_standard_stopwords()
        self.source = json.load(codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                         'dumps/news.domain'), encoding='utf-8'))
        self.failed = 0
Example #3
def feed_doc_s(sid):

    mongo = dbcon.connect_mongo()
    tagger = Tagger(tags=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'sectors': sid}):
        yield chain(*[
            wfilter([
                w[0] for w in tagger.tag(piece['content'].strip())
                if w[1] in ('tag', 'itag')
            ]) for piece in record['contents'] if piece['content'].strip()
        ])
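
A plausible way to consume this generator (not shown in the source) is to materialize each document's token chain and feed the stream to gensim's Word2Vec, which later examples load via Word2Vec.load. A minimal sketch against the older gensim API (size rather than vector_size); the sector id and output path are illustrative:

from gensim.models import Word2Vec

# each yielded item is an itertools.chain of filtered tag words;
# gensim expects a list of token lists, so materialize the chains
docs = [list(doc) for doc in feed_doc_s(1)]
model = Word2Vec(docs, size=100, window=5, min_count=5, workers=4)
model.save('dumps/sector.w2v')  # illustrative path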
Example #4
    def __init__(self):

        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()

        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {
            t.name: t.id
            for t in dbutil.get_sectored_tags(self.db, 1)
        }
        self.hyponym = {
            vip_name: set([
                dbutil.get_tag_name(self.db, tid)
                for tid in dbutil.get_hyponym_tags(self.db, vip_id)
            ])
            for vip_name, vip_id in self.vip_tags.iteritems()
        }
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
            [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()
        }
        self.junk_terms = set(
            tag.name
            for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))

        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5

        print 'model inited'
Example #5
class NewsFeatures(object):

    def __init__(self):

        self.db = dbcon.connect_torndb()
        self.tagger = Tagger()
        self.stopwords = stopword.get_standard_stopwords()
        self.source = json.load(codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                                         'dumps/news.domain'), encoding='utf-8'))
        self.failed = 0

    def featurize(self, cid, **kwargs):

        features = {}
        instance = dict(**kwargs)
        if instance.get('name') and instance.get('name_update', True):
            self.tagger.add_word(instance.get('name'), tag='cn')
        name, title, content, link = instance.get('name'), instance.get('title'), instance.get('content'), \
                                     instance.get('link')

        if title and title.strip():
            title = self.tagger.tag(title)
            matches = [x[0] for x in title if x[1] == 'cn']
            features['title_ne'] = matches.count(name)
        if content and content.strip():
            content = list(self.tagger.tag(content))
            # content ne
            matches = [x[0] for x in content if x[1] == 'cn']
            features['content_ne'] = round(float(matches.count(name))/len(content), 4)
            # features['content_length'] = len(content)
            # content similarity
            try:
                odesc = Counter([item[0] for item in self.tagger.tag(dbutil.get_company_solid_description(self.db, cid))
                                 if item[0].strip() and item[0] not in self.stopwords and len(item[0]) > 1 and
                                 not item[0].isnumeric()])
                idesc = Counter([item[0] for item in content if item[0].strip() and item[0] not in self.stopwords
                                 and len(item[0]) > 1 and not item[0].isnumeric()])
                length = max(min(50, len(idesc), len(odesc)), 20)
                odesc, idesc = odesc.most_common(length), idesc.most_common(length)
                odesc.extend([(x[1], x[2]*5) for x in dbutil.get_company_tags_idname(self.db, cid)])

                simi = weighted_jaccard(odesc, idesc)
                # if simi > 0.05:
                #     print simi, cid, instance
                features['content_simi'] = simi
            except Exception:
                self.failed += 1
        # if link and link.strip():
        #     features['source'] = self.source.get(urlparse.urlparse(link).netloc, 0)

        return features
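
weighted_jaccard is called above but not defined in this snippet. A minimal sketch of the standard weighted Jaccard similarity over (term, weight) pairs (the shape Counter.most_common produces), which is presumably what the featurizer expects:

def weighted_jaccard(a, b):
    """Weighted Jaccard: sum of min weights over sum of max weights."""
    wa, wb = dict(a), dict(b)
    union = set(wa) | set(wb)
    denom = sum(max(wa.get(t, 0), wb.get(t, 0)) for t in union)
    if not denom:
        return 0.0
    return sum(min(wa.get(t, 0), wb.get(t, 0)) for t in union) / float(denom)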
Example #6
    def __init__(self):

        global word2vec_model

        self.mongo = dbcon.connect_mongo()

        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4

        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.important_max_num = 5

        self.tagger = Tagger(tags=True)
        self.wfilter = word_filter.get_default_filter()
Example #7
class RelatednessScorer(object):

    global logger_relate

    def __init__(self, model='default'):

        # self.segmenter = Segmenter()
        self.tagger = Tagger(cname=True)
        # self.extractor = Extractor(record=False)
        if model == 'easy':
            self.clf = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/backup/news.score.lrmodel'))
            self.vec = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/backup/news.featurizer'))
        else:
            self.clf = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/news.score.lrmodel'))
            self.vec = joblib.load(
                os.path.join(
                    os.path.split(os.path.realpath(__file__))[0],
                    'dumps/news.featurizer'))
        self.nf = NewsFeatures()
        self.db = dbcon.connect_torndb()

    def compare(self, cid, **kwargs):

        logger_relate.info('Compare news of company#%s' % cid)
        instance = dict(**kwargs)
        if instance.get('name'):
            self.tagger.add_word(instance.get('name'), tag='cn')
        name, title, content = instance.get('name'), instance.get(
            'title'), instance.get('content')

        if not content or len(content) <= 20:
            return False, 0
        # print self.nf.featurize(cid, name=name, title=title, content=content)
        news = self.vec.transform(
            [self.nf.featurize(cid, name=name, title=title, content=content)])
        result = self.clf.predict_proba(news)[0]
        return (float(result[1]) > float(result[0])), round(result[1], 4)
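
A hypothetical invocation (the company id and texts are placeholders): compare returns an (is_related, probability) pair, so it can gate a news article before it is attached to a company:

scorer = RelatednessScorer()
related, prob = scorer.compare(
    'some-company-id',  # placeholder cid
    name=u'example-company', title=u'a news title',
    content=u'a news body longer than twenty characters ...')
if related:
    print('attach news with confidence %s' % prob)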
Example #8
    def __init__(self):

        global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
        logger_tag.info('Extractor model initing')

        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(itags=True)
        self.wfilter = word_filter.get_default_filter()

        self.gang = GangTag()

        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.chain_simi_threshold = 0.25

        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
        self.trained_tag_clfs = self.__load_trained_clfs()

        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.relevant_threshold = 0.4
        self.vip_lower = 0.3
        self.vip_threshold = 0.25
        self.important_max_num = 5
        self.max_contents_length = 20

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_weighted_tags()
        self.thesaurus_ids = self.__load_weighted_tags(tid=True)
        self.junk_terms = self.__load_junk_tags()
        self.replacements = {r['source']: r['replacement'] for r in self.mongo.keywords.replacement.find()}

        self.trusted_sources = dicts.get_known_company_source()

        self.general_tagger = GeneralTagger()

        logger_tag.info('Extractor model inited')
Example #9
class Extractor(object):

    def __init__(self):

        global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
        logger_tag.info('Extractor model initing')

        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(itags=True)
        self.wfilter = word_filter.get_default_filter()

        self.gang = GangTag()

        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.chain_simi_threshold = 0.25

        self.vip_tags = {t.name: t.id for t in dbutil.get_sectored_tags(self.db, 1)}
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
        self.trained_tag_clfs = self.__load_trained_clfs()

        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.relevant_threshold = 0.4
        self.vip_lower = 0.3
        self.vip_threshold = 0.25
        self.important_max_num = 5
        self.max_contents_length = 20

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.importants = set(t.name.lower() for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_weighted_tags()
        self.thesaurus_ids = self.__load_weighted_tags(tid=True)
        self.junk_terms = self.__load_junk_tags()
        self.replacements = {r['source']: r['replacement'] for r in self.mongo.keywords.replacement.find()}

        self.trusted_sources = dicts.get_known_company_source()

        self.general_tagger = GeneralTagger()

        logger_tag.info('Extractor model inited')

    def __load_trained_clfs(self):

        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        return {175747: joblib.load(os.path.join(model_dir, '175747.20180311.model'))}

    def __extract_source_tag(self, cid):

        tags = dbutil.get_source_company_tags(self.db, cid, self.trusted_sources)
        if tags:
            return set(chain(*[dbutil.analyze_source_tag(self.db, tname, self.replacements)
                               for tname in tags if tname and tname.strip()]))
        return set([])

    def __extract_important(self, contents, source_tags=None):

        # candidate generation
        candidates = {} if not source_tags else {}.fromkeys(source_tags, 1)
        for content, weight in contents:
            for tag in [x[0] for x in self.tagger.tag(content) if x[1] == 'itag' or x[0] in self.importants]:
                candidates[tag] = candidates.get(tag, 0) + weight
        if len(candidates) < 1:
            return {}

        # support assignment
        content_length = 0
        supports = {}
        for index, (content, dweight) in enumerate(contents):
            for word in self.wfilter([x[0] for x in self.tagger.tag(content)]):
                if word not in self.w2v:
                    continue
                content_length += 1
                for candidate in candidates.keys():
                    if candidate not in self.w2v:
                        continue
                    similarity = self.w2v.similarity(candidate, word)
                    if similarity > self.similarity_threshold:
                        supports.setdefault(candidate, []).append((index, dweight, similarity))
        # for k, v in supports.iteritems():
        #     print k, v

        # support selection
        results = {}
        csize = len(candidates)
        for candidate, v in supports.iteritems():
            # if (csize >= 2) and \
            #         (sum([y[1] for y in set([(x[0], x[1]) for x in v])]) < min(6, ceil(float(len(contents))/3))):
            #     continue
            support = sum([round(item[1]*item[2], 2) for item in v])
            if csize >= 2 and sum([round(item[2], 2) for item in v]) < content_length / 20:
                continue
            results[candidate] = support * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results

        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            # if round(v/normalizer, 2) < self.important_lower:
            #     continue
            results[k] = round(v/normalizer, 2)

        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(filter(lambda x: x[1] > self.important_threshold, results.iteritems()))
            if len(results) > self.important_max_num:
                size = min(10, max(int(ceil(len(results)/2.0)), self.important_max_num))
                results = dict(sorted(results.iteritems(), key=lambda x: -x[1])[:size])

        return results

    def __extract_vectorrank(self, contents):

        pass

    def __extract_textrank(self, contents, topn=15):

        """
        weighted textrank, weights use tags' novelties
        """

        global textrank_window_size, textrank_threshold

        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        # filter
        candidates = self.wfilter(candidates)
        # print ' '.join(candidates)
        if len(candidates) < 5:
            return

        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)

        for i in xrange(len(candidates)):
            for j in xrange(i+1, i+textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(), key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract(self, cid, topn=15, fast=False, update_only=False):

        # general tag
        new_general = self.general_tagger.label(cid)
        if new_general:
            logger_tag.info('General Tag of %s, %s' % (cid, ','.join([str(tid) for tid in new_general])))

        contents = list(self.feeder.feed(cid, quanlity='medium'))
        results = {}
        if len(contents) > self.max_contents_length:
            contents = sorted(contents, key=lambda x: -x[1])[:self.max_contents_length]
        # source tags
        source_tags = self.__extract_source_tag(cid)
        # print ','.join(source_tags)
        # results = self.merge(results, {}.fromkeys(source_tags, 0.5))
        # important tag
        results = self.merge(results, self.__extract_important(contents, source_tags), 1)
        # regular tag
        results = self.merge(results, dict(self.__extract_textrank(contents, topn)))
        # verified tag
        results = self.merge(results, dict.fromkeys(dbutil.get_company_tags_verified(self.db, cid), 1))
        # topic tag
        results = self.merge(results, dict.fromkeys(dbutil.get_company_topics_tags(self.db, cid), 1.5))
        # normalize
        results = self.__normalize(results)
        # vip tags
        vips = self.update_vip_tags(cid, results, source_tags)
        # update contents based tags
        results = self.__normalize_replacement(results)
        try:
            new_tags, remove_tags = self.update_contents_tags(cid, results, source_tags, vips, topn)
        except Exception, e:
            new_tags, remove_tags = [], []
            logger_tag.info('Fail to update contents tags, %s, %s' % (cid, e))
        if not update_only:
            for remove_tag in remove_tags:
                dbutil.update_company_tag(self.db, cid, remove_tag, 0, active="N")
        logger_tag.info('Processed %s, new tags %s, removed %s' % (cid, ','.join([str(tid) for tid in new_tags]),
                                                                   ','.join([str(tid) for tid in remove_tags])))

        # process gang tags (派系标签, i.e. faction tags)
        gangtag_ids = self.gang.predict(cid)
        for gangtagid in gangtag_ids:
            dbutil.update_company_tag(self.db, cid, gangtagid, 1.001)

        try:
            self.review(cid, contents)
        except Exception, e:
            logger_tag.exception('Review failed, %s, due to %s' % (cid, e))
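
UndirectWeightedGraph is used by the textrank and vecrank extractors but never shown. A minimal sketch, modeled on jieba's TextRank graph, assuming rank() runs a few PageRank iterations and then scales each node's score by its novelty (plus the optional promotion map that __extract_vecrank in the later example passes):

import collections

class UndirectWeightedGraph(object):

    d = 0.85  # damping factor

    def __init__(self):
        self.graph = collections.defaultdict(list)

    def add_edge(self, start, end, weight):
        # undirected: register the edge in both directions
        self.graph[start].append((end, weight))
        self.graph[end].append((start, weight))

    def rank(self, novelties, promotions=None, iterations=10):
        if not self.graph:
            return {}
        ws = dict.fromkeys(self.graph, 1.0 / len(self.graph))
        outsum = dict((n, sum(w for _, w in edges) or 1.0)
                      for n, edges in self.graph.items())
        for _ in range(iterations):
            for node in self.graph:
                s = sum(w / outsum[other] * ws[other]
                        for other, w in self.graph[node])
                ws[node] = (1 - self.d) + self.d * s
        # weight each node by its novelty and any promotion bonus
        for node in ws:
            bonus = promotions.get(node, 1) if promotions else 1
            ws[node] = ws[node] * novelties.get(node, 1) * bonus
        return ws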
Example #10
class SubSector(object):
    def __init__(self):

        global word2vec_model

        self.mongo = dbcon.connect_mongo()

        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4

        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.important_max_num = 5

        self.tagger = Tagger(tags=True)
        self.wfilter = word_filter.get_default_filter()

    def train(self):

        pass

    def extract_tag(self, nid):

        return self.__extract_itag(nid)

    def __extract_itag(self, nid):

        contents = list(self.mongo.article.news.find({"_id": nid}))[0]['contents']

        # candidate generation
        candidates = {}
        for content in contents:
            for tag in [
                    x[0] for x in self.tagger.tag(content['content'])
                    if x[1] == 'itag' and x[0] in self.w2v.vocab
            ]:
                candidates[tag] = candidates.get(tag, 0)

        # support assignment
        total = 0
        supports = {}
        for index, content in enumerate(contents):
            for word in self.wfilter(
                [x[0] for x in self.tagger.tag(content['content'])]):
                if word not in self.w2v.vocab:
                    continue
                total += 1
                for candidate in candidates.keys():
                    similarity = self.w2v.similarity(candidate, word)
                    if similarity > self.similarity_threshold:
                        supports.setdefault(candidate, []).append(
                            (index, 1, similarity))

        # support selection
        results = {}
        for candidate, v in supports.iteritems():
            # if (csize >= 2) and \
            #         (sum([y[1] for y in set([(x[0], x[1]) for x in v])]) < min(6, ceil(float(len(contents))/3))):
            #     continue
            support = sum([round(item[1] * item[2], 2) for item in v])
            results[candidate] = support
        if len(results) == 0:
            return results

        # normalization, max weight equals 1
        normalizer = max(results.values())
        for k, v in results.items():
            # if round(v/normalizer, 2) < self.important_lower:
            #     continue
            results[k] = round(v / normalizer, 2)

        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(
                filter(lambda x: x[1] > self.important_threshold,
                       results.iteritems()))
            if len(results) > self.important_max_num:
                size = min(
                    10,
                    max(int(ceil(len(results) / 2.0)), self.important_max_num))
                results = dict(
                    sorted(results.iteritems(), key=lambda x: -x[1])[:size])

        return results
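
A hypothetical call (the ObjectId is a dummy placeholder): extract_tag takes the Mongo _id of a news document and returns a {tag: weight} dict normalized so the strongest tag has weight 1:

from bson import ObjectId

ss = SubSector()
for tag, weight in ss.extract_tag(ObjectId('5a0000000000000000000000')).items():
    print('%s\t%.2f' % (tag, weight))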
Example #11
import sys
sys.path.append('..')
# Python 2: site.py deletes sys.setdefaultencoding, so reload sys to restore it
reload(sys)
sys.setdefaultencoding('utf-8')

import torndb
import os
import codecs
import time
import re
from random import random
from common import nlpconfig
from common.zhtools import stopword
from common.zhtools import hants
from common.zhtools.postagger import Tagger


tagger = Tagger('ltp')
doc_len_threshold = 10
year = re.compile(u'\d+年')
month = re.compile(u'\d+月')
day = re.compile(u'\d+日')
stopwords = stopword.get_standard_stopwords()


class Corpus(object):

    def __init__(self, extrc_func=None, dirs=None):

        self.extract = extrc_func if extrc_func else lambda x: x.strip()
        self.dirs = dirs

    def __iter__(self):
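        # body truncated in the source; a minimal sketch of what such a
        # corpus iterator typically does, assuming self.dirs holds
        # directories of utf-8 text files with one document per line
        for d in self.dirs or []:
            for fname in os.listdir(d):
                for line in codecs.open(os.path.join(d, fname), encoding='utf-8'):
                    doc = self.extract(line)
                    if doc and len(doc) >= doc_len_threshold:
                        yield doc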
Example #12
class KeywordExtractor(object):
    def __init__(self):

        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()

        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {
            t.name: t.id
            for t in dbutil.get_sectored_tags(self.db, 1)
        }
        self.hyponym = {
            vip_name: set([
                dbutil.get_tag_name(self.db, tid)
                for tid in dbutil.get_hyponym_tags(self.db, vip_id)
            ])
            for vip_name, vip_id in self.vip_tags.iteritems()
        }
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
            [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()
        }
        self.junk_terms = set(
            tag.name
            for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))

        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5

        print 'model inited'

    def __load_trained_clfs(self):

        model_dir = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'models')
        clfs = {}
        for model_file in os.listdir(model_dir):
            if model_file.endswith('.model'):
                tid = model_file.split('.')[0]
                # split() yields a string, so the original isinstance(tid, int)
                # check was always False and no model ever loaded; test digits instead
                if not tid.isdigit():
                    continue
                clfs[dbutil.get_tag_name(self.db, int(tid))] = joblib.load(
                    os.path.join(model_dir, model_file))
        return clfs

    def __load_tag_novelties(self, tid=False):

        if not tid:
            return {
                tag.name: (tag.novelty or 1)
                for tag in dbutil.get_tags_by_type(self.db)
            }
        else:
            return {
                tag.id: (tag.novelty or 1)
                for tag in dbutil.get_tags_by_type(self.db)
            }

    def __load_tag_types(self):

        return {
            tag.name: (tag.type or 0)
            for tag in dbutil.get_tags_by_type(self.db)
        }

    def __extract_source_tag(self, cid):

        tags = dbutil.get_source_company_tags(self.db, cid,
                                              self.trusted_sources)
        if tags:
            return set(
                chain(*[
                    dbutil.analyze_source_tag(
                        self.db, tname, self.replacements) for tname in tags
                    if tname and tname.strip()
                ]))
        return set([])

    def __extract_vecrank(self, candidates, candidates_important,
                          candidates_vips, topn):

        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        proper_hyponym = dict.fromkeys(
            set(
                chain(*[
                    self.hyponym.get(dbutil.get_tag_name(self.db, cv), set())
                    for cv in candidates_vips.iterkeys()
                ])), 2)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
            if candidates[i] not in self.w2v:
                continue
            for word, weight in candidates_important.items():
                if word == candidates[i] or word not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidates[i], word)
                if similarity > self.similarity_threshold:
                    weights[(candidates[i], word)] += similarity * weight
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus, proper_hyponym)
        topn = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(),
                                  key=lambda x: -x[1])[:topn]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def extract_vip(self, cid):

        desc = ' '.join(
            self.wfilter(
                self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
        if not desc:
            return {}
        classifier_vips = [
            (int(tag.replace(u'__label__', '')), weight)
            for (tag,
                 weight) in self.vip_classifier.predict_proba([desc], 2)[0]
            if weight > self.vip_lower
        ]
        classifier_vips.sort(key=lambda x: -x[1])
        # if there are two candidate vip labels, keep only the top one
        # unless their probabilities are comparable
        if len(classifier_vips) == 2 and classifier_vips[0][1] > classifier_vips[1][1] * 2:
            return {classifier_vips[0][0]: classifier_vips[0][1]}
        return dict(classifier_vips)

    def __extract_important(self, contents, candidates):

        # support assignment
        supports = deepcopy(candidates)
        for word in contents:
            if word not in self.w2v:
                continue
            for candidate in candidates.keys():
                if candidate not in self.w2v:
                    continue
                similarity = self.w2v.similarity(candidate, word)
                if similarity > self.similarity_threshold:
                    supports[candidate] = supports.get(candidate,
                                                       0) + similarity
        # support selection
        results = {}
        candi_size, content_size = len(candidates), len(''.join(candidates))
        for candidate, weight in supports.iteritems():
            if candi_size >= 2 and weight < content_size / 20:
                continue
            results[candidate] = weight * self.thesaurus.get(candidate, 1)
        if len(results) == 0:
            return results
        # normalization
        normalizer = max(results.values())
        for k, v in results.items():
            results[k] = round(v / normalizer, 2)
        # narrow down results size
        if len(results) < 4:
            pass
        else:
            results = dict(
                filter(lambda x: x[1] > self.important_threshold,
                       results.iteritems()))
            if len(results) > self.important_max_count:
                size = min(
                    10,
                    max(int(ceil(len(results) / 2.0)),
                        self.important_max_count))
                results = dict(
                    sorted(results.iteritems(), key=lambda x: -x[1])[:size])
        return results

    def __extract_textrank(self, candidates, topn=15):
        """
        weighted textrank, weights use tags' novelties
        """
        graph = UndirectWeightedGraph()
        weights = collections.defaultdict(int)
        for i in xrange(len(candidates)):
            for j in xrange(i + 1, i + self.textrank_window_size):
                if j >= len(candidates):
                    break
                weights[(candidates[i], candidates[j])] += 1
        for terms, weight in weights.iteritems():
            graph.add_edge(terms[0], terms[1], weight)
        nodes_rank = graph.rank(self.thesaurus)
        index = min(topn, len(candidates))
        start = 0
        for tag, weight in sorted(nodes_rank.items(),
                                  key=lambda x: -x[1])[:index]:
            if tag in self.junk_terms:
                continue
            if start < 2:
                yield tag, round(weight, 2)
            elif weight >= self.textrank_threshold:
                yield tag, round(weight, 2)
            start += 1

    def __prepare_tag_contents(self, cid):

        # prepare contents
        contents = list(self.feeder.feed(cid, quanlity='medium'))
        candidates = []
        for content, _ in contents:
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        source_tags = self.__extract_source_tag(cid)
        candidates_important = {}
        for content, weight in contents:
            for tag in [
                    x[0] for x in self.tagger.tag(content)
                    if x[1] == 'itag' or x[0] in self.importants
            ]:
                candidates_important[tag] = candidates_important.get(
                    tag, 0) + weight
        for tag in source_tags:
            candidates_important[tag] = candidates_important.get(
                tag, 0) + self.source_tag_default_weight

        return source_tags, candidates, candidates_important

    def __normalize_replacement(self, tags):

        if type(tags) is dict:
            normalized_tags = {}
            for tag, weight in tags.items():
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags[replacement] = weight
                else:
                    normalized_tags[tag] = weight
        else:
            normalized_tags = []
            for tag in tags:
                if tag in self.replacements:
                    for replacement in self.replacements.get(tag):
                        normalized_tags.append(replacement)
                else:
                    normalized_tags.append(tag)
        return normalized_tags

    def __normalize(self, d):

        if not d:
            return d
        normalizer = max(d.values()) + 1.0
        for tag, weight in d.items():
            type_promotion = {
                11011: 1,
                11013: 1.5,
                11012: 2.5
            }.get(self.tag_types.get(tag, 0), 0)
            d[tag] = round(weight / normalizer, 2) + type_promotion
        return d

    def merge(self, d1, d2, weight=0):

        # weight is a flat bonus added to every merged key
        for k, v in d2.iteritems():
            d1[k] = d1.get(k, 0) + v + weight
        return d1

    def extract(self, cid, topn=15):

        # prepare contents
        source_tags, candidates, candidates_important = self.__prepare_tag_contents(
            cid)
        candidates_vips = self.extract_vip(cid)

        # generate results
        results = dict(
            self.__extract_vecrank(candidates, candidates_important,
                                   candidates_vips, topn))
        results = self.merge(
            results, {
                dbutil.get_tag_name(self.db, tid): w
                for tid, w in candidates_vips.iteritems()
            })
        # results = self.merge(results, self.__extract_important(candidates, candidates_important), 1)
        # results = self.merge(results, dict(self.__extract_textrank(candidates, topn)))
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        return results

    def extract_from_text(self, text):

        candidates = []
        for content, _ in text.iteritems():
            candidates.extend([x[0] for x in self.tagger.tag(content)])
        candidates = self.wfilter(candidates)
        candidates_important = {}
        for content, weight in text.iteritems():
            for tag in [
                    x[0] for x in self.tagger.tag(content)
                    if x[1] == 'itag' or x[0] in self.importants
            ]:
                candidates_important[tag] = candidates_important.get(
                    tag, 0) + weight
        desc = ' '.join(
            self.wfilter(self.seg.cut4search(' '.join(text.keys()))))
        candidates_vips = {
            int(tag.replace(u'__label__', '')): weight
            for (tag,
                 weight) in self.vip_classifier.predict_proba([desc], 3)[0]
            if weight > self.vip_lower
        }
        results = {}
        results = self.merge(
            results, self.__extract_important(candidates,
                                              candidates_important), 1)
        results = self.merge(results,
                             dict(self.__extract_textrank(candidates, 10)))
        # results = dict(self.__extract_vecrank(candidates, candidates_important, candidates_vips, 10))
        results = self.merge(
            results, {
                dbutil.get_tag_name(self.db, tid): w
                for tid, w in candidates_vips.iteritems()
            })
        results = self.__normalize(results)
        results = self.__normalize_replacement(results)
        deducts = self.__deduct_2nd(results)
        if len(deducts) < 3:
            results = self.merge(results, deducts)
        return results

    def __deduct_2nd(self, tags):

        deduct = []
        tags = [(dbutil.get_tag_id(self.db, t)[0], t) for t in tags.keys()]
        for (tid, tag) in tags:
            if self.tag_types.get(tag, 0) == 11013:
                t1s = dbutil.get_hypernym_tags(self.db, tid, 1)
                for t1 in set(t1s) & set([t[0] for t in tags]):
                    t2s = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & set(
                        dbutil.get_hypernym_tags(self.db, tid, 2))
                    for t2 in t2s:
                        if t2 not in set([t[0] for t in tags]):
                            deduct.append(t2)
        return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduct}
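
A hypothetical end-to-end run (the company id is a placeholder): extract works from a company id stored in torndb, while extract_from_text works directly on a {content: weight} mapping; both return a {tag_name: weight} dict after normalization and replacement:

ke = KeywordExtractor()

# by company id (placeholder)
print(ke.extract('some-company-id', topn=15))

# from raw weighted contents
weighted_text = {u'a company description ...': 2.0, u'a related news body ...': 1.0}
print(ke.extract_from_text(weighted_text))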