Ejemplo n.º 1
0
    def __init__(self, sector_setting='default'):

        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.vips = {}

        if sector_setting == 'new':
            sector_setting_file = os.path.join(
                os.path.split(os.path.realpath(__file__))[0],
                '../common/dict/vip.cluster.frozen')
        elif sector_setting == 'default':
            sector_setting_file = os.path.join(
                os.path.split(os.path.realpath(__file__))[0],
                '../common/dict/sector.cluster.frozen')
        else:
            sector_setting_file = os.path.join(
                os.path.split(os.path.realpath(__file__))[0],
                '../common/dict/sector.cluster.frozen')

        db = dbcon.connect_torndb()
        for line in codecs.open(sector_setting_file, encoding='utf-8'):
            vip, tags = line.split('#')[0].lower(), line.split(
                '#')[1].strip().split(',')
            for tag in tags:
                try:
                    self.vips[tag.lower()] = (
                        vip,
                        dbutil.get_tag_novelty(db, tag, name=True) / len(tags))
                except Exception, e:
                    print tag, e
Ejemplo n.º 2
0
    def __init__(self):
        """Create the segmenter, feeder and word filter; start with empty mappings."""
        # Helper objects.
        self.segmenter = Segmenter()
        self.feeder = Feeder()

        # Both mapping dicts start empty and max_id starts at 0.
        self.mapping_id2in, self.mapping_in2id = {}, {}
        self.max_id = 0

        self.default_filter = word_filter.get_default_filter()
Ejemplo n.º 3
0
    def __init__(self):
        """Resolve the data directory and create the segmenter, filter and mappings."""
        # The data directory is located relative to this source file.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_dir = os.path.join(module_dir, '../data/tsb/company/ltp_cut')

        self.segmenter = Segmenter()

        # Both mapping dicts start empty and max_id starts at 0.
        self.mapping_id2in, self.mapping_in2id = {}, {}
        self.max_id = 0

        self.default_filter = word_filter.get_default_filter()
Ejemplo n.º 4
0
def feed_doc(tag=u'金融'):
    """Yield one chained iterator of filtered segments per matching news record.

    Queries ``article.news`` for records whose 'tags' matches ``tag``. For
    each record, every piece in 'contents' whose stripped 'content' is
    non-empty is segmented and passed through the default word filter; the
    per-piece results are chained together and yielded.
    """
    mongo = dbcon.connect_mongo()
    segmenter = Segmenter(tag=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        filtered_pieces = []
        for piece in record['contents']:
            text = piece['content'].strip()
            if not text:
                # Pieces that are empty after stripping contribute nothing.
                continue
            filtered_pieces.append(wfilter(segmenter.cut(text)))
        yield chain(*filtered_pieces)
Ejemplo n.º 5
0
Archivo: feed.py Proyecto: yujiye/Codes
    def __init__(self):
        """Open the torndb and mongo connections and set the feeder's settings."""
        # Storage handles.
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        # Numeric settings and the known company sources.
        self.non_trusted_discount, self.brief_promote = 0.5, 1.5
        self.trusted_sources = dicts.get_known_company_source()

        # Text-processing helpers.
        default_filter = word_filter.get_default_filter()
        self.wfilter = default_filter
        self.seg = Segmenter(tag=True)
Ejemplo n.º 6
0
def feed_doc_s(sid):
    """Yield one chained iterator of filtered tag words per matching news record.

    Queries ``article.news`` for records whose 'sectors' matches ``sid``. For
    each piece with non-empty stripped 'content', the tagger output is
    reduced to the first element of every item whose second element is
    'tag' or 'itag', then passed through the default word filter.
    """
    mongo = dbcon.connect_mongo()
    tagger = Tagger(tags=True)
    wfilter = get_default_filter()
    wanted_kinds = ('tag', 'itag')
    for record in mongo.article.news.find({'sectors': sid}):
        filtered_pieces = []
        for piece in record['contents']:
            text = piece['content'].strip()
            if not text:
                # Pieces that are empty after stripping contribute nothing.
                continue
            words = [w[0] for w in tagger.tag(text) if w[1] in wanted_kinds]
            filtered_pieces.append(wfilter(words))
        yield chain(*filtered_pieces)
Ejemplo n.º 7
0
    def __init__(self):

        global word2vec_model, viptag_model_20171221
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(tags=True)
        self.wfilter = word_filter.get_default_filter()

        self.w2v = Word2Vec.load(word2vec_model)
        self.trained_tag_clfs = self.__load_trained_clfs()
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)

        self.yellows = dbutil.get_yellow_tags(self.db)
        self.vip_tags = {
            t.name: t.id
            for t in dbutil.get_sectored_tags(self.db, 1)
        }
        self.hyponym = {
            vip_name: set([
                dbutil.get_tag_name(self.db, tid)
                for tid in dbutil.get_hyponym_tags(self.db, vip_id)
            ])
            for vip_name, vip_id in self.vip_tags.iteritems()
        }
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, [11011, 11013]))
        self.thesaurus = self.__load_tag_novelties()
        self.thesaurus_ids = self.__load_tag_novelties(tid=True)
        self.tag_types = self.__load_tag_types()
        self.trusted_sources = dicts.get_known_company_source()
        self.replacements = {
            dbutil.get_tag_name(self.db, r['source']):
            [dbutil.get_tag_name(self.db, rtid) for rtid in r['replacement']]
            for r in self.mongo.keywords.replacement.find()
        }
        self.junk_terms = set(
            tag.name
            for tag in dbutil.get_tags_by_type(self.db, typeset=([11001])))

        self.similarity_threshold = 0.4
        self.textrank_window_size = 2
        self.textrank_threshold = 0
        self.source_tag_default_weight = 2
        self.vip_lower = 0.3
        self.important_threshold = 0.2
        self.important_max_count = 5

        print 'model inited'
Ejemplo n.º 8
0
Archivo: news.py Proyecto: yujiye/Codes
    def __init__(self):
        """Connect to mongo, load the word2vec model and set the thresholds.

        Reads the module-level name ``word2vec_model``, which is passed to
        ``Word2Vec.load``.
        """
        self.mongo = dbcon.connect_mongo()

        # Word-vector model and its similarity cut-off.
        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4

        # Settings for the "important" values.
        self.important_lower, self.important_threshold = 0.1, 0.2
        self.important_max_num = 5

        # Text-processing helpers.
        self.tagger = Tagger(tags=True)
        self.wfilter = word_filter.get_default_filter()
Ejemplo n.º 9
0
    def __init__(self):
        """Open the connections, build the helpers and load the vip-tag model.

        Reads the module-level names ``viptag_model`` (passed to
        ``fasttext.load_model``) and ``logger_news_pip`` (used to log
        'Model inited' at the end).
        """
        # Storage handles.
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        # Text-processing helpers.
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.feeder = NewsFeeder()

        self.viptag_clf = fasttext.load_model(viptag_model)

        # Both linker counters start at 100.
        self.life_circle_linker = self.life_circle_linker_max = 100
        self.linker = CompanyLinker()

        logger_news_pip.info('Model inited')
Ejemplo n.º 10
0
Archivo: key.py Proyecto: yujiye/Codes
    def __init__(self):
        """Set up the extractor: connections, models, thresholds and tag tables.

        Reads the module-level names ``word2vec_model``,
        ``viptag_model_20171221``, ``viptag_model_traditional`` and
        ``logger_tag``; one message is logged before and one after set-up.
        """
        logger_tag.info('Extractor model initing')

        # Storage handles.
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

        # Text-processing helpers.
        self.feeder = Feeder()
        self.tagger = Tagger(itags=True)
        self.seg = Segmenter(itags=True)
        self.wfilter = word_filter.get_default_filter()

        self.gang = GangTag()

        # Word-vector model and its similarity cut-offs.
        self.w2v = Word2Vec.load(word2vec_model)
        self.similarity_threshold = 0.4
        self.chain_simi_threshold = 0.25

        # Vip tag name -> id, followed by the classifiers.
        vip_tags = {}
        for sectored in dbutil.get_sectored_tags(self.db, 1):
            vip_tags[sectored.name] = sectored.id
        self.vip_tags = vip_tags
        self.vip_classifier = fasttext.load_model(viptag_model_20171221)
        self.traditional_classifier = fasttext.load_model(
            viptag_model_traditional)
        self.trained_tag_clfs = self.__load_trained_clfs()

        # Numeric settings.
        self.important_lower = 0.1
        self.important_threshold = 0.2
        self.relevant_threshold = 0.4
        self.vip_lower = 0.3
        self.vip_threshold = 0.25
        self.important_max_num = 5
        self.max_contents_length = 20

        # Tag tables.
        self.yellows = dbutil.get_yellow_tags(self.db)
        important_types = [11011, 11013]
        self.importants = set(
            t.name.lower()
            for t in dbutil.get_tags_by_type(self.db, important_types))
        self.thesaurus = self.__load_weighted_tags()
        self.thesaurus_ids = self.__load_weighted_tags(tid=True)
        self.junk_terms = self.__load_junk_tags()

        # 'source' -> 'replacement', taken verbatim from the mongo collection.
        replacements = {}
        for r in self.mongo.keywords.replacement.find():
            replacements[r['source']] = r['replacement']
        self.replacements = replacements

        self.trusted_sources = dicts.get_known_company_source()

        self.general_tagger = GeneralTagger()

        logger_tag.info('Extractor model inited')
Ejemplo n.º 11
0
def load_ruled_news():
    """Build a training set from already-categorised news records.

    Selects up to 10000 ``article.news`` records with type 60001, a
    'category_confidence' of None, and a 'category' that is not None,
    60199 or 60106. For each record the title and the space-joined piece
    contents are segmented and filtered; records yielding more than 10
    filtered items contribute one space-joined sample and one label, the
    label being ``int(labels.get(category))`` with ``labels`` read from
    module scope.

    Returns:
        A tuple ``(trainx, trainy)`` of numpy arrays holding the samples
        and their labels.
    """
    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()
    trainx, trainy = [], []

    query = {
        '$and': [
            {'category': {'$ne': None}},
            {'category': {'$ne': 60199}},
            {'category': {'$ne': 60106}},
        ],
        'type': 60001,
        'category_confidence': None,
    }

    mongo = dbcon.connect_mongo()
    try:
        for record in mongo.article.news.find(query).limit(10000):
            contents = wfilter(seg.cut(record['title']))
            body = ' '.join(
                [piece['content'] for piece in record['contents']])
            contents.extend(wfilter(seg.cut(body)))
            if len(contents) > 10:
                trainx.append(' '.join(contents))
                trainy.append(int(labels.get(record['category'])))
    finally:
        # Release the connection even if a record fails to process.
        mongo.close()

    return np.array(trainx), np.array(trainy)
Ejemplo n.º 12
0
Archivo: feed.py Proyecto: yujiye/Codes
    def __init__(self):
        """Create a Segmenter built with tag=True and the default word filter."""
        segmenter = Segmenter(tag=True)
        default_filter = word_filter.get_default_filter()
        self.seg, self.wfilter = segmenter, default_filter
Ejemplo n.º 13
0
                                    continue_training=True)
    clf.fit(trainx, trainy)

    # prepare for simple sector
    cvipc = ClusterVIPClassifier()

    # prepare for mentioned company
    life_circle_linker = 100
    life_circle_linker_max = 100
    linker = CompanyLinker()

    # prepare for connection
    mongo = dbcon.connect_mongo()
    db = dbcon.connect_torndb()
    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()

    logger_news_pip.info('start to process pending news')

    while True:

        for record in list(
                mongo.article.news.find({
                    'type': {
                        '$in': [60001, 60002, 60003]
                    },
                    'processStatus': 0
                }).sort('date', pymongo.DESCENDING)):

            if record.get('source', 0) == 13022:
                mongo.article.news.update({'_id': record['_id']},
Ejemplo n.º 14
0
Archivo: w2v.py Proyecto: yujiye/Codes
    def __init__(self):
        """Open the mongo and torndb connections and create the text helpers."""
        # Storage handles (mongo first, then torndb).
        mongo_handle = dbcon.connect_mongo()
        torndb_handle = dbcon.connect_torndb()
        self.mongo, self.db = mongo_handle, torndb_handle

        # Text-processing helpers.
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
Ejemplo n.º 15
0
Archivo: w2v.py Proyecto: yujiye/Codes
    def __init__(self, size_limit=None):
        """Open the torndb connection, create the text helpers, store size_limit.

        Args:
            size_limit: stored unchanged on the instance; defaults to None.
        """
        self.db = dbcon.connect_torndb()

        # Text-processing helpers.
        segmenter = Segmenter(tag=True)
        self.seg = segmenter
        self.wfilter = word_filter.get_default_filter()

        self.size_limit = size_limit