def __init__(self, sector_setting='default'):
    """Load the sector/vip cluster dictionary and per-tag novelty weights.

    sector_setting: 'new' loads vip.cluster.frozen; any other value
    (including the default 'default') loads sector.cluster.frozen.
    Populates self.vips as {tag(lower): (vip, novelty / tag_count)}.
    """
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
    self.vips = {}
    # The original elif/else branches were identical; only 'new' selects
    # a different dictionary file, so collapse the choice to one expression.
    dict_name = ('vip.cluster.frozen' if sector_setting == 'new'
                 else 'sector.cluster.frozen')
    sector_setting_file = os.path.join(
        os.path.split(os.path.realpath(__file__))[0],
        '../common/dict', dict_name)
    db = dbcon.connect_torndb()
    for line in codecs.open(sector_setting_file, encoding='utf-8'):
        # Each line looks like: vip#tag1,tag2,...
        # Split once instead of twice per line.
        parts = line.split('#')
        vip = parts[0].lower()
        tags = parts[1].strip().split(',')
        for tag in tags:
            try:
                self.vips[tag.lower()] = (
                    vip,
                    dbutil.get_tag_novelty(db, tag, name=True) / len(tags))
            except Exception as e:
                # Best-effort: a tag whose novelty lookup fails is skipped,
                # matching the original behavior of logging and moving on.
                print('%s %s' % (tag, e))
def __init__(self):
    """Prepare segmentation helpers and empty id<->index mappings."""
    # Bidirectional mapping between external ids and internal indexes;
    # max_id tracks the next internal index to hand out.
    self.mapping_id2in, self.mapping_in2id = dict(), dict()
    self.max_id = 0
    self.default_filter = word_filter.get_default_filter()
    self.segmenter = Segmenter()
    self.feeder = Feeder()
def __init__(self):
    """Locate the ltp-cut company corpus and prepare segmentation helpers."""
    here = os.path.split(os.path.realpath(__file__))[0]
    # Pre-segmented (ltp_cut) company documents live under ../data.
    self.data_dir = os.path.join(here, '../data/tsb/company/ltp_cut')
    self.segmenter = Segmenter()
    # Bidirectional mapping between external ids and internal indexes.
    self.mapping_id2in, self.mapping_in2id = dict(), dict()
    self.max_id = 0
    self.default_filter = word_filter.get_default_filter()
def feed_doc(tag=u'金融'):
    """Yield, for each news article carrying *tag*, one iterator over the
    filtered, segmented words of all its non-empty content pieces."""
    mongo = dbcon.connect_mongo()
    segmenter = Segmenter(tag=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        # Strip each piece once, drop the empty ones, then segment+filter.
        segged = [wfilter(segmenter.cut(text))
                  for text in (p['content'].strip() for p in record['contents'])
                  if text]
        yield chain.from_iterable(segged)
def __init__(self):
    """Open DB connections and set the source-trust scoring constants."""
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    # Scoring knobs: content from untrusted sources is discounted,
    # brief-style content is promoted.
    self.non_trusted_discount = 0.5
    self.brief_promote = 1.5
    self.trusted_sources = dicts.get_known_company_source()
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
def feed_doc_s(sid):
    """Yield, for each news article in sector *sid*, one iterator over the
    filtered words the tagger marks as 'tag' or 'itag'."""
    mongo = dbcon.connect_mongo()
    tagger = Tagger(tags=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'sectors': sid}):
        filtered = []
        for piece in record['contents']:
            text = piece['content'].strip()
            if not text:
                continue
            # w[0] is the word, w[1] its tagger-assigned category.
            words = [w[0] for w in tagger.tag(text)
                     if w[1] in ('tag', 'itag')]
            filtered.append(wfilter(words))
        yield chain.from_iterable(filtered)
def __init__(self):
    """Load every model and dictionary the tag pipeline needs: word2vec,
    the fasttext vip classifier, tag thesauri, replacement rules and the
    scoring thresholds."""
    global word2vec_model, viptag_model_20171221
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    # Text-processing helpers.
    self.feeder = Feeder()
    self.tagger = Tagger(itags=True)
    self.seg = Segmenter(tags=True)
    self.wfilter = word_filter.get_default_filter()
    # Trained models.
    self.w2v = Word2Vec.load(word2vec_model)
    self.trained_tag_clfs = self.__load_trained_clfs()
    self.vip_classifier = fasttext.load_model(viptag_model_20171221)
    # Tag dictionaries.
    self.yellows = dbutil.get_yellow_tags(self.db)
    self.vip_tags = dict((t.name, t.id)
                         for t in dbutil.get_sectored_tags(self.db, 1))
    # vip tag name -> set of hyponym tag names.
    self.hyponym = {}
    for vip_name, vip_id in self.vip_tags.items():
        hypos = dbutil.get_hyponym_tags(self.db, vip_id)
        self.hyponym[vip_name] = set(
            dbutil.get_tag_name(self.db, tid) for tid in hypos)
    self.importants = {t.name.lower()
                       for t in dbutil.get_tags_by_type(self.db, [11011, 11013])}
    self.thesaurus = self.__load_tag_novelties()
    self.thesaurus_ids = self.__load_tag_novelties(tid=True)
    self.tag_types = self.__load_tag_types()
    self.trusted_sources = dicts.get_known_company_source()
    # Manual replacement rules kept in mongo: source tag name -> names.
    self.replacements = {}
    for r in self.mongo.keywords.replacement.find():
        src_name = dbutil.get_tag_name(self.db, r['source'])
        self.replacements[src_name] = [dbutil.get_tag_name(self.db, rtid)
                                       for rtid in r['replacement']]
    self.junk_terms = {tag.name
                       for tag in dbutil.get_tags_by_type(self.db, typeset=([11001]))}
    # Thresholds and knobs.
    self.similarity_threshold = 0.4
    self.textrank_window_size = 2
    self.textrank_threshold = 0
    self.source_tag_default_weight = 2
    self.vip_lower = 0.3
    self.important_threshold = 0.2
    self.important_max_count = 5
    print('model inited')
def __init__(self):
    """Load word2vec and set the important-tag selection thresholds."""
    global word2vec_model
    self.mongo = dbcon.connect_mongo()
    self.w2v = Word2Vec.load(word2vec_model)
    # Selection knobs for "important" tags.
    self.similarity_threshold = 0.4
    self.important_lower = 0.1
    self.important_threshold = 0.2
    self.important_max_num = 5
    self.wfilter = word_filter.get_default_filter()
    self.tagger = Tagger(tags=True)
def __init__(self):
    """Wire up DB/mongo connections, the news feeder, the vip-tag
    classifier and the company linker."""
    global viptag_model, logger_news_pip
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
    self.feeder = NewsFeeder()
    self.viptag_clf = fasttext.load_model(viptag_model)
    # Linker "life circle" counters — presumably a use-count before the
    # linker is refreshed; confirm against the consuming code.
    self.life_circle_linker = 100
    self.life_circle_linker_max = 100
    self.linker = CompanyLinker()
    logger_news_pip.info('Model inited')
def __init__(self):
    """Load every resource the extractor needs: word2vec, the fasttext
    vip/traditional classifiers, tag thesauri, replacement rules and the
    extraction thresholds."""
    global word2vec_model, viptag_model_20171221, viptag_model_traditional, logger_tag
    logger_tag.info('Extractor model initing')
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    # Text-processing helpers.
    self.feeder = Feeder()
    self.tagger = Tagger(itags=True)
    self.seg = Segmenter(itags=True)
    self.wfilter = word_filter.get_default_filter()
    self.gang = GangTag()
    # Trained models.
    self.w2v = Word2Vec.load(word2vec_model)
    self.vip_classifier = fasttext.load_model(viptag_model_20171221)
    self.traditional_classifier = fasttext.load_model(viptag_model_traditional)
    self.trained_tag_clfs = self.__load_trained_clfs()
    # Similarity / selection thresholds.
    self.similarity_threshold = 0.4
    self.chain_simi_threshold = 0.25
    self.important_lower = 0.1
    self.important_threshold = 0.2
    self.relevant_threshold = 0.4
    self.vip_lower = 0.3
    self.vip_threshold = 0.25
    self.important_max_num = 5
    self.max_contents_length = 20
    # Tag dictionaries.
    self.vip_tags = dict((t.name, t.id)
                         for t in dbutil.get_sectored_tags(self.db, 1))
    self.yellows = dbutil.get_yellow_tags(self.db)
    self.importants = {t.name.lower()
                       for t in dbutil.get_tags_by_type(self.db, [11011, 11013])}
    self.thesaurus = self.__load_weighted_tags()
    self.thesaurus_ids = self.__load_weighted_tags(tid=True)
    self.junk_terms = self.__load_junk_tags()
    # Manual replacement rules stored in mongo (source id -> replacements).
    self.replacements = dict((r['source'], r['replacement'])
                             for r in self.mongo.keywords.replacement.find())
    self.trusted_sources = dicts.get_known_company_source()
    self.general_tagger = GeneralTagger()
    logger_tag.info('Extractor model inited')
def load_ruled_news():
    """Build (trainx, trainy) numpy arrays from up to 10000 news items that
    have a rule-assigned category but no category confidence yet."""
    global labels
    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()
    trainx, trainy = [], []
    mongo = dbcon.connect_mongo()
    # Keep typed news (60001) with a usable category (not None and not
    # the excluded 60199/60106 categories) that still lacks confidence.
    query = {'$and': [{'category': {'$ne': None}},
                      {'category': {'$ne': 60199}},
                      {'category': {'$ne': 60106}}],
             'type': 60001,
             'category_confidence': None}
    for record in mongo.article.news.find(query).limit(10000):
        contents = wfilter(seg.cut(record['title']))
        body = ' '.join(piece['content'] for piece in record['contents'])
        contents.extend(wfilter(seg.cut(body)))
        # Drop very short documents.
        if len(contents) > 10:
            trainx.append(' '.join(contents))
            trainy.append(int(labels.get(record['category'])))
    mongo.close()
    return np.array(trainx), np.array(trainy)
def __init__(self):
    """Create the tagging segmenter and the project-default word filter."""
    self.wfilter = word_filter.get_default_filter()
    self.seg = Segmenter(tag=True)
continue_training=True) clf.fit(trainx, trainy) # prepare for simple sector cvipc = ClusterVIPClassifier() # prepare for mentioned company life_circle_linker = 100 life_circle_linker_max = 100 linker = CompanyLinker() # prepare for connection mongo = dbcon.connect_mongo() db = dbcon.connect_torndb() seg = Segmenter(tag=True) wfilter = word_filter.get_default_filter() logger_news_pip.info('start to process pending news') while True: for record in list( mongo.article.news.find({ 'type': { '$in': [60001, 60002, 60003] }, 'processStatus': 0 }).sort('date', pymongo.DESCENDING)): if record.get('source', 0) == 13022: mongo.article.news.update({'_id': record['_id']},
def __init__(self):
    """Open both data stores and prepare the segmentation helpers."""
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.wfilter = word_filter.get_default_filter()
    self.seg = Segmenter(tag=True)
def __init__(self, size_limit=None):
    """Open the torndb connection and prepare segmentation helpers.

    size_limit: optional cap stored on the instance (None means no cap).
    """
    self.db = dbcon.connect_torndb()
    self.size_limit = size_limit
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()