class NewsFeeder(object):

    def __init__(self):
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def feed(self, record, granularity='default'):
        global logger_feeder
        try:
            contents = self.wfilter(self.seg.cut(record['title'].replace('\n', ' ')))
            tags = record.get('original_tags', [])
            if tags and isinstance(tags, list):
                contents.extend(tags)
            # 'fine' granularity tokenizes the body with cut4search for finer-grained tokens
            if granularity == 'fine':
                contents.extend(self.wfilter(self.seg.cut4search(
                    ' '.join([piece['content'].replace('\n', ' ')
                              for piece in record['contents']]))))
            else:
                contents.extend(self.wfilter(self.seg.cut(
                    ' '.join([piece['content'].replace('\n', ' ')
                              for piece in record['contents']]))))
            return contents
        except Exception, e:
            logger_feeder.error('Fail to feed, %s, %s' % (record['_id'], e))
            return []
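# Usage sketch for NewsFeeder (illustrative only): the record layout below is
# inferred from the fields the feeder reads; real documents come from
# mongo.article.news.
feeder = NewsFeeder()
record = {'_id': 'demo',
          'title': u'example title',
          'original_tags': [u'tag1'],
          'contents': [{'content': u'example body text'}]}
print ' '.join(feeder.feed(record, granularity='fine'))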
class News(object):

    def __init__(self):
        self.mongo = dbcon.connect_mongo()
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()

    def __iter__(self):
        for news in self.mongo.article.news.find(
                {'processStatus': 1}).sort('_id', DESCENDING).limit(200000):
            try:
                content = []
                content.extend(self.wfilter(self.seg.cut4search(news.get('title', ''))))
                for piece in news.get('contents', []):
                    content.extend(self.wfilter(self.seg.cut(piece.get('content', ''))))
                if len(content) > 10:
                    yield content
            except Exception:
                continue
        for c in self.db.query('select description from company '
                               'where verify="Y" and modifyTime>"2016-06-01";'):
            try:
                if len(c.description) > 10:
                    yield self.wfilter(self.seg.cut4search(c.description))
            except:
                continue
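# News is an iterable of token lists, so it can be streamed into any gensim
# model that consumes sentences. Training word vectors is one assumed use;
# the hyperparameters below are illustrative, not the project's settings.
from gensim.models import Word2Vec
w2v = Word2Vec(News(), size=100, window=5, min_count=5, workers=4)
w2v.save('news.w2v')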
def load_data_l1_sources():
    with codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                  'config/sector_name'), encoding='utf-8') as f:
        config = {int(line.split('#')[0].strip()): line.split('#')[1].split(',')
                  for line in f if line.strip()}
    db = dbcon.connect_torndb()
    seg = Segmenter()
    trainx, trainy = [], []
    for sid, names in config.iteritems():
        for name in names:
            ids = db.query('select distinct id from source_company where field=%s;', name)
            for scid in ids:
                content = db.query('select content from source_context '
                                   'where sourceCompanyId=%s and char_length(content)>20 '
                                   'and type=30020 and confidence>0.7 '
                                   'order by confidence desc;', scid.id)
                if len(content) > 0:
                    trainx.append(' '.join(seg.cut(content[0].content.strip())))
                    trainy.append(sid)
    db.close()
    print set(trainy)
    return trainx, np.array(trainy)
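# Sketch of consuming load_data_l1_sources: trainx holds space-joined
# segmented docs, so a standard sklearn bag-of-words pipeline applies.
# The vectorizer and classifier choices here are assumptions for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
trainx, trainy = load_data_l1_sources()
vec = TfidfVectorizer()
clf = LogisticRegression().fit(vec.fit_transform(trainx), trainy)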
def feed_doc(tag=u'金融'):
    mongo = dbcon.connect_mongo()
    segmenter = Segmenter(tag=True)
    wfilter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        yield chain(*[wfilter(segmenter.cut(piece['content'].strip()))
                      for piece in record['contents'] if piece['content'].strip()])
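# Example use of feed_doc: each yield is one article's filtered token stream,
# so document frequencies can be accumulated per article. Illustrative only.
from collections import Counter
from itertools import islice
df = Counter()
for doc in islice(feed_doc(), 100):
    df.update(set(doc))
for word, freq in df.most_common(20):
    print word, freq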
def load_data():
    db = dbcon.connect_torndb()
    seg = Segmenter()
    X, Y = [], []
    for item in db.query('select * from source_context;'):
        X.append(' '.join(list(seg.cut(item.content))).strip())
        Y.append(item.type == 30010)
    db.close()
    return X, Y
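# Quick sanity check for load_data: X are segmented strings, Y booleans
# (type == 30010), which fits a cross-validated linear model. Assumed usage.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score
X, Y = load_data()
scores = cross_val_score(MultinomialNB(), CountVectorizer().fit_transform(X), Y, cv=5)
print scores.mean()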
def load_ruled_news():
    global labels
    seg = Segmenter(tag=True)
    wfilter = word_filter.get_default_filter()
    trainx, trainy = [], []
    mongo = dbcon.connect_mongo()
    for record in mongo.article.news.find({
            '$and': [{'category': {'$ne': None}},
                     {'category': {'$ne': 60199}},
                     {'category': {'$ne': 60106}}],
            'type': 60001,
            'category_confidence': None}).limit(10000):
        contents = wfilter(seg.cut(record['title']))
        contents.extend(wfilter(seg.cut(
            ' '.join([piece['content'] for piece in record['contents']]))))
        if len(contents) > 10:
            trainx.append(' '.join(contents))
            trainy.append(int(labels.get(record['category'])))
    mongo.close()
    return np.array(trainx), np.array(trainy)
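# Checking the label balance of load_ruled_news before training; heavily
# skewed categories would call for resampling. Illustrative sketch.
from collections import Counter
trainx, trainy = load_ruled_news()
for label, count in Counter(trainy).most_common():
    print label, count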
class Companies(object):

    def __init__(self):
        self.data_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                     '../data/tsb/company/ltp_cut')
        self.segmenter = Segmenter()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):
        global description_len_threshold, complete_threshold
        # db = torndb.Connection(**nlpconfig.get_mysql_config_tshbao())
        db = dbcon.connect_torndb()
        index = 0
        for result in iter(dbutil.get_all_company(db)):
            cid, desc = result.get('id'), result.get('context', '')
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            # prefer the pre-segmented cache file when one exists for this company
            if not os.path.exists(os.path.join(self.data_dir, str(cid))):
                words = list(self.segmenter.cut(desc))
            else:
                words = [line.split('\t')[0].strip()
                         for line in codecs.open(os.path.join(self.data_dir, str(cid)),
                                                 encoding='utf-8')
                         if line.strip()]
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue
            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
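# Companies is shaped for gensim: iterating it builds the id<->index mappings
# as a side effect, so build the dictionary first, then read the mappings.
# A minimal sketch of that contract (assumes the module's threshold globals):
from gensim import corpora
companies = Companies()
dictionary = corpora.Dictionary(companies)
id2in = companies.get_mapping_id2in()   # only populated after full iteration
print len(dictionary.token2id), len(id2in)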
class SourceCompany(object):

    def __init__(self, size_limit=None):
        self.db = dbcon.connect_torndb()
        self.seg = Segmenter(tag=True)
        self.wfilter = word_filter.get_default_filter()
        self.size_limit = size_limit

    def __iter__(self):
        if not self.size_limit:
            sql2use = 'select * from source_company where active is null or active="Y";'
        else:
            sql2use = 'select * from source_company where active is null or active="Y" ' \
                      'order by rand() limit %s;' % self.size_limit
        for result in self.db.iter(sql2use):
            content = []
            if result.brief and result.brief.strip():
                content.extend(self.wfilter(self.seg.cut(result.brief)))
            if result.description and result.description.strip():
                content.extend(self.wfilter(self.seg.cut(result.description.strip())))
            if len(content) > 10:
                yield content
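# SourceCompany also yields plain token lists; one assumed use is mining
# collocations with gensim's Phrases (the parameters are illustrative).
from gensim.models.phrases import Phrases
bigrams = Phrases(SourceCompany(size_limit=10000), min_count=10)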
class Companies(object):

    def __init__(self):
        self.segmenter = Segmenter()
        self.feeder = Feeder()
        self.mapping_id2in = {}
        self.mapping_in2id = {}
        self.max_id = 0
        self.default_filter = word_filter.get_default_filter()

    def __iter__(self):
        global description_len_threshold, complete_threshold
        db = dbcon.connect_torndb()
        index = 0
        for cid in iter(dbutil.get_all_company_id(db)):
            contents = self.feeder.feed_string(cid)
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            words = list(self.segmenter.cut(contents))
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue
            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
        db.close()

    def get_mapping_id2in(self):
        return self.mapping_id2in

    def get_mapping_in2id(self):
        return self.mapping_in2id
def load_data_l1():
    db = dbcon.connect_torndb()
    seg = Segmenter()
    # tfidf = TfIdfExtractor()
    trainx, trainy = [], []
    results = db.query(
        'select company_sector.companyId, company_sector.sectorId from company_sector, sector '
        'where company_sector.verify="Y" and sector.id=company_sector.sectorId and sector.level=1 ')
    # 'and sector.id not in (6, 9, 10, 12, 13, 15, 16, 17, 18, 19, 999);')
    for result in results:
        desc = db.get('select description from company where id=%s', result.companyId)
        sid = result.sectorId
        if desc and desc.description.strip():
            trainx.append(' '.join(seg.cut(desc.description.strip())))
            trainy.append(int(sid))
    # trainx, trainy = tfidf.train(trainx, trainy)
    db.close()
    return trainx, np.array(trainy)
class TfIdfExtractor(FeatureExtractor):

    def __init__(self, opt=None):
        if not isinstance(opt, dict):
            opt = {}
        if opt.get('segmenter'):
            self.seg = opt.get('segmenter')
        else:
            self.seg = Segmenter()
        self.vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopword.get_standard_stopwords(),
            max_df=opt.get('max_df', 0.5),
            min_df=opt.get('min_df', 50),
            max_features=5000)
        # chi2 feature selection is set up but currently bypassed (see below)
        self.selector = SelectKBest(chi2, k=opt.get('topk', 'all'))

    def train(self, docs, labels, seged=False):
        trainset = self.vectorizer.fit_transform(self.iter_docs(docs, seged))
        # trainset = self.selector.fit_transform(trainset, labels)
        return trainset, labels

    def transform(self, docs, seged=False):
        return self.vectorizer.transform(self.iter_docs(docs, seged))
        # return self.selector.transform(self.vectorizer.transform(self.iter_docs(docs, seged)))

    def iter_docs(self, docs, seged):
        for doc in docs:
            if not seged:
                yield ' '.join(self.seg.cut(doc))
            else:
                yield doc
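# TfIdfExtractor usage sketch, assuming docs/labels and testdocs are lists of
# raw (unsegmented) documents and their labels; the LinearSVC is illustrative.
from sklearn.svm import LinearSVC
extractor = TfIdfExtractor({'min_df': 5})
trainset, labels = extractor.train(docs, labels)
clf = LinearSVC().fit(trainset, labels)
predictions = clf.predict(extractor.transform(testdocs))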
def dump_thesaurus(theme='source', topn=1000):
    db = dbcon.connect_torndb()
    seg = Segmenter()
    stopwords = stopword.get_standard_stopwords()
    tags = set(x.name for x in db.query('select name from tag where type>11001;'))
    vocab = {}
    # only the 'source' theme is implemented; other themes fall back to the same query
    query = 'select * from source_company where (active is null or active="Y");'
    for index, item in enumerate(db.iter(query)):
        for word in set(filter(lambda x: x not in stopwords and len(x) > 1
                               and not x.isnumeric() and x not in tags and x.strip(),
                               seg.cut(item.description))):
            vocab[word] = vocab.get(word, 0) + 1
        # periodically prune rare words to keep the vocab from exploding
        if index % 10000 == 0:
            low = [x[0] for x in vocab.iteritems() if x[1] < 20]
            for lowword in low:
                vocab.pop(lowword)
            print index, 'processed, size of vocab', len(vocab)
    db.close()
    vocab = sorted(vocab.iteritems(), key=lambda x: x[1], reverse=True)[:topn]
    with codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                  'thesaurus/%s.%s.lowidf' % (theme, topn)),
                     'w', 'utf-8') as fo:
        fo.write('\n'.join([x[0] for x in vocab]))
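# Reading the dumped thesaurus back (the path mirrors the writer above).
import codecs
import os
path = os.path.join(os.path.split(os.path.realpath(__file__))[0],
                    'thesaurus/source.1000.lowidf')
thesaurus = set(line.strip() for line in codecs.open(path, encoding='utf-8'))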
import sys
reload(sys)

import codecs
import torndb

from common.zhtools.segment import Segmenter
from common.classifier.field import FieldClassifier

if __name__ == '__main__':
    sql = 'select dealId,dealname,dealdesc from deal where joinDemoDay=2;'
    db = torndb.Connection('localhost:3306', 'demoday', 'root', '')
    clf = FieldClassifier(model='lr')
    seg = Segmenter()
    # clf.train()
    fo = codecs.open('tmp', 'w', 'utf-8')
    for rid, result in enumerate(db.query(sql)):
        did, doc = result.dealId, result.dealdesc
        try:
            label = clf.naive_classify(seg.cut(doc))
            if label:
                print did, label
                fo.write('%s#%s\n' % (did, label[0]))
        except Exception, e:
            print did, 'fail'
            print e
        # if rid > 40:
        #     break
    fo.close()
    db.close()
                                 (record['_id'], str(cids)))
                life_circle_linker -= 1
            except Exception, e:
                logger_news_pip.exception('mentioned failed, %s, %s' % (record['_id'], e))
                mongo.article.news.update({'_id': record['_id']},
                                          {'$set': {'processStatus': -2}})
                continue

            # process category
            try:
                if (record.get('category', None) is None) and (record.get('type', 0) == 60001):
                    contents = wfilter(seg.cut(record['title']))
                    contents.extend(wfilter(seg.cut(
                        ' '.join([piece['content'] for piece in record['contents']]))))
                    contents = np.array(list(vocab_processor.fit_transform(
                        np.array([' '.join(contents)]))))
                    label = clf.predict(contents)[0]
                    prob = clf.predict_proba(contents)[0][label]
                    category = labels_reverse.get(label)
                    # fall back to the catch-all category on low confidence
                    if prob < 0.7:
                        category = 60199
class DocumentsSimilarity(object):
    """
    tfidf model based document similarity
    """

    def __init__(self):
        self.life_period = 1000
        self.num_candidates = 800
        self.min_similarity_threshold = 0.05
        self.establish_discount = 0.75
        self.dictionary = self.get_dict()
        self.id2in, self.in2id, self.corpus, self.max_id = self.get_corpus(self.dictionary)
        self.model, self.simi = self.train_model()
        self.segmenter = Segmenter()
        self.filter = Filter()
        self.feeder = Feeder()
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()

    def train_model(self):
        global cach_dir
        if not os.path.exists(cach_dir):
            os.mkdir(cach_dir)
        tfidf = models.TfidfModel(self.corpus)
        index = similarities.MatrixSimilarity(tfidf[self.corpus],
                                              num_best=self.num_candidates)
        return tfidf, index

    @classmethod
    def get_corpus(cls, dictionary):
        global logger_nlp, cach_dir
        companies = CompaniesVector(dictionary)
        fname = os.path.join(cach_dir, '%s.%s.corpus' %
                             (datetime.datetime.now().strftime('%Y%m%d'), randint(0, 3600)))
        corpora.MmCorpus.serialize(fname, companies)
        logger_nlp.info('Corpus serialized')
        return (companies.get_mapping_id2in(), companies.get_mapping_in2id(),
                corpora.MmCorpus(fname), companies.max_id)

    @classmethod
    def get_dict(cls):
        global stopwords, df_threshold_lower, df_threshold_upper, logger_nlp, cach_dir
        dates = datetime.datetime.now().strftime('%Y%m%d')
        # compute the cache file name once so the exists/load/save paths agree
        fname = os.path.join(cach_dir, '%s.%s.dict' % (dates, randint(0, 3600)))
        if os.path.exists(fname):
            try:
                dictionary = corpora.Dictionary.load(fname)
                logger_nlp.info('Found dictionary file, loaded')
                return dictionary
            except:
                logger_nlp.error('Found dictionary file, fail to load, try to rebuild')
        companies = Companies()
        dictionary = corpora.Dictionary(company for company in companies)
        stop_ids = [dictionary.token2id[word] for word in stopwords
                    if word in dictionary.token2id]
        low_df = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
                  if docfreq <= df_threshold_lower]
        high_df = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
                   if docfreq > df_threshold_upper]
        dictionary.filter_tokens(stop_ids + low_df + high_df)
        dictionary.compactify()
        dictionary.save(fname)
        logger_nlp.info('Dictionary constructed, size %s' % len(dictionary.token2id))
        return dictionary

    def get_similar(self, cid):
        global simi_threshold, complete_threshold
        # pooling
        if cid in self.id2in:
            vec = self.model[self.corpus[self.id2in[cid]]]
            simis = sorted(self.simi[vec], key=lambda x: -x[1])[1:self.num_candidates]
            simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        else:
            simis = self.get_similar4new(cid)
        # discount companies founded more than five years apart
        establish = dbutil.get_company_establish_date(self.db, cid).year
        simis = [(cid2, weight * self.__discount_year(establish, cid2))
                 for (cid2, weight) in simis]
        # sort and filter
        simis = sorted(simis, key=lambda x: -x[1])
        simis = filter(lambda x: dbutil.get_company_score(self.db, x[0]) > complete_threshold
                       and x[1] > self.min_similarity_threshold, simis)
        # dump and exit
        self.mongo.comps.candidates.update(
            {'company': cid},
            {'$set': {'candidates': simis, 'modifyTime': datetime.datetime.now()}},
            True)
        return simis

    def get_similar4new(self, cid):
        global logger_nlp
        # reload the model once life_period (1000) new companies have been processed
        if int(cid) > self.max_id:
            self.life_period -= 1
            if self.life_period == 0:
                logger_nlp.info('Reload recommend program')
                self.__init__()
        content = self.feeder.feed_string(cid)
        words = self.filter.filtermany(self.segmenter.cut(content))
        vec = self.model[self.dictionary.doc2bow(words, allow_update=True)]
        simis = sorted(self.simi[vec], key=lambda x: -x[1])[1:self.num_candidates]
        simis = map(lambda x: (self.in2id[x[0]], round(x[1], 2)), simis)
        return simis

    def __discount_year(self, establish, cid2):
        diff = abs(dbutil.get_company_establish_date(self.db, cid2).year - establish)
        return self.establish_discount if diff > 5 else 1

    def dump_full(self):
        global logger_nlp
        db = dbcon.connect_torndb()
        for cid in iter(dbutil.get_all_company_id(db)):
            try:
                self.get_similar(cid)
                logger_nlp.info('%s processed' % cid)
            except Exception, e:
                logger_nlp.exception('%s failed, %s' % (cid, e))
        db.close()
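# Assumed entry points for DocumentsSimilarity: a one-off lookup for a single
# company id (hypothetical id below), or a full refresh over all companies.
ds = DocumentsSimilarity()
print ds.get_similar(12345)
ds.dump_full()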
             ','.join(map(lambda x: str(x), labels.values())))
    fo.write('@DATA \n')
    for i in xrange(len(y)):
        fo.write('%s,%s\n' % (','.join([str(item) for item in x[i]]),
                              labels.get(y[i])))


def weighted_choice(choices):
    # sample one item from (item, weight) pairs, proportionally to weight
    total = sum(w for c, w in choices)
    r = random.uniform(0, total)
    upto = 0
    for c, w in choices:
        if upto + w > r:
            return c
        upto += w


if __name__ == '__main__':
    print __file__
    # upsample('template/fields.data')
    # scatter_sample('weka/field.train.arff')
    fc = FieldClassifier()
    s = Segmenter()
    c = u'通过贴图让用户简单地画漫画,并用漫画沟通、社交。网站上线1年,ipad端7月3日上线。IPAD版上线一周积累20万用户,第一周有11.000多幅漫画上传。'
    print fc.naive_classify(s.cut(c))
    # fc.build_labeled_corpus()
    # fc.train('template/fields.1.data')
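# weighted_choice demo: items are drawn proportionally to their weights, so
# over many draws 'b' below should appear roughly three times as often as 'a'.
from collections import Counter
counts = Counter(weighted_choice([('a', 1), ('b', 3)]) for _ in xrange(10000))
print counts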
class Feeder(object):

    def __init__(self):
        self.db = dbcon.connect_torndb()
        self.mongo = dbcon.connect_mongo()
        self.non_trusted_discount = 0.5
        self.brief_promote = 1.5
        self.trusted_sources = dicts.get_known_company_source()
        self.wfilter = word_filter.get_default_filter()
        self.seg = Segmenter(tag=True)

    def feed(self, cid, mode='default', quality='low'):
        feeds = {'default': self.__feed_default,
                 'with_tag': self.__feed_with_tag}.get(mode, self.__feed_default)(cid)
        feeds = list(feeds)
        if quality == 'medium':
            ave = min(mean([feed[1] for feed in feeds]), 2)
            return filter(lambda x: x[1] >= ave, feeds)
        if quality == 'low':
            return feeds

    def feed_string(self, cid, mode='default'):
        feeds = list(self.feed(cid, mode, 'medium'))
        return ' '.join([feed[0].strip() for feed in feeds])

    def feed_seged(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut(self.feed_string(cid, feed_mode)))

    def feed_seged_fine(self, cid, feed_mode='default'):
        return self.wfilter(self.seg.cut4search(self.feed_string(cid, feed_mode)))

    def feed_relevant_string(self, cid):
        pass

    def __feed_with_tag(self, cid):
        for feed in self.__feed_default(cid):
            yield feed
        for source_tag in dbutil.get_source_company_tags(self.db, cid, self.trusted_sources):
            if source_tag and source_tag.strip():
                yield source_tag, 2

    def __feed_default(self, cid):
        cscore = dbutil.get_company_score(self.db, cid, 37010)
        # company info
        info = dbutil.get_company_info(self.db, cid)
        score = 1.5 if cscore > 0.5 else 1
        if info.verify and info.verify == 'Y':
            score += 1
        if info.brief and info.brief.strip():
            yield self.__preprocess(info.brief.strip()), score
        if info.description and info.description.strip():
            yield self.__preprocess(info.description.strip()), score
        # source company
        for info in dbutil.get_source_company_infos(self.db, cid):
            discount = self.non_trusted_discount if info.source not in self.trusted_sources else 1
            if info.brief and info.brief.strip():
                yield self.__preprocess(info.brief.strip()), discount * self.brief_promote
            if info.description and info.description.strip():
                yield self.__preprocess(info.description.strip()), discount
        # iOS
        info = dbutil.get_recommend_artifact(self.db, cid)
        if info and info.description and info.description.strip():
            ascore = 1 if (info.verify and info.verify == 'Y') else 0.5
            yield self.__preprocess(info.description.strip()), ascore

    def __preprocess(self, content):
        # clean and narrow down candidates
        # convert traditional Chinese to simplified
        content = hants.translate(unicode(content))
        # lowercase
        content = content.lower()
        return content.strip()
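# Feeder usage sketch: feed() yields (text, weight) pairs, feed_string joins
# the medium-quality texts, and feed_seged returns filtered tokens. The
# company id below is hypothetical.
feeder = Feeder()
for text, weight in feeder.feed(12345, mode='with_tag'):
    print weight, text[:40]
print ' '.join(feeder.feed_seged(12345))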
class PositionClassifier(object):

    def __init__(self):
        self.segmenter = Segmenter(cut_all=True)
        # first-level position categories: tech, product, design,
        # operations, marketing, staff functions
        self.first_mapping = {1: u'技术', 2: u'产品', 3: u'设计',
                              4: u'运营', 5: u'市场', 6: u'职能'}
        self.first_positions = dict.fromkeys(self.first_mapping.keys())
        self.train_first_positions()

    def train_first_positions(self):
        self.first_positions[1] = set([
            u'工程师', u'技术', u'java', u'python', u'php', u'c++', u'c',
            u'android', u'ios', u'测试', u'web', u'前端', u'数据库', u'ruby',
            u'perl', u'node.js', u'c#', u'go', u'html5', u'flash',
            u'javascript', u'u3d', u'运维', u'网络', u'安全', u'数据仓库',
            u'dba', u'mysql', u'oracle', u'sqlserver', u'sql', u'硬件',
            u'嵌入式', u'驱动', u'材料', u'开发'])
        self.first_positions[2] = set([u'产品', u'产品经理', u'策划'])
        self.first_positions[3] = set([u'设计', u'设计师', u'游戏', u'ui', u'ue'])
        self.first_positions[4] = set([
            u'运营', u'coo', u'编辑', u'主编', u'文案', u'售前', u'售后', u'客服'])
        self.first_positions[5] = set([
            u'市场', u'销售', u'seo', u'sem', u'商务', u'客户', u'bd', u'公关',
            u'采购', u'物流', u'仓储', u'广告', u'媒介', u'招商', u'推广'])
        self.first_positions[6] = set([
            u'人事', u'hr', u'行政', u'培训', u'绩效', u'前台', u'总助', u'秘书',
            u'文秘', u'财务', u'会计', u'出纳', u'税务', u'审计', u'hrm', u'hrd',
            u'法务', u'律师', u'专利', u'招聘'])

    def get_first_positions(self):
        return self.first_positions.keys()

    def classify_first(self, position):
        # score each category by keyword overlap with the segmented title
        position = set(map(lambda x: x.lower(), self.segmenter.cut(position)))
        return sorted([(k, len(position & v)) for k, v in self.first_positions.items()],
                      key=lambda x: -x[1])[0][0]

    def get_first_name(self, key):
        return self.first_mapping.get(key)
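# PositionClassifier usage: classify_first picks the category whose keyword
# set overlaps the segmented title the most. Sample titles are illustrative.
pc = PositionClassifier()
for title in [u'java工程师', u'产品经理', u'市场推广']:
    print title, pc.get_first_name(pc.classify_first(title))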