def dump_data():
    """Dump manually-labelled news into fastText-style train/test corpora."""
    global logger_nf
    mongo = dbcon.connect_mongo()
    db = dbcon.connect_torndb()
    nf = NewsFeeder()
    # relevant-tag-id -> canonical-sector-tag-id, parsed from the mapping file
    mapping = {}
    for line in codecs.open('files/sector.20171221.mapping', encoding='utf-8'):
        pieces = line.strip().split('#')
        canonical = dbutil.get_tag_id(db, pieces[0])[0]
        for relevant in pieces[1].split(','):
            mapping[dbutil.get_tag_id(db, relevant)[0]] = canonical
    sectors = {s.id: s.tagId for s in db.query('select * from sector where tagId is not null;')}
    ftrain = codecs.open('tmp/20171221.fine.train', 'w', 'utf-8')
    ftest = codecs.open('tmp/20171221.fine.test', 'w', 'utf-8')
    count = 0
    criteria = {'processStatus': 1, 'modifyUser': {'$ne': None}, 'sectors': {'$ne': None}}
    for news in mongo.article.news.find(criteria):
        # candidate labels from sectors and features, keeping mapped ones only
        candidates = [sectors.get(s, -1) for s in news.get('sectors', []) if s != 999]
        candidates = [t for t in candidates if t in mapping.keys()]
        candidates.extend([t for t in news.get('features', []) if t in mapping.keys()])
        labels = set([mapping.get(t) for t in candidates])
        if not labels:
            continue
        labels = ['__label__%s' % t for t in labels]
        # skip overly ambiguous samples
        if len(labels) > 3:
            continue
        labels = ' '.join(labels)
        if not labels:
            continue
        contents = ' '.join(nf.feed(news, granularity='fine')).replace('\n', ' ')
        if not contents:
            continue
        # skip samples that are too short to be informative
        if len(contents) < 50:
            continue
        count += 1
        if count % 10000 == 0:
            logger_nf.info('Dumping file, %s done' % count)
        # roughly one fifth of the samples go to the test split
        if randint(1, 5) == 1:
            ftest.write('%s %s\n' % (labels, contents))
        else:
            ftrain.write('%s %s\n' % (labels, contents))
    ftrain.close()
    ftest.close()
    logger_nf.info('All news dumped')
def comps():
    """Serve a comparable-companies query, optionally filtered by a tag."""
    global logger_comps
    query = json.loads(request.data).get('payload')
    logger_comps.info('Comps Query, %s' % query)
    cid = query.get('company')
    tag = query.get('tag', 0)
    start = query.get('start', 0)
    size = query.get('size', 5)
    if tag == 0:
        # unfiltered comps, plus prompt tag filters for the client
        candidates = dbutil.get_company_comps(g.db, cid)
        logger_comps.info(candidates)
        page = map(lambda x: {'id': dbutil.get_company_code(g.db, x)}, candidates)[start:start + size]
        results = {'company': {'count': len(candidates),
                               'data': page,
                               'tags': dbutil.prompt_tag_filter(g.db, candidates)}}
    else:
        tag = dbutil.get_tag_id(g.db, tag)[0]
        candidates = dbutil.get_filtered_company_comps(g.db, cid, tag)
        page = map(lambda x: {'id': dbutil.get_company_code(g.db, x)}, candidates)[start:start + size]
        results = {'company': {'count': len(candidates),
                               'data': page}}
    return make_response(jsonify(results))
def extract_without_update(self, cid, topn=15):
    """Compute a company's topn active tag ids without persisting regular tags."""
    contents = list(self.feeder.feed(cid))
    # cap the amount of content fed into extraction, keeping highest-weighted pieces
    if len(contents) > self.max_contents_length:
        contents = sorted(contents, key=lambda x: -x[1])[:self.max_contents_length]
    weights = {}
    # important tags carry priority weight 1
    weights = self.merge(weights, self.__extract_important(contents), 1)
    # regular textrank-extracted tags
    weights = self.merge(weights, dict(self.__extract_textrank(contents, topn)))
    # verified tags always count with weight 1
    weights = self.merge(weights, dict.fromkeys(dbutil.get_company_tags_verified(self.db, cid), 1))
    # normalize, then apply replacement-based normalization
    weights = self.__normalize(weights)
    weights = self.__normalize_replacement(weights)
    new_tags = self.update_vip_tags(cid, weights, [])
    for tag, _ in sorted(weights.items(), key=lambda x: -x[1])[:topn]:
        tid, active = dbutil.get_tag_id(self.db, tag)
        # vip tags were already handled by update_vip_tags
        if tid in self.vip_tags.values():
            continue
        if active:
            new_tags.append(tid)
    return new_tags
def load_blockchain():
    """Rebuild the blockchain tag tree (root tag 175747) from files/blockchain."""
    db = dbcon.connect_torndb()
    db.execute('delete from tags_rel where id=447283;')
    # remember current level-3 tags under the root before wiping its relations
    t3s = [t.tag2Id for t in
           db.query('select tag2Id from tags_rel where tagId=175747 and type=54041;')]
    db.execute('delete from tags_rel where tagId=175747;')
    db.execute('update tag set type=11010, sectorType=null where id in %s;', t3s)
    for line in codecs.open('files/blockchain', encoding='utf-8'):
        names = line.strip().split()
        if len(names) == 1:
            # a single name is a level-2 tag directly under the root
            t2 = dbutil.get_tag_id(db, names[0])[0]
            db.execute('update tag set type=11013, sectorType=2 where name=%s;', names[0])
            dbutil.update_tags_rel(db, 175747, t2, 1, 54041)
        elif len(names) == 2:
            # a pair is "level-2 level-3"
            t2 = dbutil.get_tag_id(db, names[0])[0]
            t3 = dbutil.get_tag_id(db, names[1])[0]
            db.execute('update tag set type=11013, sectorType=3 where name=%s;', names[1])
            dbutil.update_tags_rel(db, t2, t3, 1, 54041)
    db.execute('update tag set sectorType=1, type=11012 where id=175747;')
    db.execute('insert into sector (sectorName, active, level, tagid, createtime) '
               'values ("区块链", "Y", 1, 175747, now());')
def update_vip_tags(self, cid, support_tags, source_tags):
    """Select and persist vip tags for a company, returning the chosen tag ids.

    support_tags: dict of tag name -> weight (content-derived candidates).
    source_tags: iterable of tag names coming from an external source;
        may be empty (extract_without_update passes []).
    Returns the list of vip tag ids written (before replacement expansion).
    """
    vips = {}
    support_tag_ids = set(dbutil.get_tag_id(self.db, tag)[0] for tag in support_tags)
    # vip tag id -> weight, for support tags that map to a vip tag
    support_vips = {self.vip_tags.get(tag): weight for tag, weight in support_tags.items()
                    if tag in self.vip_tags.keys()}
    # boost each vip by how many of its hyponyms appear among the support tags
    for support_vip, support_weight in support_vips.iteritems():
        hyponyms = dbutil.get_hyponym_tags(self.db, support_vip)
        support_vips[support_vip] = support_weight + len(set(hyponyms) & support_tag_ids)
    source_vips = [self.vip_tags.get(tag) for tag in source_tags if tag in self.vip_tags.keys()]
    # desc = ' '.join(self.wfilter([x[0] for x in self.tagger.tag(self.feeder.feed_string(cid, 'with_tag'))]))
    # build a text description of the company for the classifiers
    desc = ' '.join(self.wfilter(self.seg.cut4search(self.feeder.feed_string(cid, 'with_tag'))))
    if not desc:
        desc = u'其他'
    # vip candidates predicted from text; keep predictions above vip_lower
    classifier_vips = {int(tag.replace(u'__label__', '')): weight
                       for (tag, weight) in self.vip_classifier.predict_proba([desc], 3)[0]
                       if weight > self.vip_lower}
    # binary "traditional industry" prediction: (label, probability)
    traditional = self.traditional_classifier.predict_proba([desc], 1)[0][0]
    if source_vips:
        # source-backed vips that the classifier also supports, ranked by combined score;
        # weight decays by 0.1 per rank starting from 2.999
        for rank, vip in enumerate(sorted([t for t in source_vips if t in classifier_vips],
                                          key=lambda x: support_vips.get(x, 0) + classifier_vips.get(x, 0),
                                          reverse=True)):
            vips[vip] = 2.999 - round(rank / 10.0, 1)
    if traditional[0].replace('__label__', '') == '1' and traditional[1] > 0.6:
        # confidently traditional: assign the fixed "traditional" vip tag 604330
        vips[604330] = round(2 + traditional[1], 2)
    elif len(vips) > 1:
        # already have enough source-backed vips; nothing more to add
        pass
    else:
        # fall back to support/classifier candidates, best combined score first
        vip_candidates = sorted((set(support_vips.keys()) | set(classifier_vips.keys())),
                                key=lambda x: -support_vips.get(x, 0.1) * 0.1 * classifier_vips.get(x, 0.01))
        if len(vip_candidates) == 0:
            pass
        elif len(vip_candidates) == 1:
            vip = vip_candidates[0]
            vips[vip] = max(2.9, round(2 + support_vips.get(vip, 0.01) * classifier_vips.get(vip, 0.01), 2))
        else:
            for rank, vip in enumerate(vip_candidates):
                # print rank, vip, support_vips.get(vip, 0.01), classifier_vips.get(vip, 0.01)
                # discount lower-ranked candidates before thresholding
                rank_discount = {0: 1, 1: 0.3}.get(rank, 0.2)
                if support_vips.get(vip, 0.01) * rank_discount + classifier_vips.get(vip, 0.01) \
                        > self.vip_threshold:
                    vips[vip] = round(max(2.9 - rank * 0.01,
                                          2 + support_vips.get(vip, 0.01) * classifier_vips.get(vip, 0.01)), 2)
    # persist: replaced vip tags are written as their replacement ids instead
    for tid, weight in vips.items():
        if self.replacements.get(tid):
            for rtid in self.replacements.get(tid, []):
                dbutil.update_company_tag(self.db, cid, rtid, weight)
        else:
            dbutil.update_company_tag(self.db, cid, tid, weight)
            # print dbutil.get_tag_info(self.db, tid, 'name'), tid, weight
    return vips.keys()
def __deduct_2nd(self, tags):
    """Deduce level-2 tags implied by present level-3 tags sharing a level-1 ancestor."""
    deduced = []
    pairs = [(dbutil.get_tag_id(self.db, name)[0], name) for name in tags.keys()]
    known_ids = set(pair[0] for pair in pairs)
    for tid, name in pairs:
        # only level-3 tags (type 11013) can imply a missing level-2 tag
        if self.tag_types.get(name, 0) != 11013:
            continue
        for t1 in set(dbutil.get_hypernym_tags(self.db, tid, 1)) & known_ids:
            # level-2 tags that sit both under t1 and above tid
            bridge = set(dbutil.get_hyponym_tags(self.db, t1, 2)) & \
                set(dbutil.get_hypernym_tags(self.db, tid, 2))
            for t2 in bridge:
                if t2 not in known_ids:
                    deduced.append(t2)
    # deduced tags get a fixed weight of 2.49
    return {dbutil.get_tag_name(self.db, t2): 2.49 for t2 in deduced}
def score():
    """Dump a tab-separated scoring sheet for a few sample tags to dumps/rank.

    For each tag, up to 300 candidate companies are collected (top complete
    scores, most yellow tags, most recent messages), then four sub-scores are
    written per company.
    """
    db = dbcon.connect_torndb()
    with codecs.open('dumps/rank', 'w', 'utf-8') as fo:
        for tag in [u'大数据', u'小程序', u'短视频', u'民宿', u'足球', u'咖啡']:
            cids = []
            tid = dbutil.get_tag_id(db, tag)[0]
            # top 100 companies under this tag by completeness score (type 37010)
            complete = db.query(
                'select rel.companyId cid from company_tag_rel rel, company_scores s '
                'where (rel.active="Y" or rel.active is null) and rel.companyId=s.companyId '
                'and s.type=37010 and tagId=%s order by score desc limit 100;', tid)
            cids.extend([c.cid for c in complete])
            # top 100 companies by number of active yellow (type 11100) tags
            yellows = db.query(
                'select companyId cid, count(*) c from company_tag_rel rel, tag '
                'where tag.id=tagId and tag.type=11100 and (tag.active is null or tag.active="Y") '
                'and (rel.active="Y" or rel.active is null) and companyId in '
                '(select distinct companyId from company_tag_rel where tagId=%s '
                'and (active is null or active="Y")) group by companyId order by c desc limit 100;',
                tid)
            cids.extend([c.cid for c in yellows])
            # top 100 companies by number of recent messages (after 2018-02-01)
            msgs = db.query(
                'select msg.companyId cid, count(*) c from company_message msg, company_tag_rel rel '
                'where msg.active="Y" and msg.companyId=rel.companyId and msg.publishTime>"2018-02-01" '
                'and rel.tagId=%s and (rel.active="Y" or rel.active is null) group by msg.companyId '
                'order by c desc limit 100;', tid)
            cids.extend([c.cid for c in msgs])
            cids = set(cids)
            for cid in cids:
                name = dbutil.get_company_name(db, cid)
                brief = dbutil.get_company_brief(db, cid)
                url = 'http://www.xiniudata.com/#/company/%s/overview' % dbutil.get_company_code(
                    db, cid)
                # s1: completeness score, capped at 1 above 0.5
                s1 = dbutil.get_company_score(db, cid, 37010)
                s1 = 1 if s1 >= 0.5 else s1
                # s2: yellow-tag count (time-deduction adjusted), scaled by 9
                s2 = (len(dbutil.get_company_tags_yellow(db, cid, False)) + 1
                      - dbutil.get_company_yellow_time_deduction(db, cid)) / 9
                # s3: log-scaled count of recent messages
                s3 = (log10(
                    len(dbutil.get_company_messages(db, cid, 'Y', '2018-02-01')) + 1)) / 4
                # s4: tag-relation confidence
                # NOTE(review): db.get returns the row; presumably a row always
                # exists here since cid came from this tag's relations — confirm
                s4 = db.get(
                    'select confidence from company_tag_rel where companyId=%s and tagId=%s;',
                    cid, tid).confidence
                fo.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
                         % (tag, name, brief, url, s1, round(s2, 2), round(
                             s3, 2), round(s4, 2)))
def update_contents_tags(self, cid, tags, source_tags, vips, topn):
    """Normalize contents-based tags and update mysql.

    tags: dict of tag name -> weight; the topn heaviest are persisted.
    source_tags: tag names that get a small weight boost (+0.009).
    vips/topn: vip tag handling context and persistence cut-off.
    Returns None; side effect is company_tag rows written via dbutil.
    """
    old_tags = dbutil.get_company_tags_old(self.db, cid)  # kept for parity; not used below
    new_tags = []
    for tag, weight in sorted(tags.items(), key=lambda x: -x[1])[:topn]:
        tid, active = dbutil.get_tag_id(self.db, tag)
        if tag in source_tags:
            weight += 0.009
        # vip tags are handled elsewhere
        if tid in self.vip_tags.values():
            continue
        if active:
            new_tags.append(tid)
        if self.replacements.get(tid):
            # FIX: only remove tid if it was actually appended (active); the
            # unconditional remove raised ValueError for inactive replaced tags
            if tid in new_tags:
                new_tags.remove(tid)
            for rtid in self.replacements.get(tid, []):
                if rtid in self.vip_tags.values():
                    continue
                dbutil.update_company_tag(self.db, cid, rtid, weight, active=active)
                new_tags.append(rtid)
        else:
            dbutil.update_company_tag(self.db, cid, tid, weight, active=active)
    # add classified tags
    try:
        content = list(self.feeder.feed_seged(cid))
        # FIX: previously the classifiers ran even when the keyword was absent,
        # on the raw token list instead of the averaged embedding, which always
        # failed and was swallowed by the except below. Only classify when the
        # blockchain keyword is present.
        if u'区块链' in content:
            embedded = [np.mean([self.w2v[w] for w in content if w in self.w2v], axis=0)]
            for tid, clf in self.trained_tag_clfs.items():
                if clf.predict(embedded)[0] == 1:
                    dbutil.update_company_tag(self.db, cid, tid, 2.806, verify='N', active='Y')
                    new_tags.append(tid)
    except Exception as e:
        logger_tag.exception('Fail to classify, due to %s', e)
def __search_ranklist(self, **kwargs):
    """Run a ranklist search for companies under a tag filter; returns result dict."""
    params = dict(kwargs)
    start = params.get('start', 0)
    size = min(params.get('size', 10), self.max_result_size)
    sort = params.get('sort', 76001)
    order = params.get('order', 'default')
    tag = params.get('filter', {}).get('tag')
    # a tag filter is mandatory for ranklist searches
    if not tag:
        return {"company": {"count": 0, "data": [], 'sectors': []}}
    tag = tag[0]
    tid = dbutil.get_tag_id(self.db, tag)[0]
    uq = UniversalQuery(params.get('input'), params.get('filter'))
    es_query = uq.generate_query()
    logger_universal.info('ES %s, topic %s' % (es_query, tag))
    body = {"query": es_query,
            "sort": self.__generate_sort_search(sort, order, tid),
            "from": start,
            "size": size}
    hits = self.es.search(index='xiniudata2', doc_type='universal', body=body)
    count = hits['hits'].get('total', 0)
    hits = self.__organize(hits)
    self.logger.info('Result ready')
    sector_filters = self.__get_sector_filter(tag, 'tag')
    return {"company": {"count": count, "data": hits, 'sectors': sector_filters}}