def __find_industry(self, codes):
    """Return the active industries attached to the given company codes.

    Each code is resolved to a company id, the industry relations of those
    companies are fetched in one call, and every related industry is looked
    up individually.

    :param codes: iterable of company codes.
    :return: list of ``{"id", "code", "name"}`` dicts, one per industry whose
        ``active`` attribute is ``None`` or ``'Y'``.
    """
    company_ids = []
    for code in codes:
        company_ids.append(dbutil.get_id_from_code(self.db, code))

    # NOTE(review): the meaning of the positional ``True`` flag is defined by
    # dbutil.get_company_industries - confirm against that helper.
    relations = dbutil.get_company_industries(self.db, company_ids, True)
    industry_ids = []
    for relation in relations:
        industry_ids.append(relation.industryId)

    industries = []
    for industry_id in industry_ids:
        industries.append(dbutil.get_industry_info(self.db, industry_id))

    result = []
    for industry in industries:
        # Anything explicitly flagged with a value other than 'Y' is dropped.
        if industry.active is not None and industry.active != 'Y':
            continue
        result.append({
            "id": industry.id,
            "code": industry.code,
            "name": industry.name,
        })
    return result
def fit_company(self, client):
    """Link companies to this industry from its tag and from its search rule.

    Three independent steps, each gated by an attribute of ``self``:

    * ``self.tag``: every company returned by
      ``dbutil.get_company_from_tag`` is linked with ``source='tag'``.
    * ``self.company_rules``: the rule is passed to ``client.search`` and
      every returned code is linked with ``source='rule'``, walking the
      result list back to front.
    * ``self.auto_expand``: ``self.expand()`` is called.

    :param client: object exposing ``search('topic', query=...)`` whose
        result is read as ``result['company']['data']``.
    """
    if self.tag:
        tagged = dbutil.get_company_from_tag(self.db, self.tag, with_verify=True)
        for company_id, _verify, modifier in tagged:
            # Verification-driven activation is switched off: every
            # tag-derived link is currently stored with active='P'.
            status = 'P'
            if status != 'Y':
                # NOTE(review): 139 looks like a fixed fallback modifier id
                # used for non-'Y' links - confirm its meaning.
                modifier = 139
            dbutil.update_industry_company(
                self.db, self.idid, company_id,
                active=status, source='tag', modify=modifier)

    if self.company_rules:
        response = client.search('topic', query=self.company_rules)
        matched_codes = response.get('company', {}).get('data', [])
        if matched_codes:
            matched_codes.reverse()
        for code in matched_codes:
            company_id = dbutil.get_id_from_code(self.db, code)
            dbutil.update_industry_company(
                self.db, self.idid, company_id, source='rule')

    if self.auto_expand:
        self.expand()
def __process_visit_web(self, message):
    """Record a ``'visit_local'`` task for a qualifying web-visit message.

    The task is recorded only when all of the following hold, checked in
    this order:

    1. ``self.need_verify`` accepts the company resolved from
       ``message['id']``;
    2. ``message['detail']`` is present (it is parsed as an int, with ``-1``
       standing for "missing");
    3. that id is not listed in ``self.xiniu_users``;
    4. ``dbutil.exist_verified_investor`` confirms the id.

    :param message: dict-like event carrying ``'id'`` (company code) and
        ``'detail'`` (visitor id).
    """
    company_id = dbutil.get_id_from_code(self.db, message.get('id'))
    visitor_id = int(message.get('detail', -1))

    if not self.need_verify(company_id):
        return
    if visitor_id == -1:
        return
    if visitor_id in self.xiniu_users:
        return
    if not dbutil.exist_verified_investor(self.db, visitor_id):
        return

    # The raw (unparsed) detail value is what gets stored with the task.
    self.update_task(company_id, 'visit_local', message.get('detail'))
def extract_pack():
    """Run the key extractor over every company code in ``files/todo.pack``.

    The file is read line by line; each non-blank line is treated as a
    company code, resolved to a company id and handed to
    ``Extractor.extract``. The code is printed once it has been processed.

    The pack file and the database connection are both released when the
    function exits, including when an extraction raises.
    """
    from key import Extractor

    extractor = Extractor()
    db = dbcon.connect_torndb()
    try:
        with open('files/todo.pack') as pack:
            for line in pack:
                code = line.strip()
                if not code:
                    # A blank line cannot identify a company; skip it rather
                    # than looking up an empty code.
                    continue
                extractor.extract(dbutil.get_id_from_code(db, code))
                # Parenthesised single-argument form prints identically under
                # the Python 2 print statement.
                print(code)
    finally:
        db.close()
def dump():
    """Write labelled train/test text files for the "traditional" classifier.

    Output lines have the form ``__label__<0|1> <space-joined tokens>`` and go
    to ``tmp/traditional.basic.train`` or ``tmp/traditional.basic.test``.
    Each sample is routed independently: ``randint(0, 4)`` non-zero sends it
    to the train file, zero sends it to the test file.

    * Positive samples (label 1): companies carrying tag 604330 with
      ``verify='Y'`` that are active, plus every code listed in
      ``files/traditional``.
    * Negative samples (label 0): up to 15000 random companies modified after
      2018-03-01 with a non-null ``modifyuser``, minus anything already used
      as a positive.

    All files and the database connection are released even if feeding or
    writing fails part-way through.
    """
    mode = 'basic'
    db = dbcon.connect_torndb()
    try:
        feeder = Feeder()
        train_path = 'tmp/traditional.%s.train' % mode
        test_path = 'tmp/traditional.%s.test' % mode
        # 'utf-8' replaces the original 'utf=8' typo, which only worked
        # because codec-name normalisation is lenient about punctuation.
        with codecs.open(train_path, 'w', 'utf-8') as ftrain, \
                codecs.open(test_path, 'w', 'utf-8') as ftest:

            def write_sample(label, cid):
                # Feed first, then draw the random number, so the call order
                # matches one sample == one randint() draw.
                content = ' '.join(feeder.feed_seged_fine(cid))
                target = ftrain if randint(0, 4) else ftest
                target.write('__label__%d %s\n' % (label, content))

            positive_ids = [
                row.companyId for row in db.query(
                    "select distinct companyId from company_tag_rel where tagId=604330 "
                    "and verify='Y' and (active is null or active='Y');")
            ]
            with codecs.open('files/traditional') as seed_codes:
                positive_ids.extend(
                    dbutil.get_id_from_code(db, code.strip())
                    for code in seed_codes)
            for cid in positive_ids:
                write_sample(1, cid)
            print('Positive Done')

            candidate_ids = [
                row.id for row in db.query(
                    "select id from company where modifytime>'2018-03-01' "
                    "and modifyuser is not null order by rand() limit 15000;")
            ]
            # Set membership keeps the exclusion test O(1) per candidate.
            known_positive = set(positive_ids)
            for cid in candidate_ids:
                if cid in known_positive:
                    continue
                write_sample(0, cid)
            print('Negetive Done')
    finally:
        db.close()
def infer_rules(self):
    """Apply every ruled tag's search rule and tag the matching companies.

    For each tag returned by ``dbutil.get_ruled_tags``:

    1. the rule text is normalised (full-width comma and parentheses are
       mapped to their ASCII forms, spaces are removed, text is lower-cased);
    2. it is converted with ``generate_rule_based_query``; an empty result
       skips the tag;
    3. the query is run through ``self.client.search``; a result set larger
       than 2000 codes is logged as an error and not applied;
    4. otherwise every matched company that does not already carry the tag
       receives it via ``dbutil.update_company_tag``.

    A failure while handling one tag is logged with its traceback and does
    not stop the remaining tags from being processed.
    """
    # Full-width punctuation that rule authors may type, mapped to ASCII.
    # Written as escapes so the mapping survives any source re-encoding; the
    # previous literals replaced each ASCII character with itself (a no-op).
    fullwidth_to_ascii = (
        (u'\uff0c', u','),
        (u'\uff08', u'('),
        (u'\uff09', u')'),
    )
    for t in dbutil.get_ruled_tags(self.db):
        logger_tag.info('Processing rule for %s' % t.name)
        try:
            rule = t.rule
            for wide, narrow in fullwidth_to_ascii:
                rule = rule.replace(wide, narrow)
            rule = rule.replace(u' ', u'').lower()
            rule = generate_rule_based_query(rule)
            if not rule:
                continue
            codes = self.client.search('topic', query=rule).get('company', {}).get('data', [])
            if len(codes) > 2000:
                # Not inside an exception handler, so use error() rather than
                # exception(): same severity, no bogus empty traceback.
                logger_tag.error('To many results, %s, %s' % (t.name, len(codes)))
                continue
            logger_tag.info('%s processed' % t.name)
            for code in codes:
                cid = dbutil.get_id_from_code(self.db, code)
                if not dbutil.exist_company_tag(self.db, cid, t.id):
                    # NOTE(review): 1.505 is passed through unchanged as the
                    # fourth argument; its meaning is defined by
                    # dbutil.update_company_tag - confirm.
                    dbutil.update_company_tag(self.db, cid, t.id, 1.505)
        except Exception as e:
            # Best-effort per tag: record the failure and move on.
            logger_tag.exception('Fail to process tag rules %s, due to %s' % (t.name, e))
def train(): nf = NewsFeatures() db = dbcon.connect_torndb() mongo = dbcon.connect_mongo() vec = DictVectorizer() random_source = [item[0] for item in nf.source.items() if item[1] <= 20] setx, sety = [], [] print 'init' for index, item in enumerate(mongo.article.news.find({'source': 13030}).limit(2000)): if index % 50 == 0: print index link = item.get('link') cid = item.get('companyId') name = dbutil.get_company_name(db, cid) title = item.get('title') content = [] for v in item.get('contents'): if v.get('content') and v.get('content').strip(): content.append(v.get('content').strip()) content = '\n'.join(content) if len(content) > 50: normal = nf.featurize(cid, name=name, title=title, content=content, link=link) setx.append(normal) sety.append(1) # rcid = dbutil.random_company_id(db) # rname = name[:-1] # makeup = nf.featurize(rcid, name=rname, title=title, content=content, # link=choice(random_source), name_update=False) # # print makeup # if makeup.get('content_simi', 0) > 0.02: # print 'makeup', makeup, rcid # print 'mongo', cid, item.get('_id'), normal # setx.append(makeup) # sety.append(0) # load bad cases with codecs.open(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'corpus/news.badcase'), encoding='utf-8') as f: for line in f: try: cid = dbutil.get_id_from_code(db, line.split('\t')[0]) nids = [nid for nid in line.strip().split('\t')[1:] if nid.strip()] except Exception, e: print 'fail to load bad case', line continue for nid in nids: try: item = mongo.article.news.find({'_id': ObjectId(nid)})[0] link = item.get('link') name = dbutil.get_company_name(db, cid) title = item.get('title') content = [] for v in item.get('contents'): if v.get('content') and v.get('content').strip(): content.append(v.get('content').strip()) content = '\n'.join(content) setx.append(nf.featurize(cid, name=name, title=title, content=content, link=link, name_update=False)) sety.append(0) except Exception, e: print 'nid', nid, e