def label_blockchain():
    """Tag companies with tag 175747 using a pre-trained classifier.

    For every company id, the segmented words from ``Feeder.feed_seged`` are
    averaged into one word2vec vector and passed to the classifier loaded
    from ``models/175747.20180311.model`` (next to this file).

    * If the company's words contain the keyword ``u'区块链'`` the tag is
      applied when the predicted class is 1.
    * Otherwise the tag is applied only when the probability of class 1 is
      above 0.9.

    A company that already carries the tag but is not tagged in this run gets
    the tag written back with score 0 and ``active='N'``.  A failure on one
    company is printed and the run continues with the next company.
    """
    db = dbcon.connect_torndb()
    try:
        feeder = Feeder()
        w2v = Word2Vec.load(word2vec_model)
        model_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'models')
        clf = joblib.load(os.path.join(model_dir, '175747.20180311.model'))
        for cid in dbutil.get_all_company_id(db):
            print(cid)
            try:
                # Keep the words and the feature vector in separate names:
                # the keyword test below must look at the words, not at the
                # averaged vector.
                words = list(feeder.feed_seged(cid))
                vectors = [w2v[w] for w in words if w in w2v]
                if not vectors:
                    # Nothing to average; the mean of an empty list is NaN
                    # and cannot be classified.
                    print('Fail to classify, due to %s' % 'no word found in word2vec model')
                    continue
                features = [np.mean(vectors, axis=0)]
                if u'区块链' in words:
                    positive = clf.predict(features)[0] == 1
                else:
                    # Without the keyword, require a confident prediction.
                    positive = clf.predict_proba(features)[0][1] > 0.9
                if positive:
                    dbutil.update_company_tag(db, cid, 175747, 2.806, verify='N', active='Y')
                elif dbutil.exist_company_tag(db, cid, 175747):
                    # Previously tagged but no longer predicted: deactivate.
                    dbutil.update_company_tag(db, cid, 175747, 0, verify='N', active='N')
            except Exception as e:
                print('Fail to classify, due to %s' % e)
    finally:
        db.close()
def c4_2(today=None):
    """Feed every company id to an ``AndroidUpdateCompanyTracker``.

    :param today: forwarded unchanged as the second argument of
        ``AndroidUpdateCompanyTracker.feed``; ``None`` by default.
    """
    tracker = AndroidUpdateCompanyTracker()
    conn = dbcon.connect_torndb()
    company_ids = dbutil.get_all_company_id(conn)
    for company_id in company_ids:
        tracker.feed(company_id, today)
    conn.close()
def summary_recruit_all(days=30, recruit_ratio=0.2, recruit_threshold=50):
    """Yield companies with a high number of recently updated job postings.

    For each company, the job documents in ``mongo.job.job`` whose
    ``recruit_company_id`` is one of the company's recruitment ids and whose
    ``updateDate`` lies within the last ``days`` days are counted.

    :param days: size of the look-back window, in days.
    :param recruit_ratio: when a positive headcount is known, the company is
        yielded if ``jobs / headcount`` (rounded to 2 decimals) exceeds this.
    :param recruit_threshold: when no positive headcount is known, the
        company is yielded if the job count exceeds this.
    :return: generator of ``(cid, ratio)`` tuples; ``ratio`` is ``1`` for
        companies selected through ``recruit_threshold``.
    """
    db = dbcon.connect_torndb()
    mongo = dbcon.connect_mongo()
    gte = datetime.now() - timedelta(days=days)
    # try/finally so the connection is released even when the consumer stops
    # iterating early or an error interrupts the scan.
    try:
        for cid in dbutil.get_all_company_id(db):
            recruit_ids = dbutil.get_company_recruitments(db, cid)
            if not recruit_ids:
                continue
            recent_jobs = mongo.job.job.find({
                'recruit_company_id': {'$in': recruit_ids},
                'updateDate': {'$gte': gte},
            })
            # Count the matches without building a list of all documents.
            recruits = sum(1 for _ in recent_jobs)
            headcount = dbutil.get_company_headcount_max(db, cid)
            if headcount and headcount > 0:
                ratio = round(recruits / float(headcount), 2)
                if ratio > recruit_ratio:
                    yield cid, ratio
            # NOTE(review): the fallback is taken to pair with the headcount
            # test (unknown headcount) -- confirm against the original layout.
            elif recruits > recruit_threshold:
                yield cid, 1
    finally:
        db.close()
def c1():
    """Feed every company id to a ``NewsCompanyTracker``.

    The database connection is closed when the scan ends, including when a
    ``feed`` call raises, and ``'tracked'`` is logged to ``logger_track``
    after a complete pass.
    """
    nct = NewsCompanyTracker()
    db = dbcon.connect_torndb()
    try:
        for cid in dbutil.get_all_company_id(db):
            nct.feed(cid)
    finally:
        db.close()
    # NOTE(review): logged once after the loop; confirm this was not meant to
    # be a per-company message.
    logger_track.info('tracked')
def score_activation_full():
    """Run ``ActivationScorer.score`` on every company id.

    Each scored id is reported on ``logger_activation``; the database
    connection is closed after the last company.
    """
    conn = dbcon.connect_torndb()
    activation_scorer = ActivationScorer()
    company_ids = dbutil.get_all_company_id(conn)
    for company_id in company_ids:
        activation_scorer.score(company_id)
        logger_activation.info('%s scored' % company_id)
    conn.close()
def extract_all(self):
    """Run ``self.extract`` on every company id.

    Each success is logged on ``logger_tag``.  An exception raised for one
    company is logged with its traceback and does not stop the run.  The
    connection used to list the company ids is closed at the end.
    """
    db_back = dbcon.connect_torndb()
    try:
        for cid in dbutil.get_all_company_id(db_back):
            try:
                self.extract(cid)
                logger_tag.info('%s processed' % cid)
            except Exception as e:
                logger_tag.exception('%s, %s' % (cid, e))
    finally:
        db_back.close()
def dump_full(self):
    """Call ``self.get_similar`` for every company id.

    Each success is logged on ``logger_nlp``.  An exception raised for one
    company is logged with its traceback and does not stop the run.  The
    connection used to list the company ids is closed at the end.
    """
    db = dbcon.connect_torndb()
    try:
        for cid in dbutil.get_all_company_id(db):
            try:
                self.get_similar(cid)
                logger_nlp.info('%s processed' % cid)
            except Exception as e:
                logger_nlp.exception('%s failed, %s' % (cid, e))
    finally:
        db.close()
def c5001_5002():
    """
    5001 shareholder change
    5002 registered capital change
    update every day

    Every company id is fed to a ``CompanyGongshangTracker`` together with
    ``logger_track``.
    """
    tracker = CompanyGongshangTracker()
    conn = dbcon.connect_torndb()
    logger_track.info('Processing gongshang change')
    company_ids = dbutil.get_all_company_id(conn)
    for company_id in company_ids:
        tracker.feed(company_id, logger_track)
    conn.close()
def c4004():
    """
    4004 recruiting for core positions such as CTO or COO, update every day

    Every truthy item produced by ``CompanyRecruitTracker.feed_4004`` for a
    company is sent with ``send_company_message_msg`` and the company id is
    logged on ``logger_track``.
    """
    tracker = CompanyRecruitTracker()
    conn = dbcon.connect_torndb()
    now = datetime.today()
    for company_id in dbutil.get_all_company_id(conn):
        for message in tracker.feed_4004(company_id, now):
            if not message:
                continue
            tracker.send_company_message_msg(message)
            # NOTE(review): logged once per sent message -- confirm nesting.
            logger_track.info('4004 %s' % company_id)
    conn.close()
def c4002():
    """
    4002 no new job posting for a long time, update every week

    When ``CompanyRecruitTracker.feed_4002`` returns a truthy value for a
    company, it is sent with ``send_company_message_msg`` and the company id
    is logged on ``logger_track``.
    """
    tracker = CompanyRecruitTracker()
    conn = dbcon.connect_torndb()
    now = datetime.today()
    for company_id in dbutil.get_all_company_id(conn):
        message = tracker.feed_4002(company_id, now)
        if not message:
            continue
        tracker.send_company_message_msg(message)
        # NOTE(review): logged only when a message was sent -- confirm nesting.
        logger_track.info('4002 %s' % company_id)
    conn.close()
def dump(self, fetch_model='default'):
    """Store the companies produced by ``self.generate_comps`` for each id.

    :param fetch_model: ``'makeup'`` iterates over
        ``dbutil.get_all_company_id_makeups``; any other value iterates over
        ``dbutil.get_all_company_id``.

    For each company id the result of ``self.generate_comps(cid)`` is written
    with ``dbutil.update_company_rels`` (``feedback_threshold=0.5``).  An
    exception raised for one company is logged with its traceback on
    ``logger_nlp`` and does not stop the run.  The connection is closed at
    the end.
    """
    db = dbcon.connect_torndb()
    try:
        if fetch_model == 'makeup':
            all_cids = dbutil.get_all_company_id_makeups(db)
        else:
            all_cids = dbutil.get_all_company_id(db)
        for cid in all_cids:
            try:
                dbutil.update_company_rels(db, cid, self.generate_comps(cid),
                                           feedback_threshold=0.5)
                logger_nlp.info('%s has similar companies now' % cid)
            except Exception as e:
                logger_nlp.exception(
                    'fail to find similar company for %s, %s' % (cid, e))
    finally:
        db.close()
def create_indice(self):
    """Create an index entry for every company.

    Runs ``self.__check()`` first, loads ``self.topic_tags`` from
    ``dbutil.get_topic_corresponding_tags``, logs the Elasticsearch info and
    configuration, then calls ``self.create_single`` for each company id.
    An exception raised for one company is logged with its traceback on
    ``logger_universal_index`` and does not stop the run.  The database
    connection is closed at the end.
    """
    self.__check()
    db = dbcon.connect_torndb()
    try:
        self.topic_tags = dbutil.get_topic_corresponding_tags(db)
        logger_universal_index.info('Start to create indice')
        logger_universal_index.info(str(self.es.info()))
        logger_universal_index.info('ES Config %s' % str(tsbconfig.get_es_config()))
        for cid in dbutil.get_all_company_id(db):
            try:
                self.create_single(db, cid)
                logger_universal_index.info(
                    '%s index created, %s' % (cid, dbutil.get_company_name(db, cid)))
            except Exception as e:
                logger_universal_index.exception('%s failed # %s' % (cid, e))
    finally:
        db.close()
def classify_founder():
    """Tag companies whose ``FounderScorer`` score is at least 0.5.

    Such companies receive tag 309128 ("outstanding team") with the score as
    the tag value, and the insertion is logged on ``logger_yl``.
    """
    conn = dbcon.connect_torndb()
    founder_scorer = FounderScorer()
    company_ids = dbutil.get_all_company_id(conn)
    for company_id in company_ids:
        team_score = founder_scorer.score(company_id)
        if team_score >= 0.5:
            # 309128: outstanding team
            dbutil.update_company_tag(conn, company_id, 309128, team_score, "Y")
            logger_yl.info('Outstanding team: %s insert' % company_id)
        # Disabled in the original: extra log lines for fs.has_QBFJ
        # (Tsinghua / Peking / Fudan / Jiao Tong team), fs.has_overseas
        # (overseas-returnee team) and fs.has_serial_entrepreneur
        # (serial entrepreneur).
    conn.close()
def __iter__(self):
    """Yield the lower-cased word list of each usable company.

    A company is used when its score from ``dbutil.get_company_score`` is
    above ``complete_threshold``, its text from ``self.feeder.feed_string``
    segments into at least one word, and at least
    ``description_len_threshold`` words remain after ``self.default_filter``.

    Side effects: ``self.max_id`` is raised to the largest company id that
    passed the score check, and ``self.mapping_id2in`` /
    ``self.mapping_in2id`` record the position of each yielded company
    (counting from 0).

    The database connection is closed when iteration finishes, when the
    generator is closed early, or when an error interrupts it.
    """
    db = dbcon.connect_torndb()
    index = 0
    try:
        for cid in dbutil.get_all_company_id(db):
            contents = self.feeder.feed_string(cid)
            score = dbutil.get_company_score(db, cid)
            if not (score and score > complete_threshold):
                continue
            if int(cid) > self.max_id:
                self.max_id = int(cid)
            words = list(self.segmenter.cut(contents))
            if not words:
                continue
            words = self.default_filter(words)
            if len(words) < description_len_threshold:
                continue
            self.mapping_id2in[cid] = index
            self.mapping_in2id[index] = cid
            index += 1
            yield [word.lower() for word in words]
    finally:
        db.close()
self.__check() db = dbcon.connect_torndb() self.logger.info('Start to create indice') self.logger.info(str(self.es.info())) self.logger.info('ES Config %s' % str(tsbconfig.get_es_config())) try: self.logger.info('Start to create location & tag indice') self.create_indice_completion_locations(db) self.create_indice_completion_keywords(db) except Exception, e: self.logger.exception('location indice & tag failed') self.logger.exception(e) for cid in dbutil.get_all_company_id(db): try: self.create_single(db, cid) self.logger.info('%s index created, %s' % (cid, dbutil.get_company_name(db, cid))) except Exception, e: self.logger.exception('%s failed # %s' % (cid, e)) db.close() def create_single(self, db, cid): """ create a single index for a particular company, completion id consists of its type and original id, including cxxxx, fxxx, axxxx, pxxxx, nxxxx, standing for company, full, artifact, product, nick kxxxx, keyword """