def update(name):
    """Drop the gongshang record for *name* and clear its lastCheckTime marker."""
    conn = db.connect_mongo()
    gongshang_coll = conn.info.gongshang
    name_coll = conn.info.gongshang_name
    gongshang_coll.delete_one({"name": name})
    name_coll.update_one({"name": name}, {'$set': {"lastCheckTime": None}})
    conn.close()
def save_error(name, value):
    """Record an error value: set field *name* to *value* on the raw.count
    document keyed by {'count': 'fullname'}.

    The field is only written when it already exists on the document
    (preserving the original behaviour); a missing document is skipped.
    """
    mongo = db.connect_mongo()
    try:
        collection = mongo.raw.count
        item = collection.find_one({'count': 'fullname'})
        # Bug fix: `item.has_key(name)` raised AttributeError when the counter
        # document was missing; also has_key() is deprecated — use `in`.
        if item is not None and name in item:
            try:
                collection.update({'count': 'fullname'}, {"$set": {name: value}})
            except Exception as e:
                print("mongo error: %s" % e)
    finally:
        # Bug fix: the connection leaked if find_one raised.
        mongo.close()
def save_data(investor, data, year):
    """Tag *data* with its investor and year, then insert it into raw.qmp_tz."""
    mongo = db.connect_mongo()
    collection = mongo.raw.qmp_tz
    data.update({'investor': investor})
    data.update({'year': year})
    try:
        collection.insert_one(data)
        # Bug fix: the message names "investor" but formatted the whole data
        # document; log the investor instead.
        logger.info('save investor:%s | year:%s data done' % (investor, year))
    except Exception as e:  # bug fix: py2-only "except Exception, e" syntax
        logger.info('mongo error:%s' % e)
    finally:
        # Bug fix: the connection was previously never closed.
        mongo.close()
def feed_doc(tag=u'金融'):
    """Yield, per news article tagged *tag*, one chained stream of filtered,
    segmented tokens drawn from every non-empty content piece."""
    mongo = dbcon.connect_mongo()
    seg = Segmenter(tag=True)
    token_filter = get_default_filter()
    for record in mongo.article.news.find({'tags': tag}):
        streams = []
        for piece in record['contents']:
            if piece['content'].strip():
                streams.append(token_filter(seg.cut(piece['content'].strip())))
        yield chain(*streams)
def __init__(self, tpid):
    """Load the features, relevant tags, expansion flag and parsed rules
    for topic *tpid*."""
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.tpid = tpid
    self.features = self.__get_features()
    self.tags = dbutil.get_topic_relevant_tags(self.db, self.tpid)
    info = dbutil.get_topic_info(self.db, self.tpid)
    self.auto_expand = info.autoExpand
    self.rules = self.__parse_topic_rule(info.rule)
def __init__(self, idid):
    """Load the tag, expansion flag and parsed company rules for industry *idid*."""
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.idid = idid
    self.tag = dbutil.get_industry_tag(self.db, self.idid)
    info = dbutil.get_industry_info(self.db, self.idid)
    # autoExpand is stored as a 'Y'/'N' flag in the relational DB.
    self.auto_expand = (info.autoExpand == 'Y')
    self.company_rules = self.__parse_company_rule(info.rule4company)
def add_newsdate(link, bdate):
    """Backdate the source-13900 news item at *link* to *bdate*.

    Only touches the item when it exists, comes from source 13900, and its
    stored date is later than *bdate*.
    """
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    item = collection_news.find_one({"link": link})
    # Fix: dict.has_key() is deprecated; use the `in` operator.
    if item is not None and "source" in item and item["source"] == 13900:
        if item["date"] > bdate:
            collection_news.update_one({"_id": item["_id"]},
                                       {'$set': {"date": bdate}})
            logger.info("update date %s into %s", bdate, link)
    mongo.close()
def dump_domain():
    """Count news links per domain and dump every domain seen more than once
    to dumps/news.domain as JSON."""
    mongo = dbcon.connect_mongo()
    tally = {}
    for doc in mongo.article.news.find({}):
        host = urlparse.urlparse(doc['link']).netloc
        tally[host] = tally.get(host, 0) + 1
    frequent = dict((host, n) for host, n in tally.items() if n > 1)
    with codecs.open('dumps/news.domain', 'w', 'utf-8') as fo:
        fo.write(json.dumps(frequent))
def add_companyIds(link, companyId):
    """Attach *companyId* to the news item at *link*.

    Sets companyId/processStatus and appends to the companyIds array, but
    only when the item exists, already has a companyIds field, and does not
    yet contain this id.
    """
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    item = collection_news.find_one({"link": link})
    # Fixes: has_key() is deprecated (use `in`); the membership test now
    # coerces to int, since the array stores int(companyId) — a str input
    # previously always "missed" and triggered a redundant update.
    if item is not None and "companyIds" in item and int(companyId) not in item["companyIds"]:
        collection_news.update_one(
            {"_id": item["_id"]},
            {'$set': {"companyId": int(companyId), "processStatus": 1},
             '$addToSet': {"companyIds": int(companyId)}})
        logger.info("add companyId %s into %s", companyId, link)
    mongo.close()
def save_unionid_item(unionid, date, active=1, used=0, inuse=0, dayadd=0, proxy=1):
    """On a successful save, update ``ondate``/``daytimes`` in real time;
    on a failed request (``proxy == 0``), reset the unionid's state with a
    fresh proxy.

    :param unionid: qmp account identifier to look up in raw.qmp_id
    :param date: the business date to store in ``ondate``
    :param active: new 'active' flag
    :param used: new 'used' flag
    :param inuse: new 'inuse' flag
    :param dayadd: increment added to the stored daytimes counter
    :param proxy: 0 means "request failed, reset with a new proxy"
    :return: the refreshed item document on the reset path, else None
    """
    mongo = db.connect_mongo()
    collection = mongo.raw.qmp_id
    try:
        item = collection.find_one({'unionid': unionid})
        if item is not None:
            if proxy == 0:
                # Reset path: the old proxy failed — acquire a new one and
                # rewrite the account's state, then return the fresh doc.
                proxy = get_proxy()
                collection.update({'unionid': unionid}, {
                    '$set': {
                        'active': active,
                        'used': used,
                        'inuse': inuse,
                        'proxy': proxy,
                        'date': datetime.datetime.now()
                    }
                })
                item = collection.find_one({'unionid': unionid})
                mongo.close()
                return item
            else:
                # Normal path: bump the per-day usage counter and record
                # the business date.
                daytimes = item['daytimes']
                collection.update({'unionid': unionid}, {
                    '$set': {
                        'active': active,
                        'used': used,
                        'inuse': inuse,
                        'daytimes': daytimes + dayadd,
                        'ondate': date,
                        'date': datetime.datetime.now()
                    }
                })
                mongo.close()
        # NOTE(review): when item is None the connection opened above is
        # never closed on this path — verify whether that leak is intended.
    except:
        # NOTE(review): bare except swallows everything (incl. KeyboardInterrupt);
        # consider `except Exception`.
        logger.info('mongo error')
        mongo.close()
def save_message_4_deal(msg):
    """Insert *msg* into message.message_4_deal unless a message already
    exists for the same (deal_id, deal_log_id) pair."""
    conn = db.connect_mongo()
    coll = conn.message.message_4_deal
    key = {"deal_id": msg["deal_id"], "deal_log_id": msg["deal_log_id"]}
    existing = coll.find_one(key)
    if existing is None:
        # Timestamp is shifted back 8 hours — presumably local CST to UTC; verify.
        msg["createTime"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        coll.insert_one(msg)
        logger.info("Insert new message_4_deal for deal_id:%s, deal_log_id: %s",
                    msg["deal_id"], msg["deal_log_id"])
    else:
        logger.info("Existed message_4_deal for deal_id:%s, deal_log_id: %s",
                    msg["deal_id"], msg["deal_log_id"])
    conn.close()
def run_tes():
    """Recompute and persist the changeInfo diff for two fixed test companies."""
    conn = db.connect_mongo()
    gongshang = conn.info.gongshang
    names = ['深圳华大基因股份有限公司', '北京六合华大基因科技有限公司']
    records = list(gongshang.find({'name': {'$in': names}}))
    for record in records:
        logger.info('processing %s', record['name'])
        fresh = assign_id(diff(record, 'changeInfo'))
        doc_id = record['_id']  # renamed to avoid shadowing the builtin `id`
        gongshang.update({'_id': doc_id}, {'$set': {'changeInfo': fresh}})
def take_appid():
    """Return the largest appid recorded for the module-level ``source``,
    or None when the lookup fails (e.g. no matching documents)."""
    conn = db.connect_mongo()
    coll = conn.market.appmini_market
    try:
        # Ascending sort, so the final element holds the maximum appid.
        docs = list(coll.find({'source': source}).sort('appid'))
        newest = docs[-1]
        appid = newest['appid']
        conn.close()
        return appid
    except Exception as e:
        print('mongo error:%s' % e)
        conn.close()
def _flag_if_rows(conn, sql, params, label):
    """Return *label* when *sql* (bound with *params*) yields any rows, else ''."""
    return label if len(conn.query(sql, *params)) > 0 else ""


def getinfo(companyId, corporateId):
    """Summarize which sections of a company/corporate pair still lack verification.

    Runs one unverified-rows check per section (corporate, company basics,
    fundings, artifacts, members, aliases, recruitment) plus a mongo-side
    description check, and returns a human-readable summary string.
    """
    conn = db.connect_torndb()
    mongo = db.connect_mongo()
    # (label, sql, params) triples — each label is appended when the query
    # finds unverified rows. SQL strings are unchanged from the original.
    checks = [
        ("corporate ",
         "select * from corporate where (active is null or active='Y')"
         " and verify is null and id=%s", (corporateId,)),
        ("基本信息 ",
         "select * from company where (active is null or active='Y')"
         " and verify is null and id=%s", (companyId,)),
        ("融资 ",
         "select * from funding f left join corporate c on f.corporateId=c.id "
         "where f.corporateId=%s and (c.active is null or c.active='Y') and "
         "(f.active is null or f.active='Y') and f.verify is null", (corporateId,)),
        ("产品 ",
         "select * from artifact where companyId=%s and (active is null or active='Y') "
         "and verify is null", (companyId,)),
        ("团队 ",
         "select cmr.* from company_member_rel cmr left join member m on cmr.memberId=m.id "
         "where cmr.companyId=%s and m.verify is null and "
         "(cmr.type = 0 or cmr.type = 5010 or cmr.type = 5020) and "
         "(cmr.active is null or cmr.active='Y')", (companyId,)),
        ("产品线短名 ",
         "select * from company_alias where companyId=%s and (active is null or active='Y')"
         " and verify is null", (companyId,)),
        ("corporate公司名 ",
         "select * from corporate_alias where (active is null or active='Y') "
         "and verify is null and corporateId=%s", (corporateId,)),
        ("招聘 ",
         "select * from company_recruitment_rel where companyId=%s and "
         "(active is null or active='Y') and verify is null", (companyId,)),
    ]
    verfyinfo = ""
    for label, sql, params in checks:
        verfyinfo += _flag_if_rows(conn, sql, params, label)
    # The description check is inverted: a MISSING mongo doc means unverified.
    desc = mongo.company.modify.find_one({
        'companyId': companyId,
        'sectionName': 'desc'
    })
    if desc is None:
        verfyinfo += "简介 "
    conn.close()
    mongo.close()  # bug fix: the mongo connection was previously leaked
    if len(verfyinfo) > 0:
        info = verfyinfo + "未verify"
    else:
        info = "都verify"
    logger.info("company: %s->%s", companyId, info)
    return info
def process(content, flag):
    """Parse a listing page, collect fresh article links into the module-level
    URLS/links accumulators, and return the running total len(URLS).

    :param content: raw HTML bytes of the listing page (UTF-8)
    :param flag: "all" forces re-collection even of links already stored
    """
    # j = json.loads(content)
    # logger.info(content)
    global TYPE
    d = pq(html.fromstring(content.decode("utf-8")))
    # Two page layouts: one carries an <ol data-information-id=...> wrapper,
    # the other nests items under '.news'.
    if content.find('<ol class="clear list" data-information-id=') >= 0:
        columns = d
    else:
        columns = d('.news')
    cnt = 0
    for column in columns:
        cnt += 1
        TYPE = 60001  # if cnt == 2 else 60005
        htmls = d(column)('.list')
        # htmls=[i for i in htmls if i.attr('href').find('/Xfeature/view?aid=')>=0]
        for a in htmls:
            # print d(a).html()
            # Skip anchors with no title attribute.
            if d(a)('.list > a').attr('title') is None:
                continue
            title = d(a)('.list > a').attr('title').strip()
            # Funding-related keywords force TYPE to 60001 (funding news).
            fund_keywords = [u'融资', u'融资方', u'领投', u'跟投', u'投资', u'收购']
            for keyword in fund_keywords:
                if title.find(keyword) >= 0:
                    TYPE = 60001
                    break
            link = d(a)('.list > a').attr('href').strip()
            # link = 'http://www.jinse.com' + d(a)("h4 a").attr('href')
            post = d(a)('.list > a img').attr('data-original').strip() if d(a)('.list > a img').attr('data-original') is not None else None
            # NOTE(review): a fresh mongo connection per link is expensive —
            # consider hoisting outside the loops.
            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            item = collection_news.find_one({"link": link, "title": title})
            mongo.close()
            # Collect when unseen (or flag == "all") and not already queued.
            if (item is None or flag == "all") and link not in links:
                # if link not in links:
                linkmap = {
                    "link": link,
                    "post": post,
                    "title": title,
                    "type": TYPE,
                }
                # print linkmap
                URLS.append(linkmap)
                links.append(link)
    return len(URLS)
def correct_comps_weight(): mongo = dbcon.connect_mongo() db = dbcon.connect_torndb() for tpc in dbutil.get_topic_companies(db, 57): candidates = mongo.comps.candidates.find_one({ 'company': tpc.companyId }).get('candidates') for candidate in candidates: if len(candidate) < 2: print tpc.companyId, candidate
def __init__(self):
    """Set up DB handles, scoring constants, the trusted-source list and
    the NLP segmentation/filter tools."""
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    # scoring knobs
    self.non_trusted_discount = 0.5
    self.brief_promote = 1.5
    self.trusted_sources = dicts.get_known_company_source()
    # NLP tooling
    self.seg = Segmenter(tag=True)
    self.wfilter = word_filter.get_default_filter()
def run(crawler, concurrent_num, flag):
    """Paginate xtecher's project search, queue unseen projects into the
    module-level URLS list, and fan each page's work out to gevent workers.

    :param crawler: page fetcher with a .crawl(url) -> {'get', 'content'} API
    :param concurrent_num: number of gevent workers spawned per page
    :param flag: 'all' re-queues projects even when already stored
    """
    page = 1
    while True:
        url = 'http://xtecher.com/Website/Search/searchProjectResult?&page=%s' % (
            page)
        # Retry the same page until the fetch succeeds.
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                # if result['content'].find('非常抱歉,没有找到结果')>0:
                #     break
                d = pq(html.fromstring(result['content'].decode("utf-8")))
                for item in d('.contentBox'):
                    title = d(item)('h4').text()
                    href = 'http://xtecher.com' + d(item)(
                        '.leftcontent > a').attr('href')
                    # Project key is the trailing pid query parameter.
                    key = href.split('pid=')[-1]
                    # print (title), href, key
                    linkDict = {
                        "href": href,
                        "title": title,
                    }
                    # NOTE(review): one mongo connection per listing item —
                    # consider hoisting outside the loop.
                    mongo = db.connect_mongo()
                    collection = mongo.raw.projectdata
                    # `item` (the pyquery node) is shadowed here by the DB doc.
                    item = collection.find_one({
                        "source": SOURCE,
                        'type': TYPE,
                        'key_int': int(key)
                    })
                    if item is None or flag == 'all':
                        logger.info('not exists %s ,%s ' % (href, title))
                        URLS.append(linkDict)
                    else:
                        logger.info('already exists %s , %s', href, title)
                    mongo.close()
                break
        # An empty queue means this page produced nothing new — stop paging.
        if len(URLS) == 0:
            logger.info('page %s returns no fresh item', page)
            break
        threads = [
            gevent.spawn(process, crawler) for i in xrange(concurrent_num)
        ]
        gevent.joinall(threads)
        page += 1
def run():
    """Audit all active investors: ensure each has a short-name alias row,
    then upsert per-investor alias/link/yearly-funding-count summaries into
    mongo.investor.fundingcnt."""
    conn = db.connect_torndb()
    mongo = db.connect_mongo()
    invs = conn.query("select * from investor where (active is null or active='Y')")
    # cnt_event = 0
    result = []
    for inv in invs:
        # check investor_shortname
        if inv["name"] is not None and inv["name"].strip() != "":
            ialias = conn.get("select * from investor_alias where investorId=%s and name=%s and "
                              "(active is null or active != 'N') limit 1", inv["id"], inv["name"])
            if ialias is None:
                # Backfill the missing short-name (type 12020) alias row.
                insql = "insert investor_alias(investorId,name,type,createTime,modifyTime,createUser) " \
                        "values(%s,%s,%s,now(),now(),139)"
                conn.insert(insql, inv["id"], inv["name"], 12020)
                logger.info("**********add name for %s %s", inv["name"], inv["id"])
        allialias = conn.query("select * from investor_alias where investorId=%s and "
                               "(active is null or active != 'N')", inv["id"])
        sources = conn.query('''select si.sourceId from investor_source_rel r join source_investor si on si.id=r.sourceInvestorId where r.investorId=%s and (r.active is null or r.active != 'N')''', inv["id"])
        # NOTE(review): `map` shadows the builtin of the same name.
        # Alias type 12020 = short name, 12010 = full name (per the insert above).
        map = {
            'investorName': inv['name'],
            'investorId': inv['id'],
            '是否上线': inv['online'],
            'active': inv['active'],
            '短名': ','.join([i['name'] for i in allialias if i['type'] == 12020]),
            '全名': ','.join([i['name'] for i in allialias if i['type'] == 12010]),
            'link': ','.join(['https://rong.36kr.com/org/%s'%i['sourceId'] for i in sources]),
            'modifyTime': datetime.datetime.now()
        }
        for year in [2018, 2017, 2016, 2015, 2014]:
            map[str(year)] = cal_fundnum(inv, year)
        result.append(map)
    # import pandas as pd
    # df = pd.DataFrame(result)
    # df.to_excel('funding_cnt.xlsx', index=0)
    # Upsert each summary keyed by investorId.
    collection = mongo.investor.fundingcnt
    for i in result:
        if collection.find_one({'investorId': i['investorId']}) is None:
            collection.insert_one(i)
        else:
            collection.update_one({'investorId': i['investorId']}, {'$set': i})
    conn.close()
    mongo.close()
def main(test=True):
    """Entry point: open the shared DB handles, run the pipeline, clean up.

    :param test: forwarded to process_users to control test mode
    """
    global conn
    global mongo
    conn = db.connect_torndb_proxy()
    mongo = db.connect_mongo()
    init()
    process_industries()
    process_users(test)
    conn.close()
    mongo.close()
def take_link():
    """Fetch one unchecked link document for ``Source``.

    When the pool is exhausted, every link's check flag is reset to 'N'
    and the lookup is retried once. Returns the link document, or None
    when nothing is found or mongo errors out.
    """
    mongo = db.connect_mongo()
    collection = mongo.market.links
    item = None
    try:
        item = collection.find_one({'check': 'N', 'source': Source})
        if item is None:
            # All links consumed: reset the whole pool and pick again.
            collection.update({}, {'$set': {'check': 'N', 'modifytime': datetime.datetime.now()}}, multi=True)
            item = collection.find_one({'check': 'N', 'source': Source})
    except Exception as e:  # bug fix: py2-only "except Exception, e" syntax
        logger.info('mongo error:%s' % e)
    finally:
        mongo.close()  # bug fix: the connection was previously leaked
    return item  # bug fix: the found item was computed but never returned
def clean_dup():
    """Close out duplicate pending company tasks: for each companyId with
    more than one untaken, shareable, day-old task, mark every task except
    the newest as processed (taker 139, mark 'dup201708')."""
    global l
    mongo = dbcon.connect_mongo()
    # Group pending (processStatus 0), shareable, untaken tasks older than
    # one day by companyId; keep only companyIds occurring more than once.
    for t in mongo.task.company.aggregate([
            {'$match': {'processStatus': 0, 'no_share': False, 'taker': None,
                        'createTime': {'$lt': (datetime.now() - timedelta(days=1))}}},
            {'$group': {'_id': '$companyId', 'count': {'$sum': 1}}},
            {'$match': {'count': {'$gt': 1}}}]):
        l.info('Processing %s' % t)
        # Sort by creation time and close all but the most recent task.
        for st in sorted(mongo.task.company.find({'companyId': t['_id']}),
                         key=lambda x: x['createTime'])[: -1]:
            mongo.task.company.update(
                {'_id': st['_id']},
                {'$set': {'processStatus': 1, 'taker': 139, 'mark': 'dup201708'}})
def get_code_from_user_log(_id):
    """Return the code nested under jsonRequest.payload.code in the user_log
    entry identified by *_id*, or None when the entry or any key is missing."""
    mongo = db.connect_mongo()
    user_log = mongo.log.user_log.find_one({"_id": ObjectId(_id)})
    mongo.close()
    if user_log is not None:
        try:
            return user_log["jsonRequest"]["payload"]["code"]
        # Narrowed from a bare `except:` — only a missing key or an
        # unexpectedly-shaped (non-dict) payload is an expected miss here.
        except (KeyError, TypeError):
            pass
    return None
def __init__(self, es=None):
    """Wire up DB handles, the shared amac logger, and an Elasticsearch client.

    :param es: an existing Elasticsearch client to reuse; when omitted, one
               is built from the host/port in tsbconfig.
    """
    global logger_amac
    self.db = dbcon.connect_torndb()
    self.mongo = dbcon.connect_mongo()
    self.logger = logger_amac
    if es:
        self.es = es
    else:
        host, port = tsbconfig.get_es_config()
        self.es = Elasticsearch([{'host': host, 'port': port}])
    self.logger.info('Coin Client inited')
def step2(proxy, company_name):
    """Fetch tianyancha's tongji JSON for *company_name* through *proxy*,
    extract the (token, _utm) pair, and persist it onto the proxy record.

    Returns True on success, False on any failure.
    """
    TYCID = proxy["TYCID"]
    # Build the cookie from whatever credentials the proxy already holds.
    if proxy.get("token") is None:
        cookie = "TYCID=%s, tnet=%s" % (TYCID, proxy["ip"])
    else:
        cookie = "TYCID=%s, tnet=%s, token=%s, _utm=%s" % (
            TYCID, proxy["ip"], proxy["token"], proxy["utm"])
    url = "http://www.tianyancha.com/tongji/%s.json?random=%d" % (quote(
        company_name.encode("utf8")), int(time.time() * 1000))
    headers = {
        "accept": "application/json",
        "Tyc-From": "normal",
        "CheckError": "check",
        "Cookie": cookie,
        "Referer": "http://www.tianyancha.com/search?key=%s?checkFrom=searchBox" % company_name
    }
    response = request(proxy, url, headers=headers)
    if response is None:
        return False
    try:
        # html = unicode(response.text, encoding="utf-8", errors='replace')
        html = response.text
    except Exception:
        traceback.print_exc()
        # Bug fix: execution previously fell through with `html` unbound,
        # raising an uncaught NameError below; bail out instead.
        return False
    token, _utm = process_tongji(html, company_name)
    if token is None or _utm is None:
        logger.info("worker: %s, %s" % (local.worker_no, html))
        logger.info("worker: %s, step2: can't get token,_utm" % local.worker_no)
        return False
    proxy["token"] = token
    proxy["utm"] = _utm
    mongo = db.connect_mongo()
    mongo.raw.proxy_tyc.update({"_id": proxy["_id"]}, {"$set": {
        "token": token,
        "utm": _utm
    }})
    mongo.close()
    logger.info("worker: %s, step2: TYCID=%s, token=%s, _utm=%s" %
                (local.worker_no, TYCID, token, _utm))
    return True
def process(content, page_crawler, flag):
    """Walk a river-style article listing and, for each fresh article
    (unknown source/key — or all of them when *flag* == "all"), crawl its
    page and hand it to process_news with up to 15 retries.
    """
    d = pq(html.fromstring(content))
    lis = d("ul#river1> li.river-block")
    lis1 = d("ul#river2> li.river-block")
    # NOTE(review): len(lis1) >= 0 is always true — presumably `> 0` was
    # intended (extending with an empty selection is harmless either way).
    if len(lis1)>= 0:
        lis.extend(lis1)
    cnt = 0
    #logger.info(lis)
    for li in lis:
        l = pq(li)
        try:
            title = l("h2.post-title> a").text().strip()
            href = l("h2.post-title> a").attr("href").strip()
            news_key = l('li.river-block').attr("id")
        except:
            logger.info("No id for:")
            #logger.info(l)
            continue
        # Skip the recurring "Crunch Report" video posts.
        if title.find("Crunch Report") >= 0:
            continue
        news_url = href
        news_posttime = l('div.byline> time.timestamp').attr('datetime')
        logger.info("%s, %s, %s, %s", title, news_key, news_url, news_posttime)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        item = collection_news.find_one({"source":SOURCE, "key_int":int(news_key)})
        # Same title already stored under a different source.
        newses = list(collection_news.find({"title": title, "source": {"$ne": SOURCE}}))
        mongo.close()
        # cnt +=1
        if item is None or flag == "all":
            # Do not re-crawl when a typed duplicate exists under another source.
            craw = True
            for news in newses:
                if news.has_key("type") and news["type"] > 0:
                    craw = False
                    break
            if craw:
                retry_times = 0
                while True:
                    result = page_crawler.crawl(news_url, agent=True)
                    if result['get'] == 'success':
                        #logger.info(result["content"])
                        try:
                            process_news(result['content'], news_key, news_url, news_posttime)
                            cnt += 1
                        except Exception, ex:
                            # NOTE(review): the `pass` below is dead code —
                            # the logger call already handles the exception.
                            pass
                            logger.exception(ex)
                        break
                    retry_times += 1
                    if retry_times > 15:
                        break
def process():
    """Re-run update() for every gongshang record whose invests entries carry
    a nested name.name field (i.e. malformed invest data)."""
    # NOTE(review): this `while True` executes exactly once because of the
    # unconditional break at the end — looks like leftover retry scaffolding.
    while True:
        mongo = db.connect_mongo()
        collection = mongo.info.gongshang
        # Match documents where invests[].name is itself a dict with a name key.
        items = list(collection.find({'invests.name.name': {'$exists': True}}))
        for item in items:
            name = item["name"]
            logger.info("updating %s", name)
            update(name)
        break
    # NOTE(review): the mongo connection opened above is never closed.
def feed_doc_s(sid):
    """Yield, per news article in sector *sid*, one chained stream of
    filtered tag/itag words taken from every non-empty content piece."""
    mongo = dbcon.connect_mongo()
    tagger = Tagger(tags=True)
    token_filter = get_default_filter()
    for record in mongo.article.news.find({'sectors': sid}):
        streams = []
        for piece in record['contents']:
            text = piece['content'].strip()
            if not text:
                continue
            # Keep only words the tagger labels as tag/itag.
            words = [w[0] for w in tagger.tag(text) if w[1] in ('tag', 'itag')]
            streams.append(token_filter(words))
        yield chain(*streams)
def send_qmp_email():
    """Build the daily qmp funding-events spreadsheet and mail it.

    At 08:00 local time the latest 50 events are sent; at any other hour,
    only the events recorded under today's date string.
    """
    print('this time:%s to send email' % datetime.datetime.now())
    hour = time.localtime()[3]
    mongo = db.connect_mongo()
    collection = mongo.raw.qmp_rz_incr
    if hour == 8:
        # Morning digest: most recent 50 regardless of date.
        items = list(collection.find().sort('createtime', -1).limit(50))
    else:
        date = datetime.date.today().strftime('%Y-%m-%d')
        items = list(collection.find({'date': date}))
    mongo.close()
    cnt = len(items)
    from_alias = 'Hush'
    reply_alias = 'Hush'
    reply_email = '*****@*****.**'
    # to = '*****@*****.**'
    to = '[email protected];[email protected];[email protected];[email protected];[email protected];[email protected]'
    print('*******')
    subject = '企名片日常融资事件'
    content = '<html>共<b>%d</b>起融资事件,请查看附件</html>' % cnt
    # NOTE(review): `file` shadows the py2 builtin of the same name.
    file = 'qmp_rz_day.xls'
    wb = xlwt.Workbook()
    ws = wb.add_sheet('A Work Sheet')
    # Header row.
    ws.write(0, 0, 'Product')
    ws.write(0, 1, 'Lunci')
    ws.write(0, 2, 'Date')
    ws.write(0, 3, 'Source')
    ws.write(0, 4, 'Jianjie')
    i = 1
    for item in items:
        product = item.get('product')
        lunci = item.get('lunci')
        # Date = item.get('Date')
        date = item.get('news_time')
        # date = Date + ' ' + date
        jianjie = item.get('weiyu').decode('utf-8')
        source = item.get('qmp_url').decode('utf-8')
        # Long URLs are written as plain text — presumably xlwt/Excel caps
        # HYPERLINK formula arguments at 255 chars; verify.
        if len(source) > 255:
            sources = source
        else:
            n = "HYPERLINK"
            sources = xlwt.Formula(n + '("%s";"%s")' % (source, source))
        ws.write(i, 0, product)
        ws.write(i, 1, lunci)
        ws.write(i, 2, date)
        ws.write(i, 3, sources)
        ws.write(i, 4, jianjie)
        i += 1
    wb.save(file)
    email_helper.send_mail_file(from_alias, reply_alias, reply_email, to, subject, content, file)
    print('done')
def method1(investorId, investorName):
    """Find companies that the investor's AMAC manager companies have invested
    in (via their gongshang records) and register every name not already known
    as an investor alias candidate for *investorId*."""
    # Map the investor's aliases to AMAC manager ('M') records.
    sql = ''' select iaa.amacId from investor_alias ia join investor_alias_amac iaa on iaa.investorAliasId=ia.id where (ia.active = 'Y' or ia.active is null) and (ia.verify != 'N' or ia.verify is null) and (iaa.active = 'Y' or iaa.active is null) and iaa.amacType='M' and ia.investorID=%s '''
    conn = db.connect_torndb_proxy()
    result = conn.query(sql, investorId)
    conn.close()
    mongo = db.connect_mongo()
    collection_amac = mongo.amac.manager
    collection_gongshang = mongo.info.gongshang
    amacIds = [ObjectId(i['amacId']) for i in result]
    managers = list(
        collection_amac.find({'_id': {
            '$in': amacIds
        }}, {'managerName': 1}))
    managerNames = [i['managerName'] for i in managers]
    gongshangs = list(
        collection_gongshang.find({'name': {
            '$in': managerNames
        }}))
    mongo.close()
    for g in gongshangs:
        # Fix: dict.has_key() is deprecated; use the `in` operator.
        if 'invests' in g:
            for invest in g['invests']:
                try:
                    # Normalize full-width parentheses to ASCII.
                    investName = invest['name'].replace("(", "(").replace(")", ")")
                except:
                    continue
                # NOTE(review): one DB connection per invest row is expensive;
                # consider hoisting the connection outside the loops.
                conn = db.connect_torndb_proxy()
                q = conn.query(
                    '''select * from investor_alias ia join investor i on i.id=ia.investorid where (ia.active = 'Y' or ia.active is null) and (i.active = 'Y' or i.active is null) and ia.name=%s''',
                    investName)
                conn.close()
                if len(q) == 0:
                    logger.info(
                        'insert %s into investorid:%s |method:-661 |managerName:%s',
                        investName, investorId, g['name'])
                    comment = u'%s的管理公司:%s,对外投资有该公司' % (investorName, g['name'])
                    add_alias_candidate(investorId, investName, comment)