def crawler_today2(sometime):  # crawl one specified day's data
    page_idx = 1
    step = 100
    end_time = datetime.datetime.strptime(sometime, "%Y-%m-%d")
    start_time = datetime.datetime.strptime(sometime, "%Y-%m-%d")
    start_time = start_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = end_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    print u"start_time,end_time:", start_time, end_time
    filters = {"beginTime": start_time, "endTime": end_time}
    logging.info(u"start time:%s \t end time:%s" % (start_time, end_time))
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):
        page_idx += 1
        business_idx = 1
        print u'res_page:', res_page
        for qrow in res_page:
            print "business idx: %s of %s " % (business_idx, step)
            business_idx += 1
            adr_adr.import_from_html(qrow)


def crawler_adr_fullidlist(sometime):  # crawl one day's reports into adr_full_id_list
    page_idx = 1
    step = 100
    end_time = datetime.datetime.strptime(sometime, "%Y-%m-%d")
    start_time = datetime.datetime.strptime(sometime, "%Y-%m-%d")
    start_time = start_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = end_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    filters = {"beginTime": start_time, "endTime": end_time}
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):
        page_idx += 1
        business_idx = 1
        for qrow in res_page:
            business_idx += 1
            show_id = qrow["report_id"]
            bianma = qrow["report_id2"]                  # report code
            fungible_name = qrow["personal_his"]         # unit reporting on behalf (agent)
            report_unit_name = qrow["report_unit_name"]  # reporting unit
            medic_list = qrow["general_name"]            # generic names, set of drugs used
            adr_list = qrow["adr_name"]                  # adverse reaction names
            data_source = qrow["data_source"]            # case source
            report_type = qrow["new_flag"]               # report type
            StateReportDate = qrow["report_date"]        # time received by the national center
            insert_sql = (u'insert into adr_full_id_list(report_id,report_id2,personal_his,'
                          u'report_unit_name,general_name,adr_name,data_source,new_flag,report_date)'
                          u'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            mdrsql.mdr_insert_alone(insert_sql, [show_id, bianma, fungible_name, report_unit_name,
                                                 medic_list, adr_list, data_source, report_type,
                                                 StateReportDate])


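# Illustration only: the real insert helper is mdrsql.mdr_insert_alone, defined
# elsewhere in this project. A minimal sketch of a single-row parameterized
# insert of the same shape, assuming MySQLdb as the driver and using placeholder
# connection settings, could look like the unused function below.
def _mdr_insert_alone_sketch(sql, params):
    import MySQLdb  # deferred import so this illustrative helper never affects module load
    conn = MySQLdb.connect(host="127.0.0.1", user="user", passwd="secret",
                           db="adr", charset="utf8")
    try:
        cur = conn.cursor()
        cur.execute(sql, params)  # the driver fills and escapes each %s placeholder
        conn.commit()
    finally:
        conn.close()

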
def crawler_last_month(username, password):  # crawl last month's data
    logging.info(u"Starting to crawl last month's data!")
    logincounter = 6
    while logincounter:
        print logincounter
        loginTag = login2.login()
        print u"login Tag(last_month):", loginTag
        if loginTag:
            break
        elif logincounter == 1:
            # print u"6 login(last_month) attempts failed, retry this operation"
            print u"This task has ended"
            return
        else:
            cookieManager.clear()
            logincounter = logincounter - 1
            time.sleep(8)
    page_idx = 1
    step = 100
    cur_time = datetime.datetime.today()
    year = cur_time.year
    month = cur_time.month - 1
    if month == 0:
        month = 12
        year -= 1
    start_time = datetime.datetime(year, month, 1)
    end_time = datetime.datetime(cur_time.year, cur_time.month, 1) - datetime.timedelta(days=1)
    start_time = start_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = end_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    filters = {"beginTime": start_time, "endTime": end_time}
    logging.info(u"start time:%s \t end time:%s" % (start_time, end_time))
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):  # one page at a time, 10 records per page
        print u"page: ", page_idx
        page_idx += 1
        business_idx = 1
        for qrow in res_page:  # one case at a time
            print u"business idx: %s of %s " % (business_idx, step)
            business_idx += 1
            adr_adr.import_from_html(qrow)


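# A minimal sketch (not part of the crawler) that reproduces the previous-month
# window computed in crawler_last_month above. Worked example with illustrative
# values: for a "today" of 2021-01-15, month - 1 is 0, so it rolls over to
# December 2020 and the window is 2020-12-01 through 2020-12-31.
def _previous_month_window_sketch(cur_time):
    year, month = cur_time.year, cur_time.month - 1
    if month == 0:  # January rolls back to December of the prior year
        month, year = 12, year - 1
    start = datetime.datetime(year, month, 1)
    # the last day of the previous month is one day before the first of this month
    end = datetime.datetime(cur_time.year, cur_time.month, 1) - datetime.timedelta(days=1)
    return start, end

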
def crawler_by_time(username, password, start_date, end_date):  # crawl data for a specified date range
    logging.info("Starting to crawl data for the specified date range!")
    print u"Logging in..."
    logincounter = 6
    while logincounter:
        print logincounter
        loginTag = login2.login()
        print u"login Tag(zoe3):", loginTag
        if loginTag:
            break
        elif logincounter == 1:
            # print u"6 login(main3) attempts failed, retry this operation"
            print u"This task has ended"
            return
        else:
            cookieManager.clear()
            logincounter = logincounter - 1
            time.sleep(8)
    page_idx = 1
    step = 100
    end_time = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    start_time = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    start_time = start_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = end_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    filters = {"beginTime": start_time, "endTime": end_time}
    logging.info("start time:%s \t end time:%s" % (start_time, end_time))
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):
        print "page: ", page_idx
        page_idx += 1
        business_idx = 1
        for qrow in res_page:  # one case at a time
            print "business idx: %s of %s " % (business_idx, step)
            business_idx += 1
            try:
                delete_by_bianma(qrow["report_id2"])  # if the delete succeeds, insert the record
                adr_adr.import_from_html(qrow)
            except Exception as err:
                print err


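# Hypothetical usage of crawler_by_time; the credentials and dates below are
# placeholders, not values from this project. Because each case is deleted by
# report_id2 before being re-imported, re-crawling an already stored range
# should not create duplicate rows.
if __name__ == "__main__":
    crawler_by_time("someuser", "somepassword", "2019-01-01", "2019-01-31")

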
def crawler_first_half_year(username, password):  # crawl the first half of the current year
    logging.info(u"Starting to crawl first-half-year data!")
    logincounter = 6
    while logincounter:
        print logincounter
        loginTag = login2.login()
        print u"login Tag(first_half_year):", loginTag
        if loginTag:
            break
        elif logincounter == 1:
            # print u"6 login(first_half_year) attempts failed, retry this operation"
            print u"This task has ended"
            return
        else:
            cookieManager.clear()
            logincounter = logincounter - 1
            time.sleep(8)
    page_idx = 1
    step = 100
    year = datetime.datetime.today().year
    last_year_start = datetime.date(year, 1, 1)
    last_year_end = datetime.date(year, 6, 30)
    start_time = last_year_start.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = last_year_end.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    filters = {"beginTime": start_time, "endTime": end_time}
    logging.info(u"start time:%s \t end time:%s" % (start_time, end_time))
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):
        print u"page: ", page_idx
        page_idx += 1
        business_idx = 1
        for qrow in res_page:
            print u"business idx: %s of %s " % (business_idx, step)
            business_idx += 1
            adr_adr.import_from_html(qrow)


def crawler_last_year(username, password):  # crawl the previous year's data
    logging.info(u"Starting to crawl last year's data!")
    logincounter = 6
    while logincounter:
        print logincounter
        loginTag = login2.login()
        print u"login Tag(last_year):", loginTag
        if loginTag:
            break
        elif logincounter == 1:
            # print u"6 login(last_year) attempts failed, retry this operation"
            print u"This task has ended"
            return
        else:
            cookieManager.clear()
            logincounter = logincounter - 1
            time.sleep(8)
    page_idx = 1
    step = 100
    year = datetime.datetime.today().year - 1
    last_year_start = datetime.date(year, 1, 1)
    last_year_end = datetime.date(year, 12, 31)
    start_time = last_year_start.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = last_year_end.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    filters = {"beginTime": start_time, "endTime": end_time}
    logging.info(u"start time:%s \t end time:%s" % (start_time, end_time))
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):  # one page at a time, 10 records per page
        print "page: ", page_idx
        page_idx += 1
        business_idx = 1
        for qrow in res_page:  # one case at a time
            print u"business idx: %s of %s " % (business_idx, step)
            business_idx += 1
            adr_adr.import_from_html(qrow)


def crawler_today():  # crawl today's data
    logging.info(u"Starting to crawl today's data!")
    logincounter = 6
    while logincounter:
        print logincounter
        loginTag = login2.login()
        print u"login Tag(ADR Today):", loginTag
        if loginTag:
            break
        elif logincounter == 1:
            # print u"6 login(auto today) attempts failed, retry this operation"
            print u"This task has ended"
            return
        else:
            cookieManager.clear()
            logincounter = logincounter - 1
            time.sleep(8)
    page_idx = 1
    step = 100
    start_time = datetime.datetime.today()
    end_time = start_time + datetime.timedelta(days=1)
    start_time = start_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    end_time = end_time.strftime("%a %b %d %Y 00:00:00 GMT+0800")
    filters = {"beginTime": start_time, "endTime": end_time}
    logging.info(u"start time:%s \t end time:%s" % (start_time, end_time))
    start_pos = 0
    for res_page in login_new_adr.get_page_list(filters, start_pos, step):
        page_idx += 1
        business_idx = 1
        for qrow in res_page:
            print "business idx: %s of %s " % (business_idx, step)
            business_idx += 1
            adr_adr.import_from_html(qrow)


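# Every crawler above repeats the same 6-attempt login loop. The helper below is
# only a sketch of how that pattern could be shared; it reuses the login2,
# cookieManager and time calls already used in this module, and the name
# _try_login_sketch is hypothetical (not an existing function in this project).
def _try_login_sketch(tag_label, attempts=6, pause=8):
    while attempts:
        loginTag = login2.login()
        print u"login Tag(%s):" % tag_label, loginTag
        if loginTag:
            return True
        cookieManager.clear()
        attempts -= 1
        if attempts:
            time.sleep(pause)  # wait before the next attempt, as the loops above do
    print u"This task has ended"
    return False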