def getHomePage(self):
    # Crawl the Sina Finance listing page and append each key|value pair to the output file.
    url = "http://roll.finance.sina.com.cn/finance/zq1/ssgs/index_1.shtml"
    saveFile = TimeUtil.prefix() + "home.txt"
    FileUtil.put(saveFile, '')
    res = getPage(url)
    for (k, v) in res.items():
        FileUtil.appendline(saveFile, k + '|' + v)
class BaseConfig:
    # Patent crawl speed: 80 records per request; valid range is (0, 89]
    CRAWLER_SPEED = "80"
    FILE_NAME = "output\\专利.xls"
    LOG_FILE_NAME = "log\\PatentCrawler{0}.log".format(
        TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
    # Inventor name check
    CHECK_INVENTOR = False
    # Applicant name check
    CHECK_PROPOSER = True
class Config:
    BROSWER_NAME = "PhantomJs"
    LOG_FILE_NAME = "log\\PatentCrawler{0}.log".format(
        TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
    FILE_NAME = "output\\专利.xls"
    REJECT_WAY = "您的操作太过频繁,已被网站限制操作\n应对方式:\n(1)重启路由器;\n(2)拔掉网线重新连接;\n(3)重启电脑\n(4)通知管理员采取应对办法"
    AND_STRING = "………………………………………………"

    @staticmethod
    def writeLog(strLog):
        FileUtil(Config.LOG_FILE_NAME, "a+").writeLine(
            TimeUtil.getFormatTime("%Y/%m/%d-%H:%M:%S") + Config.AND_STRING + strLog)

    @staticmethod
    def writeException(strException):
        FileUtil(Config.LOG_FILE_NAME, "a+").writeLine(str(strException))
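# Hedged usage sketch (not part of the project) showing how the logging helpers
# above might be called; the message text below is illustrative only and assumes
# the FileUtil/TimeUtil helpers from this repository are importable.
if __name__ == '__main__':
    Config.writeLog("program started")
    try:
        raise ValueError("example failure")
    except Exception as e:
        Config.writeException(e)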
def getDetailPage(self):
    # Read back the saved key|url pairs and print each detail-page URL.
    srcFile = TimeUtil.prefix() + ".txt"
    content = FileUtil.readlines(srcFile)
    for line in content:
        url = line.split('|')[1]
        print(url)
def writeLog(strLog):
    FileUtil(Config.LOG_FILE_NAME, "a+").writeLine(
        TimeUtil.getFormatTime("%Y/%m/%d-%H:%M:%S") + Config.AND_STRING + strLog)
def run(debug, env, username, password, correlate_data_params, enrich_data_params):
    correlate_out_dir = correlate_data_params["correlate_out_dir"]
    correlate_out_archive_dir = correlate_data_params["correlate_out_archive_dir"]
    enrich_in_dir = enrich_data_params["enrich_in_dir"]
    enrich_in_archive_dir = enrich_data_params["enrich_in_archive_dir"]
    enrich_out_dir = enrich_data_params["enrich_out_dir"]

    # Copy correlate output to the enrich input directory and
    # move correlate output to its archive directory.
    FileUtil.copy_and_move_files(correlate_out_dir, enrich_in_dir,
                                 correlate_out_archive_dir, "*.csv")

    wj_api = WorkjamAPI(debug, env, username, password)

    # Current time in milliseconds
    now_timestamp = TimeUtil.get_current_milli_time()
    enrich_filename = 'Enrich_' + str(now_timestamp) + ".csv"

    print("\nLoading Data to be enriched from Filesystem...")
    df_enrich = FileUtil.get_df_from_csv_dir(enrich_in_dir, "*.csv")
    print("Complete. Count: " + str(df_enrich.shape[0]))

    # Write the header row to the output file.
    response_user_header = wj_api.get_user_details(True, '', '')
    response_event_header = wj_api.get_event_details(True, '', '', '')
    FileUtil.write_to_file(
        enrich_out_dir + enrich_filename,
        'loggedin_user,company_id,query_datetime,apply_datetime,number_of_open_shifts,location_id,event_id,'
        + response_user_header + ',' + response_event_header + ',applied\n')

    print("\nEnriching User and Event info...")
    num_records_written_to_file = 0
    for index, row in df_enrich.iterrows():
        loggedinuser = row['loggedinuser']
        companyid = row['companyid']
        query_datetime = row['query_datetime']
        apply_datetime = row['apply_datetime']
        numberofopenshifts = row['numberofopenshifts']
        locationid = row['locationid']
        eventid = row['eventid']
        applied = row['applied']
        try:
            # Get info for the event in context
            response_event_csv = wj_api.get_event_details(
                False, companyid, locationid, eventid)
            # Get info for the user in context
            response_user_csv = wj_api.get_user_details(
                False, companyid, loggedinuser)
            # Append the enriched record to the timestamped output file
            FileUtil.append_to_file(
                enrich_out_dir + enrich_filename,
                str(loggedinuser) + ',' + str(companyid) + ',' +
                str(query_datetime) + ',' + str(apply_datetime) + ',' +
                str(numberofopenshifts) + ',' + str(locationid) + ',' +
                str(eventid) + ',' + response_user_csv + ',' +
                response_event_csv + ',' + str(applied) + '\n')
            num_records_written_to_file += 1
        except Exception as e:
            print(e)

    print("Complete. Found: {} Written: {}\n".format(
        str(df_enrich.shape[0]), num_records_written_to_file))

    # Move enrich input files to the enrich input archive
    FileUtil.move_files(enrich_in_dir, enrich_in_archive_dir, "*.csv")
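# Hedged usage sketch for the enrich step above. The directory layout,
# environment name, and credentials below are illustrative assumptions,
# not values taken from the project configuration.
if __name__ == '__main__':
    correlate_data_params = {
        "correlate_out_dir": "data/correlate/out/",
        "correlate_out_archive_dir": "data/correlate/out_archive/",
    }
    enrich_data_params = {
        "enrich_in_dir": "data/enrich/in/",
        "enrich_in_archive_dir": "data/enrich/in_archive/",
        "enrich_out_dir": "data/enrich/out/",
    }
    run(debug=False, env="staging", username="<workjam-user>",
        password="<workjam-password>",
        correlate_data_params=correlate_data_params,
        enrich_data_params=enrich_data_params)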
import configparser
import os

import click

from util.TimeUtil import TimeUtil

"""
Path settings
"""
# Project root directory. Note that it is resolved from the module that first
# references this variable; the project entry point is main, so take care here
# if that changes.
BASE_PATH = os.path.split(os.path.split(__file__)[0])[0]
# Output directory
OUTPUT_PATH = os.path.join(BASE_PATH, 'output')
# Output group; grouped by YYYYMMDD_HHMMSS by default
OUTPUT_GROUP_PATH = os.path.join(OUTPUT_PATH, TimeUtil.getFormatTime('%Y%m%d_%H%M%S'))
# Path of the database that stores the crawled data
DATABASE_NAME = os.path.join(OUTPUT_GROUP_PATH, 'Patent.db')
# Path of the generated Excel file
EXCEL_NAME = os.path.join(OUTPUT_GROUP_PATH, '专利.xlsx')
# Path of the generated charts page
CHARTS_NAME = os.path.join(OUTPUT_GROUP_PATH, 'charts.html')
# Log file name
LOG_FILENAME = os.path.join(OUTPUT_GROUP_PATH, "PatentCrawler.log")
# Path of the captcha recognition model
CAPTCHA_MODEL_NAME = os.path.join(BASE_PATH, 'res', 'captcha', 'sipoknn.job')

"""
Basic settings
"""
# Whether to use a proxy
"""
Created on 2017/3/19

@author: will4906

The paths and file names below may be modified to suit the user; they are used
throughout the project.
"""
import os

from util.TimeUtil import TimeUtil

# Project root directory. Note that it is resolved by the module that first
# references this variable; the project entry point is main, so take care here
# if that changes.
BASE_PATH = os.getcwd() + os.sep
# Output directory
OUTPUT_PATH = BASE_PATH + 'output'
# Output group; grouped by YYYYMMDD_HHMMSS by default
OUTPUT_GROUP_PATH = OUTPUT_PATH + os.sep + TimeUtil.getFormatTime('%Y%m%d_%H%M%S')
# Path of the database that stores the crawled data
DATABASE_NAME = OUTPUT_PATH + os.sep + 'Patent.db'
# Path of the generated Excel file
EXCEL_NAME = OUTPUT_GROUP_PATH + os.sep + '专利.xlsx'
# Path of the generated diagram page
DIAGRAM_NAME = OUTPUT_PATH + os.sep + 'diagram.html'
# Log output directory
LOG_PATH = BASE_PATH + 'log'
# Log file name
LOG_FILENAME = LOG_PATH + os.sep + "PatentCrawler{0}.log".format(
    TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
# Template directory; changing it is not recommended
TEMPLATE_PATH = BASE_PATH + 'res' + os.sep + 'template'
# Template file path; it may be extended or changed, so modifying it is not recommended
TEMPLATE_NAME = TEMPLATE_PATH + os.sep + 'template.html'
def run(num_days, accesId, accessKey, env, get_data_params,
        remove_timestamp_files=False):
    sumologic_timestamp_dir = get_data_params["sumologic_timestamp_dir"]
    sumologic_out_dir = get_data_params["sumologic_out_dir"]

    # Current time in milliseconds
    now_timestamp = TimeUtil.get_current_milli_time()

    requests_timestamp_filename = 'Requests.timestamp'
    apply_timestamp_filename = 'Apply.timestamp'
    requests_filename = 'Requests_' + str(now_timestamp) + ".csv"
    apply_filename = 'Apply_' + str(now_timestamp) + ".csv"

    # Temporary: remove the timestamp files if requested
    if remove_timestamp_files:
        FileUtil.delete_if_exist(sumologic_timestamp_dir + requests_timestamp_filename)
        FileUtil.delete_if_exist(sumologic_timestamp_dir + apply_timestamp_filename)

    # fromTime for open shift requests
    past_requests_timestamp = FileUtil.read_timestamp_or_deafult(
        sumologic_timestamp_dir + requests_timestamp_filename,
        TimeUtil.get_past_milli_time(num_days))
    # fromTime for open shift apply
    past_apply_timestamp = FileUtil.read_timestamp_or_deafult(
        sumologic_timestamp_dir + apply_timestamp_filename,
        TimeUtil.get_past_milli_time(num_days))

    # Get open shift requests and write them to file
    print("\nDownloading Open Shift Requests from SumoLogic...")
    open_shift_requests = OpenShiftRequestsAPI(accesId, accessKey)
    open_shift_requests.get_sumologic_content(past_requests_timestamp,
                                              now_timestamp, 10000)
    open_shift_requests.write_response_to_file(sumologic_out_dir + requests_filename)
    print("Complete. Results written to " + sumologic_out_dir + requests_filename)

    # Get open shift applies and write them to file
    print("\nDownloading Apply to Open Shifts from SumoLogic...")
    open_shift_apply = OpenShiftApplyAPI(accesId, accessKey)
    open_shift_apply.get_sumologic_content(past_apply_timestamp,
                                           now_timestamp, 10000)
    open_shift_apply.write_response_to_file(sumologic_out_dir + apply_filename)
    print("Complete. Results written to {}: \n".format(sumologic_out_dir + apply_filename))

    print("\nUpdating time stamp files.")
    # Persist the current timestamp for the next incremental run
    FileUtil.write_timestamp(
        sumologic_timestamp_dir + requests_timestamp_filename, now_timestamp)
    FileUtil.write_timestamp(
        sumologic_timestamp_dir + apply_timestamp_filename, now_timestamp)
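# Hedged usage sketch for the SumoLogic download step above; the directory
# names and credentials below are placeholders, not project defaults.
if __name__ == '__main__':
    get_data_params = {
        "sumologic_timestamp_dir": "data/sumologic/timestamps/",
        "sumologic_out_dir": "data/sumologic/out/",
    }
    run(num_days=7, accesId="<sumo-access-id>", accessKey="<sumo-access-key>",
        env="prod", get_data_params=get_data_params,
        remove_timestamp_files=False)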
def run(debug, get_data_params, correlate_data_params):
    sumologic_out_dir = get_data_params["sumologic_out_dir"]
    sumologic_out_archive_dir = get_data_params["sumologic_out_archive_dir"]
    correlate_in_current_cycle_dir = correlate_data_params["correlate_in_current_cycle_dir"]
    correlate_in_previous_cycle_dir = correlate_data_params["correlate_in_previous_cycle_dir"]
    correlate_in_archive_dir = correlate_data_params["correlate_in_archive_dir"]
    correlate_out_dir = correlate_data_params["correlate_out_dir"]

    # Copy sumologic output to the correlate "current cycle" input and
    # move sumologic output to its archive directory.
    FileUtil.copy_and_move_files(sumologic_out_dir, correlate_in_current_cycle_dir,
                                 sumologic_out_archive_dir, "*.csv")

    # Current time in milliseconds
    now_timestamp = TimeUtil.get_current_milli_time()
    correlate_filename = 'Correlate_' + str(now_timestamp) + ".csv"

    print("\nLoading Open Shift Requests from Filesystem...")
    # Correlate applies with requests from the current and previous cycles
    df_requests = FileUtil.get_df_from_csv_dirs(correlate_in_current_cycle_dir,
                                                correlate_in_previous_cycle_dir,
                                                "Requests*")
    print("Complete. Count: " + str(df_requests.shape[0]))
    if debug:
        for index, row in df_requests.iterrows():
            print(row)

    print("\nLoading Apply to Open Shifts from Filesystem...")
    df_apply = FileUtil.get_df_from_csv_dir(correlate_in_current_cycle_dir, "Apply*")
    print("Complete. Count: " + str(df_apply.shape[0]))

    print("\nCorrelating Apply Open Shifts with Open Shifts Requests... ")
    fields = ['loggedinuser', 'companyid', 'query_datetime', 'apply_datetime',
              'numberofopenshifts', 'locationid', 'eventid', 'applied']
    CorrelateData.add_header(correlate_out_dir + correlate_filename, fields)

    for index, row in df_apply.iterrows():
        apply_datetime = row['datetime']
        loggedinuser = row['loggedinuser']
        companyid = row['companyid']
        locationid = row['locationid']
        eventid = row['eventid']

        # The most recent matching request before the apply, for the same user
        # and company, whose event/location list contains this event.
        df_filtered = df_requests.loc[
            (df_requests['loggedinuser'] == loggedinuser) &
            (df_requests['companyid'] == companyid) &
            (df_requests['datetime'] < apply_datetime) &
            (df_requests['eventandlocationids'].str.contains(str(eventid) + "," + str(locationid)))
        ].drop_duplicates().sort_values(by=['datetime'], ascending=False).head(1)

        if df_filtered.shape[0] > 0:
            # First replace ', ' with '|' and then split.
            # Example text: (3714cb1e-4839-4d8c-818e-9d01c655cd86,328038), (d87a2bb7-05e0-465e-8b6c-aa18d89a9c9f,328038), (e7bee5c5-8f4e-457f-95e7-b1ec82e8ab21,328038), (f04d14c1-68c3-4dda-8698-3d95eb3a4b9d,328038)
            events_and_locations = df_filtered.iloc[0]['eventandlocationids'].replace(', ', '|').split('|')
            for event_location in events_and_locations:
                # Strip the parentheses and split the text on ','.
                # Example text: (3714cb1e-4839-4d8c-818e-9d01c655cd86,328038)
                eventid_in_request, locationid_in_request = event_location.replace('(', '').replace(')', '').split(',')
                applied = False
                if str(eventid) == str(eventid_in_request) and str(locationid) == str(locationid_in_request):
                    applied = True
                row = {'loggedinuser': loggedinuser,
                       'companyid': companyid,
                       'query_datetime': df_filtered.iloc[0]['datetime'],
                       'apply_datetime': apply_datetime,
                       'numberofopenshifts': df_filtered.iloc[0]['numberofopenshifts'],
                       'locationid': locationid_in_request,
                       'eventid': eventid_in_request,
                       'applied': applied}
                CorrelateData.add_row(correlate_out_dir + correlate_filename, fields, row)

    print("Complete. Results written to: {} \n".format(correlate_out_dir + correlate_filename))

    # Move correlate in previous cycle to correlate in archive
    FileUtil.move_files(correlate_in_previous_cycle_dir, correlate_in_archive_dir, "*.csv")
    # Move correlate in current cycle (Apply) to correlate in archive
    FileUtil.move_files(correlate_in_current_cycle_dir, correlate_in_archive_dir, "Apply*")
    # Move correlate in current cycle (Requests) to correlate in previous cycle
    FileUtil.move_files(correlate_in_current_cycle_dir, correlate_in_previous_cycle_dir, "Requests*")
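# A minimal, self-contained sketch of the 'eventandlocationids' parsing used in
# the correlation step above. parse_event_locations is a hypothetical helper
# added here for illustration only; it assumes the same raw format:
# "(event_id,location_id), (event_id,location_id), ..."
def parse_event_locations(raw):
    # Replace ', ' with '|', split into pairs, then strip parentheses and
    # split each pair on ',' into (event_id, location_id).
    pairs = raw.replace(', ', '|').split('|')
    return [tuple(p.replace('(', '').replace(')', '').split(','))
            for p in pairs]


if __name__ == '__main__':
    sample = ("(3714cb1e-4839-4d8c-818e-9d01c655cd86,328038), "
              "(d87a2bb7-05e0-465e-8b6c-aa18d89a9c9f,328038)")
    print(parse_event_locations(sample))
    # [('3714cb1e-4839-4d8c-818e-9d01c655cd86', '328038'),
    #  ('d87a2bb7-05e0-465e-8b6c-aa18d89a9c9f', '328038')]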
import configparser
import os

import click

from util.TimeUtil import TimeUtil

"""
Path settings
"""
# Project root directory. Note that it is resolved from the module that first
# references this variable; the project entry point is main, so take care here
# if that changes.
BASE_PATH = os.path.split(os.path.split(__file__)[0])[0]
# Output directory
OUTPUT_PATH = os.path.join(BASE_PATH, 'output')
# Output group; grouped by YYYYMMDD_HHMMSS by default
OUTPUT_GROUP_PATH = os.path.join(OUTPUT_PATH, TimeUtil.getFormatTime('%Y%m%d_%H%M%S'))
# Path of the database that stores the crawled data
DATABASE_NAME = os.path.join(OUTPUT_GROUP_PATH, 'Patent.db')
# Path of the generated Excel file
EXCEL_NAME = os.path.join(OUTPUT_GROUP_PATH, '专利.xlsx')
# Path of the generated charts page
CHARTS_NAME = os.path.join(OUTPUT_GROUP_PATH, 'charts.html')
# Log file name
LOG_FILENAME = os.path.join(OUTPUT_GROUP_PATH, "PatentCrawler.log")
# Path of the captcha recognition model
CAPTCHA_MODEL_NAME = os.path.join(BASE_PATH, 'res', 'captcha', 'sipo3.job')
# Path of the appreciation (donation) HTML page
AD_PATH = os.path.join(BASE_PATH, 'res', 'advertisement', 'ad.html')

"""
Basic settings
"""
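# Hedged sketch of how another module might consume the path constants above;
# the module name 'config' and the use of sqlite3 here are assumptions, not
# part of the project.
import os
import sqlite3

from config import DATABASE_NAME, OUTPUT_GROUP_PATH

os.makedirs(OUTPUT_GROUP_PATH, exist_ok=True)  # the config module only builds paths
conn = sqlite3.connect(DATABASE_NAME)          # open (or create) the patent database
conn.close()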
def init_excel_config():
    title_list = [
        "专利类型", "专利名称", "法律状态", "法律状态最后修改日期",
        "申请公布日/授权公告日", "申请号", "申请日", "申请人/专利权人", "发明人"
    ]
    editor = ExcelUtil(Config.FILE_NAME).edit()
    sh = editor.getSheet(0)
    for index, each in enumerate(title_list):
        sh.write(0, index, each)
    editor.commit()
    return


if __name__ == '__main__':
    initProgress()
    # Very important: raise Python's recursion limit, otherwise the crawler
    # blows up after roughly 900 recursive calls.
    sys.setrecursionlimit(1000000)  # e.g. set to one million here
    startDate = input("请输入公布日开始日期,如{0}:".format(
        TimeUtil.getFormatTime("%Y-%m-%d")))
    Config.writeLog("程序启动,输入的公布开始日期为{0}".format(startDate))
    init_excel_config()
    progress = ProgressController(Config.BROSWER_NAME)
    Config.writeLog("启动{0}浏览器".format(Config.BROSWER_NAME))
    queryInfo = progress.getQueryInfo()
    queryInfo.setStartDate(startDate)
    progress.startProgress()
    # print(excel)
""" Created on 2017/3/19 @author: will4906 一下地址、文件名可根据用户使用自行修改,工程所有地址将会采用。 """ import os from util.TimeUtil import TimeUtil # 工程根目录,注意此处以初次调用这个变量的元素为准,工程起始目录定位在main,若有修改请注意这个位置 BASE_PATH = os.getcwd() + os.sep # 输出目录 OUTPUT_PATH = BASE_PATH + 'output' # 输出分组,默认按年月日_时分秒分组 OUTPUT_GROUP_PATH = OUTPUT_PATH + os.sep + TimeUtil.getFormatTime('%Y%m%d_%H%M%S') # 采集存放数据库地址 DATABASE_NAME = OUTPUT_GROUP_PATH + os.sep + 'Patent.db' # 生成excel地址 EXCEL_NAME = OUTPUT_GROUP_PATH + os.sep + '专利.xlsx' # 生成图表地址 DIAGRAM_NAME = OUTPUT_GROUP_PATH + os.sep + 'diagram.html' # log输出目录 LOG_PATH = BASE_PATH + 'log' # log文件名 LOG_FILENAME = LOG_PATH + os.sep + "PatentCrawler{0}.log".format(TimeUtil.getFormatTime("%Y%m%d_%H%M%S")) # 模板文件目录,不建议修改 TEMPLATE_PATH = BASE_PATH + 'res' + os.sep + 'template' # 模板文件地址,有可能增加和改变,不建议修改 TEMPLATE_NAME = TEMPLATE_PATH + os.sep + 'template.html'