Code example #1
File: main.py Project: haluomao/crawler
	def getHomePage(self):
		url = "http://roll.finance.sina.com.cn/finance/zq1/ssgs/index_1.shtml"
		saveFile = TimeUtil.prefix() + "home.txt"
		FileUtil.put(saveFile, '')
		res = getPage(url)
		for k, v in res.items():
			FileUtil.appendline(saveFile, k + '|' + v)
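
FileUtil and TimeUtil are project helpers that this page never shows. Below is a minimal sketch of the calls used in examples #1 and #4 (TimeUtil.prefix, FileUtil.put, FileUtil.appendline, FileUtil.readlines), assuming they are thin wrappers over open() and time.strftime; the actual haluomao/crawler implementation may differ:

import time


class TimeUtil:
    @staticmethod
    def prefix():
        # Date prefix used to build per-day file names, e.g. "20170319".
        return time.strftime("%Y%m%d")


class FileUtil:
    @staticmethod
    def put(path, content):
        # Create or truncate the file and write the given content.
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)

    @staticmethod
    def appendline(path, line):
        # Append a single line, adding the trailing newline.
        with open(path, "a", encoding="utf-8") as f:
            f.write(line + "\n")

    @staticmethod
    def readlines(path):
        # Return the file's lines with trailing newlines stripped.
        with open(path, encoding="utf-8") as f:
            return [line.rstrip("\n") for line in f]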
Code example #2
class BaseConfig:
    # Patent crawl speed: patents per request; the valid range is (0, 89]
    CRAWLER_SPEED = "80"
    FILE_NAME = r"output\专利.xls"
    LOG_FILE_NAME = r"log\PatentCrawler{0}.log".format(
        TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))

    # Check inventor names
    CHECK_INVENTOR = False
    # Check applicant names
    CHECK_PROPOSER = True
Code example #3
class Config:
    BROSWER_NAME = "PhantomJs"
    LOG_FILE_NAME = r"log\PatentCrawler{0}.log".format(TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
    FILE_NAME = r"output\专利.xls"
    # Rate-limit message: "Your operations are too frequent and the site has
    # blocked them. Remedies: (1) restart the router; (2) unplug the network
    # cable and reconnect; (3) restart the computer; (4) notify the
    # administrator to take countermeasures."
    REJECT_WAY = "您的操作太过频繁,已被网站限制操作\n应对方式:\n(1)重启路由器;\n(2)拔掉网线重新连接;\n(3)重启电脑\n(4)通知管理员采取应对办法"
    AND_STRING = "………………………………………………"

    @staticmethod
    def writeLog(strLog):
        FileUtil(Config.LOG_FILE_NAME, "a+").writeLine(
            TimeUtil.getFormatTime("%Y/%m/%d-%H:%M:%S") + Config.AND_STRING + strLog)

    @staticmethod
    def writeException(strException):
        FileUtil(Config.LOG_FILE_NAME, "a+").writeLine(str(strException))
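
Examples #2 and #3 also construct FileUtil with a path and a mode and call writeLine on the instance, and format timestamps through TimeUtil.getFormatTime. A plausible sketch of that API follows; this is an assumption, not the actual project source:

import time


class TimeUtil:
    @staticmethod
    def getFormatTime(fmt):
        # Format the current local time, e.g. with "%Y%m%d_%H%M%S".
        return time.strftime(fmt)


class FileUtil:
    def __init__(self, path, mode):
        self.path = path
        self.mode = mode  # e.g. "a+" appends, creating the file if needed

    def writeLine(self, line):
        # Open, append one line, and close on every call.
        with open(self.path, self.mode, encoding="utf-8") as f:
            f.write(line + "\n")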
Code example #4
File: main.py Project: haluomao/crawler
	def getDetailPage(self):
		srcFile = TimeUtil.prefix() + ".txt"
		content = FileUtil.readlines(srcFile)
		for line in content:
			url = line.split('|')[1]
			print(url)
Code example #5
    def writeLog(strLog):
        FileUtil(Config.LOG_FILE_NAME, "a+").writeLine(
            TimeUtil.getFormatTime("%Y/%m/%d-%H:%M:%S") + Config.AND_STRING + strLog)
Code example #6
    def run(debug, env, username, password, correlate_data_params,
            enrich_data_params):

        correlate_out_dir = correlate_data_params["correlate_out_dir"]
        correlate_out_archive_dir = correlate_data_params[
            "correlate_out_archive_dir"]
        enrich_in_dir = enrich_data_params["enrich_in_dir"]
        enrich_in_archive_dir = enrich_data_params["enrich_in_archive_dir"]
        enrich_out_dir = enrich_data_params["enrich_out_dir"]

        # copy correlate data out to enrich in
        # move correlate out to correlate out archive
        FileUtil.copy_and_move_files(correlate_out_dir, enrich_in_dir,
                                     correlate_out_archive_dir, "*.csv")

        wj_api = WorkjamAPI(debug, env, username, password)

        # now in milliseconds
        now_timestamp = TimeUtil.get_current_milli_time()
        enrich_filename = 'Enrich_' + str(now_timestamp) + ".csv"

        print("\nLoading Data to be enriched from Filesystem...")
        df_enrich = FileUtil.get_df_from_csv_dir(enrich_in_dir, "*.csv")
        print("Complete. Count: " + str(df_enrich.shape[0]))

        # write header to the file

        response_user_header = wj_api.get_user_details(True, '', '')
        response_event_header = wj_api.get_event_details(True, '', '', '')

        FileUtil.write_to_file(
            enrich_out_dir + enrich_filename,
            'loggedin_user,company_id,query_datetime,apply_datetime,number_of_open_shifts,location_id,event_id,'
            + response_user_header + ',' + response_event_header +
            ',applied\n')

        print("\nEnriching User and Event info...")

        num_records_written_to_file = 0

        for index, row in df_enrich.iterrows():
            loggedinuser = row['loggedinuser']
            companyid = row['companyid']
            query_datetime = row['query_datetime']
            apply_datetime = row['apply_datetime']
            numberofopenshifts = row['numberofopenshifts']
            locationid = row['locationid']
            eventid = row['eventid']
            applied = row['applied']

            try:
                # Get Info for the Event in context
                response_event_csv = wj_api.get_event_details(
                    False, companyid, locationid, eventid)

                # Get Info for the User in context
                response_user_csv = wj_api.get_user_details(
                    False, companyid, loggedinuser)

                # write enriched data to the out dir with timestamp
                FileUtil.append_to_file(
                    enrich_out_dir + enrich_filename,
                    str(loggedinuser) + ',' + str(companyid) + ',' +
                    str(query_datetime) + ',' + str(apply_datetime) + ',' +
                    str(numberofopenshifts) + ',' + str(locationid) + ',' +
                    str(eventid) + ',' + response_user_csv + ',' +
                    response_event_csv + ',' + str(applied) + '\n')

                num_records_written_to_file += 1

            except Exception as e:
                print(e)

        print("Complete. Found: {} Written: {}\n".format(
            str(df_enrich.shape[0]), num_records_written_to_file))

        # move enrich in to enrich in archive
        FileUtil.move_files(enrich_in_dir, enrich_in_archive_dir, "*.csv")
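
This step relies on FileUtil.copy_and_move_files and FileUtil.move_files to shuttle CSVs between the pipeline's stage directories. One plausible reading, sketched with glob and shutil under the assumption that both helpers select files by glob pattern (the Corvid implementation is not shown here):

import glob
import os
import shutil


class FileUtil:
    @staticmethod
    def move_files(src_dir, dst_dir, pattern):
        # Move every file matching pattern from src_dir into dst_dir.
        for path in glob.glob(os.path.join(src_dir, pattern)):
            shutil.move(path, os.path.join(dst_dir, os.path.basename(path)))

    @staticmethod
    def copy_and_move_files(src_dir, copy_dir, archive_dir, pattern):
        # Copy each matching file into copy_dir, then archive the original.
        for path in glob.glob(os.path.join(src_dir, pattern)):
            name = os.path.basename(path)
            shutil.copy(path, os.path.join(copy_dir, name))
            shutil.move(path, os.path.join(archive_dir, name))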
Code example #7
import configparser
import os

import click

from util.TimeUtil import TimeUtil

"""
路径设置
"""
# 工程根目录,注意此处以初次调用这个变量的元素为准,工程起始目录定位在main,若有修改请注意这个位置
BASE_PATH = os.path.split(os.path.split(__file__)[0])[0]
# 输出目录
OUTPUT_PATH = os.path.join(BASE_PATH, 'output')
# 输出分组,默认按年月日_时分秒分组
OUTPUT_GROUP_PATH = os.path.join(OUTPUT_PATH, TimeUtil.getFormatTime('%Y%m%d_%H%M%S'))
# 采集存放数据库地址
DATABASE_NAME = os.path.join(OUTPUT_GROUP_PATH, 'Patent.db')
# 生成excel地址
EXCEL_NAME = os.path.join(OUTPUT_GROUP_PATH, '专利.xlsx')
# 生成图表目录
CHARTS_NAME = os.path.join(OUTPUT_GROUP_PATH, 'charts.html')
# log文件名
LOG_FILENAME = os.path.join(OUTPUT_GROUP_PATH, "PatentCrawler.log")
# 验证码模型地址
CAPTCHA_MODEL_NAME = os.path.join(BASE_PATH, 'res', 'captcha', 'sipoknn.job')

"""
基础设置
"""
# 是否使用代理
Code example #8
"""
Created on 2017/3/19

@author: will4906

The paths and file names below may be changed to suit the user; all project
paths will use them.
"""
import os

from util.TimeUtil import TimeUtil

# Project root directory. Note: this resolves relative to the module that
# first imports the variable; the project entry point is main, so update
# this if that changes.
BASE_PATH = os.getcwd() + os.sep
# Output directory
OUTPUT_PATH = BASE_PATH + 'output'
# Output grouping; grouped by date_time (YYYYMMDD_HHMMSS) by default
OUTPUT_GROUP_PATH = OUTPUT_PATH + os.sep + TimeUtil.getFormatTime(
    '%Y%m%d_%H%M%S')
# Path of the database holding the crawled data
DATABASE_NAME = OUTPUT_PATH + os.sep + 'Patent.db'
# Path of the generated Excel file
EXCEL_NAME = OUTPUT_GROUP_PATH + os.sep + '专利.xlsx'
# Path of the generated diagram
DIAGRAM_NAME = OUTPUT_PATH + os.sep + 'diagram.html'
# Log output directory
LOG_PATH = BASE_PATH + 'log'
# Log file name
LOG_FILENAME = LOG_PATH + os.sep + "PatentCrawler{0}.log".format(
    TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
# Template file directory; modifying it is not recommended
TEMPLATE_PATH = BASE_PATH + 'res' + os.sep + 'template'
# Template file path; entries may be added or changed, modifying is not recommended
TEMPLATE_NAME = TEMPLATE_PATH + os.sep + 'template.html'
Code example #9
File: GetData.py Project: edipdemirbilek/Corvid
    def run(num_days,
            accesId,
            accessKey,
            env,
            get_data_params,
            remove_timestamp_files=False):

        sumologic_timestamp_dir = get_data_params["sumologic_timestamp_dir"]
        sumologic_out_dir = get_data_params["sumologic_out_dir"]

        # now in milliseconds
        now_timestamp = TimeUtil.get_current_milli_time()

        requests_timestamp_filename = 'Requests.timestamp'
        apply_timestamp_filename = 'Apply.timestamp'

        requests_filename = 'Requests_' + str(now_timestamp) + ".csv"
        apply_filename = 'Apply_' + str(now_timestamp) + ".csv"

        # temporary: remove files
        if remove_timestamp_files:
            FileUtil.delete_if_exist(sumologic_timestamp_dir +
                                     requests_timestamp_filename)
            FileUtil.delete_if_exist(sumologic_timestamp_dir +
                                     apply_timestamp_filename)

        # fromTime for open shift requests
        past_requests_timestamp = FileUtil.read_timestamp_or_deafult(
            sumologic_timestamp_dir + requests_timestamp_filename,
            TimeUtil.get_past_milli_time(num_days))

        # fromTime for open shift apply
        past_apply_timestamp = FileUtil.read_timestamp_or_deafult(
            sumologic_timestamp_dir + apply_timestamp_filename,
            TimeUtil.get_past_milli_time(num_days))

        # get open shift requests and write to file
        print("\nDownloading Open Shift Requests from SumoLogic...")
        open_shift_requests = OpenShiftRequestsAPI(accesId, accessKey)
        open_shift_requests.get_sumologic_content(past_requests_timestamp,
                                                  now_timestamp, 10000)

        open_shift_requests.write_response_to_file(sumologic_out_dir +
                                                   requests_filename)
        print("Complete. Results written to " + sumologic_out_dir +
              requests_filename)

        # get open shift apply and write to file
        print("\nDownloading Apply to Open Shifts from SumoLogic...")
        open_shift_apply = OpenShiftApplyAPI(accesId, accessKey)
        open_shift_apply.get_sumologic_content(past_apply_timestamp,
                                               now_timestamp, 10000)

        open_shift_apply.write_response_to_file(sumologic_out_dir +
                                                apply_filename)
        print("Complete. Results written to {}: \n".format(sumologic_out_dir +
                                                           apply_filename))

        print("\nUpdating time stamp files.")
        # write timestamps
        FileUtil.write_timestamp(
            sumologic_timestamp_dir + requests_timestamp_filename,
            now_timestamp)
        FileUtil.write_timestamp(
            sumologic_timestamp_dir + apply_timestamp_filename, now_timestamp)
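
The timestamp bookkeeping above reads the previous run's fromTime from a one-line file, falling back to a default when that file does not exist. Here is a minimal sketch of the helpers as called, keeping the method names exactly as used above (including the read_timestamp_or_deafult spelling); this is an assumption, not the Corvid source:

import os
import time


class TimeUtil:
    @staticmethod
    def get_current_milli_time():
        # Current Unix time in milliseconds.
        return int(time.time() * 1000)

    @staticmethod
    def get_past_milli_time(num_days):
        # Unix time in milliseconds, num_days ago.
        return int((time.time() - num_days * 86400) * 1000)


class FileUtil:
    @staticmethod
    def delete_if_exist(path):
        # Remove the file if present; do nothing otherwise.
        if os.path.exists(path):
            os.remove(path)

    @staticmethod
    def read_timestamp_or_deafult(path, default):
        # Return the timestamp stored at path, or default if missing.
        if not os.path.exists(path):
            return default
        with open(path) as f:
            return int(f.read().strip())

    @staticmethod
    def write_timestamp(path, timestamp):
        # Persist the timestamp for the next run.
        with open(path, "w") as f:
            f.write(str(timestamp))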
Code example #10
    def run(debug, get_data_params, correlate_data_params):

        sumologic_out_dir = get_data_params["sumologic_out_dir"]
        sumologic_out_archive_dir = get_data_params["sumologic_out_archive_dir"]
        correlate_in_current_cycle_dir = correlate_data_params["correlate_in_current_cycle_dir"]
        correlate_in_previous_cycle_dir = correlate_data_params["correlate_in_previous_cycle_dir"]
        correlate_in_archive_dir = correlate_data_params["correlate_in_archive_dir"]
        correlate_out_dir = correlate_data_params["correlate_out_dir"]

        # copy sumologic out to correlate in current cycle
        # move sumologic out to sumologic out archive
        FileUtil.copy_and_move_files(sumologic_out_dir,
                                     correlate_in_current_cycle_dir,
                                     sumologic_out_archive_dir, "*.csv")

        # now in milliseconds
        now_timestamp = TimeUtil.get_current_milli_time()
        correlate_filename = 'Correlate_'+str(now_timestamp)+".csv"

        print("\nLoading Open Shift Requests from Filesystem...")
        # correlate apply with requests in the current and previous cycles
        df_requests = FileUtil.get_df_from_csv_dirs(correlate_in_current_cycle_dir,
                                                    correlate_in_previous_cycle_dir,
                                                    "Requests*")
        print("Complete. Count: " + str(df_requests.shape[0]))
        if debug:
            for index, row in df_requests.iterrows():
                print(row)

        print("\nLoading Apply to Open Shifts from Filesystem...")
        df_apply = FileUtil.get_df_from_csv_dir(correlate_in_current_cycle_dir,
                                                "Apply*")
        print("Complete. Count: " + str(df_apply.shape[0]))

        print("\nCorrelating Apply Open Shifts with Open Shifts Requests... ")

        fields = ['loggedinuser', 'companyid',
                  'query_datetime', 'apply_datetime', 'numberofopenshifts',
                  'locationid', 'eventid', 'applied']

        CorrelateData.add_header(correlate_out_dir+correlate_filename, fields)

        for index, row in df_apply.iterrows():

            apply_datetime = row['datetime']
            loggedinuser = row['loggedinuser']
            companyid = row['companyid']
            locationid = row['locationid']
            eventid = row['eventid']

            df_filtered = df_requests.loc[
                    (df_requests['loggedinuser'] == loggedinuser) &
                    (df_requests['companyid'] == companyid) &
                    (df_requests['datetime'] < apply_datetime) &
                    (df_requests['eventandlocationids'].str.contains(str(eventid)+","+str(locationid)))
                    ].drop_duplicates().sort_values(by=['datetime'], ascending=False).head(1)

            if df_filtered.shape[0] > 0:

                # let's first replace ', ' with '|', then split
                # Example text: (3714cb1e-4839-4d8c-818e-9d01c655cd86,328038), (d87a2bb7-05e0-465e-8b6c-aa18d89a9c9f,328038), (e7bee5c5-8f4e-457f-95e7-b1ec82e8ab21,328038), (f04d14c1-68c3-4dda-8698-3d95eb3a4b9d,328038)
                events_and_locations = df_filtered.iloc[0]['eventandlocationids'].replace(', ','|').split('|')

                for event_location in events_and_locations:

                    # let's strip the parentheses and split the text by ','
                    # Example text: (3714cb1e-4839-4d8c-818e-9d01c655cd86,328038)
                    eventid_in_request, locationid_in_request = event_location.replace('(','').replace(')','').split(',')

                    applied = False
                    if str(eventid) == str(eventid_in_request) and str(locationid) == str(locationid_in_request):
                        applied = True

                    row = {'loggedinuser': loggedinuser,
                           'companyid': companyid,
                           'query_datetime': df_filtered.iloc[0]['datetime'],
                           'apply_datetime': apply_datetime,
                           'numberofopenshifts': df_filtered.iloc[0]['numberofopenshifts'],
                           'locationid': locationid_in_request,
                           'eventid': eventid_in_request,
                           'applied': applied}

                    CorrelateData.add_row(correlate_out_dir+correlate_filename, fields, row)

        print("Complete. Results written to: {} \n".format(correlate_out_dir+correlate_filename))

        # move correlate in previous cycle to correlate in archive
        FileUtil.move_files(correlate_in_previous_cycle_dir,
                            correlate_in_archive_dir, "*.csv")

        # move correlate in current cycle (Apply) to
        # correlate in archive cycle
        FileUtil.move_files(correlate_in_current_cycle_dir,
                            correlate_in_archive_dir, "Apply*")

        # move correlate in current cycle (Requests) to
        # correlate in previous cycle
        FileUtil.move_files(correlate_in_current_cycle_dir,
                            correlate_in_previous_cycle_dir, "Requests*")
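
The output in this example goes through CorrelateData.add_header and CorrelateData.add_row. A short sketch built on csv.DictWriter, assuming add_header writes the column names once and add_row appends a single record keyed by those fields (the real methods are not shown on this page):

import csv


class CorrelateData:
    @staticmethod
    def add_header(path, fields):
        # Create/truncate the CSV and write the header row.
        with open(path, "w", newline="") as f:
            csv.DictWriter(f, fieldnames=fields).writeheader()

    @staticmethod
    def add_row(path, fields, row):
        # Append one record whose keys match fields.
        with open(path, "a", newline="") as f:
            csv.DictWriter(f, fieldnames=fields).writerow(row)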
Code example #11
import configparser
import os

import click

from util.TimeUtil import TimeUtil
"""
路径设置
"""
# 工程根目录,注意此处以初次调用这个变量的元素为准,工程起始目录定位在main,若有修改请注意这个位置
BASE_PATH = os.path.split(os.path.split(__file__)[0])[0]
# 输出目录
OUTPUT_PATH = os.path.join(BASE_PATH, 'output')
# 输出分组,默认按年月日_时分秒分组
OUTPUT_GROUP_PATH = os.path.join(OUTPUT_PATH,
                                 TimeUtil.getFormatTime('%Y%m%d_%H%M%S'))
# 采集存放数据库地址
DATABASE_NAME = os.path.join(OUTPUT_GROUP_PATH, 'Patent.db')
# 生成excel地址
EXCEL_NAME = os.path.join(OUTPUT_GROUP_PATH, '专利.xlsx')
# 生成图表目录
CHARTS_NAME = os.path.join(OUTPUT_GROUP_PATH, 'charts.html')
# log文件名
LOG_FILENAME = os.path.join(OUTPUT_GROUP_PATH, "PatentCrawler.log")
# 验证码模型地址
CAPTCHA_MODEL_NAME = os.path.join(BASE_PATH, 'res', 'captcha', 'sipo3.job')
# 赞赏html路径
AD_PATH = os.path.join(BASE_PATH, 'res', 'advertisement', 'ad.html')
"""
基础设置
"""
Code example #12
def init_excel_config():
    # Excel column headers: patent type, patent name, legal status, date the
    # legal status last changed, publication/grant date, application number,
    # application date, applicant/patentee, inventor
    title_list = [
        "专利类型", "专利名称", "法律状态", "法律状态最后修改日期", "申请公布日/授权公告日", "申请号", "申请日",
        "申请人/专利权人", "发明人"
    ]
    editor = ExcelUtil(Config.FILE_NAME).edit()
    sh = editor.getSheet(0)
    for index, each in enumerate(title_list):
        sh.write(0, index, each)
    editor.commit()
    return


if __name__ == '__main__':
    initProgress()
    # This line matters: it raises Python's recursion limit; otherwise the
    # crawler crashes after roughly 900 recursive calls.
    sys.setrecursionlimit(1000000)  # e.g. set to one million here
    # Prompt: "Please enter the publication start date, e.g. {0}:"
    startDate = input("请输入公布日开始日期,如{0}:".format(
        TimeUtil.getFormatTime("%Y-%m-%d")))
    # Log: "Program started; publication start date entered: {0}"
    Config.writeLog("程序启动,输入的公布开始日期为{0}".format(startDate))
    init_excel_config()

    progress = ProgressController(Config.BROSWER_NAME)
    # Log: "Launching the {0} browser"
    Config.writeLog("启动{0}浏览器".format(Config.BROSWER_NAME))
    queryInfo = progress.getQueryInfo()
    queryInfo.setStartDate(startDate)

    progress.startProgress()

    # print(excel)
Code example #13
"""
Created on 2017/3/19

@author: will4906

一下地址、文件名可根据用户使用自行修改,工程所有地址将会采用。
"""
import os

from util.TimeUtil import TimeUtil

# 工程根目录,注意此处以初次调用这个变量的元素为准,工程起始目录定位在main,若有修改请注意这个位置
BASE_PATH = os.getcwd() + os.sep
# 输出目录
OUTPUT_PATH = BASE_PATH + 'output'
# 输出分组,默认按年月日_时分秒分组
OUTPUT_GROUP_PATH = OUTPUT_PATH + os.sep + TimeUtil.getFormatTime('%Y%m%d_%H%M%S')
# 采集存放数据库地址
DATABASE_NAME = OUTPUT_GROUP_PATH + os.sep + 'Patent.db'
# 生成excel地址
EXCEL_NAME = OUTPUT_GROUP_PATH + os.sep + '专利.xlsx'
# 生成图表地址
DIAGRAM_NAME = OUTPUT_GROUP_PATH + os.sep + 'diagram.html'
# log输出目录
LOG_PATH = BASE_PATH + 'log'
# log文件名
LOG_FILENAME = LOG_PATH + os.sep + "PatentCrawler{0}.log".format(TimeUtil.getFormatTime("%Y%m%d_%H%M%S"))
# 模板文件目录,不建议修改
TEMPLATE_PATH = BASE_PATH + 'res' + os.sep + 'template'
# 模板文件地址,有可能增加和改变,不建议修改
TEMPLATE_NAME = TEMPLATE_PATH + os.sep + 'template.html'