def main(argv=None):
    """Queue one ``query`` task for every requested result page of a year.

    The positional argument is the year.  ``init_params`` is called with the
    year, the selected entry of ``KINDS`` and the input directory; it returns
    the request parameters and the total page count, which is used as the
    last page when ``--end`` is negative.

    Args:
        argv: Argument list passed to ``OptionParser.parse_args``.  ``None``
            lets optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every page task has been queued and the ``threaded`` context
        has exited.

    Invalid command lines are reported through ``parser.error``, which prints
    the usage message and exits the process.
    """
    usage = "usage: %prog [options] year"
    parser = OptionParser(usage)
    parser.add_option("-k", "--kind", dest="kind", type="int", default="1",
                      help="patent type(1-4)")
    parser.add_option("-i", "--input-dir", dest="input_dir", default="input",
                      help="input directory(save downloaded pages)")
    parser.add_option("-o", "--output-dir", dest="output_dir",
                      default="output", help="output directory")
    # Numeric options are typed so optparse rejects non-integers with a usage
    # error instead of a ValueError traceback later on.
    parser.add_option("-t", "--threads", dest="threads", type="int",
                      default="20", help="number of threads")
    parser.add_option("-T", "--timeout", dest="timeout", type="int",
                      default="5", help="connection timeout")
    parser.add_option("-s", "--start", dest="start", type="int", default="1",
                      help="start page")
    parser.add_option("-e", "--end", dest="end", type="int", default="-1",
                      help="end page")
    parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run",
                      help="show what would have been done")
    (options, args) = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Explicit range check: plain indexing would let 0 and negative values
    # wrap around to the end of KINDS instead of being rejected.
    if not 1 <= options.kind <= len(KINDS):
        parser.error("kind should be an integer between 1 and {}".format(
            len(KINDS)))
    kind = KINDS[options.kind - 1]

    year = args[0]
    input_dir = options.input_dir
    output_dir = options.output_dir
    # Only touch the filesystem once the command line is known to be valid.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    dry_run = options.dry_run
    timeout = options.timeout
    params, pages = init_params(year, kind, input_dir)
    start = options.start
    end = options.end
    if end < 0:
        # A negative end means "through the last page reported by init_params".
        end = pages

    # A dry run uses a single worker regardless of --threads.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        for page in range(start, end + 1):
            job_queue.add_task(query, params, year, page, dirname=output_dir,
                               timeout=timeout, dry_run=dry_run)
    return 0
def main(argv=None):
    """Queue ``query`` tasks for patent details, by year file or by patent id.

    The positional arguments are treated as years when the first one is four
    characters long, otherwise as patent ids:

    * year mode: for each year, the file ``<input_dir>/<year>`` is read line
      by line and each selected line (stripped) is queued as a patent id,
      with ``<output_dir>/<year>`` as the target directory;
    * id mode: each argument is queued directly with ``output_dir`` as the
      target directory.

    In year mode a line with 1-based number ``n`` is selected when
    ``n >= start`` and, if ``end`` is non-negative, ``n < end``.

    Args:
        argv: Argument list passed to ``OptionParser.parse_args``.  ``None``
            lets optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every task has been queued and the ``threaded`` context has
        exited.

    Invalid command lines are reported through ``parser.error``, which prints
    the usage message and exits the process.
    """
    usage = "usage: %prog [options] yearOrId1 [yearOrId2 ...]"
    parser = OptionParser(usage)
    parser.add_option("-k", "--kind", dest="kind", type="int", default="1",
                      help="patent type(1-4)")
    parser.add_option("-K", "--detail-kind", dest="detail_kind", type="int",
                      default="1",
                      help="1: {} 2: {}".format(*DETAIL_KINDS))
    parser.add_option("-i", "--input-dir", dest="input_dir", default="input",
                      help="input directory(contains ID files)")
    parser.add_option("-o", "--output-dir", dest="output_dir",
                      default="output", help="output directory")
    parser.add_option("-t", "--threads", dest="threads", type="int",
                      default="20", help="number of threads")
    parser.add_option("-T", "--timeout", dest="timeout", type="int",
                      default="5", help="connection timeout")
    parser.add_option("-s", "--start", dest="start", type="int", default="0",
                      help="start index")
    parser.add_option("-e", "--end", dest="end", type="int", default="-1",
                      help="end index")
    parser.add_option("-c", "--check-level", dest="check_level", type="int",
                      default="1",
                      help="0: no check, 1: check file existence, "
                           "2: check file size")
    parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run",
                      help="show what would have been done")
    # Honour the argv parameter; calling parse_args() bare would always read
    # sys.argv and make main(argv) impossible to drive programmatically.
    (options, args) = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Explicit range checks: plain indexing would let 0 and negative values
    # wrap around to the end of the sequence, and too-large values would
    # escape as an IndexError traceback.
    if not 1 <= options.kind <= len(KINDS):
        parser.error("kind should be an integer between 1 and {}".format(
            len(KINDS)))
    kind = options.kind - 1
    kind_str = KINDS[kind]

    if not 1 <= options.detail_kind <= len(DETAIL_KINDS):
        parser.error(
            "detail_kind should be an integer between 1 and {}".format(
                len(DETAIL_KINDS)))
    detail_kind = DETAIL_KINDS[options.detail_kind - 1]

    # The request builder and the parser for a detail kind are module-level
    # functions looked up by naming convention.
    try:
        get_params = globals()[detail_kind + "_params"]
        parse = globals()[detail_kind + "_parse"]
    except KeyError as err:
        parser.error("missing handler {} for detail kind {!r}".format(
            err, detail_kind))

    input_dir = options.input_dir
    output_dir = options.output_dir
    timeout = options.timeout
    start = options.start
    end = options.end
    check_level = options.check_level
    dry_run = options.dry_run

    # A dry run uses a single worker regardless of --threads.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        if len(args[0]) == 4:
            # Four-character first argument: every argument is taken as a year.
            for year in args:
                dirname = os.path.join(output_dir, year)
                # Single-argument parenthesised form prints identically as a
                # Python 2 statement and as a Python 3 function call.
                print("start on patents' {}(kind: {}) in year {}".format(
                    detail_kind, kind_str, year))
                with open(os.path.join(input_dir, year)) as id_file:
                    for line_no, line in enumerate(id_file, 1):
                        if line_no < start:
                            continue
                        if end >= 0 and line_no >= end:
                            continue
                        job_queue.add_task(query, get_params, parse, kind,
                                           line.strip(), dirname,
                                           timeout=timeout,
                                           check_level=check_level,
                                           dry_run=dry_run)
        else:
            # Otherwise every argument is taken as a patent id.
            for patent_id in args:
                print("start on patent {}'s {}(kind: {})".format(
                    patent_id, detail_kind, kind_str))
                job_queue.add_task(query, get_params, parse, kind, patent_id,
                                   output_dir, timeout=timeout,
                                   check_level=check_level, dry_run=dry_run)
    return 0
def main(argv=None):
    """Download the listing pages of one year by queuing a task per page.

    The single positional argument is the year.  The request parameters and
    the number of available pages come from
    ``init_params(year, kind, input_dir)``; a negative ``--end`` selects that
    page count as the final page.

    Args:
        argv: Argument list passed to ``OptionParser.parse_args``.  ``None``
            lets optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every page task has been queued and the ``threaded`` context
        has exited.

    Invalid command lines are reported through ``parser.error``, which prints
    the usage message and exits the process.
    """
    usage = "usage: %prog [options] year"
    parser = OptionParser(usage)
    parser.add_option("-k", "--kind", dest="kind", type="int", default="1",
                      help="patent type(1-4)")
    parser.add_option("-i", "--input-dir", dest="input_dir", default="input",
                      help="input directory(save downloaded pages)")
    parser.add_option("-o", "--output-dir", dest="output_dir",
                      default="output", help="output directory")
    # Typed options let optparse reject non-integer values with a usage
    # error rather than failing later inside int().
    parser.add_option("-t", "--threads", dest="threads", type="int",
                      default="20", help="number of threads")
    parser.add_option("-T", "--timeout", dest="timeout", type="int",
                      default="5", help="connection timeout")
    parser.add_option("-s", "--start", dest="start", type="int", default="1",
                      help="start page")
    parser.add_option("-e", "--end", dest="end", type="int", default="-1",
                      help="end page")
    parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run",
                      help="show what would have been done")
    (options, args) = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Reject out-of-range values up front; indexing alone would accept 0 and
    # negative numbers by wrapping around to the end of KINDS.
    kind_count = len(KINDS)
    if options.kind < 1 or options.kind > kind_count:
        parser.error("kind should be an integer between 1 and {}".format(
            kind_count))
    kind = KINDS[options.kind - 1]

    year = args[0]
    input_dir = options.input_dir
    output_dir = options.output_dir
    # Create the output directory only after the arguments are validated.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    dry_run = options.dry_run
    timeout = options.timeout
    params, pages = init_params(year, kind, input_dir)
    first_page = options.start
    # A negative end means "through the last page reported by init_params".
    last_page = pages if options.end < 0 else options.end

    # A dry run uses a single worker regardless of --threads.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        for page in range(first_page, last_page + 1):
            job_queue.add_task(query, params, year, page, dirname=output_dir,
                               timeout=timeout, dry_run=dry_run)
    return 0
def main(argv=None):
    """Fetch patent details for whole year files or for individual ids.

    When the first positional argument is four characters long, all
    arguments are treated as years: ``<input_dir>/<year>`` is read and each
    selected line, stripped of surrounding whitespace, is queued as a patent
    id with ``<output_dir>/<year>`` as the target directory.  Otherwise each
    argument is queued as a patent id with ``output_dir`` as the target
    directory.

    In year mode a line with 1-based number ``n`` is selected when
    ``n >= start`` and, if ``end`` is non-negative, ``n < end``.

    Args:
        argv: Argument list passed to ``OptionParser.parse_args``.  ``None``
            lets optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every task has been queued and the ``threaded`` context has
        exited.

    Invalid command lines are reported through ``parser.error``, which prints
    the usage message and exits the process.
    """
    usage = "usage: %prog [options] yearOrId1 [yearOrId2 ...]"
    parser = OptionParser(usage)
    parser.add_option("-k", "--kind", dest="kind", type="int", default="1",
                      help="patent type(1-4)")
    parser.add_option("-K", "--detail-kind", dest="detail_kind", type="int",
                      default="1",
                      help="1: {} 2: {}".format(*DETAIL_KINDS))
    parser.add_option("-i", "--input-dir", dest="input_dir", default="input",
                      help="input directory(contains ID files)")
    parser.add_option("-o", "--output-dir", dest="output_dir",
                      default="output", help="output directory")
    parser.add_option("-t", "--threads", dest="threads", type="int",
                      default="20", help="number of threads")
    parser.add_option("-T", "--timeout", dest="timeout", type="int",
                      default="5", help="connection timeout")
    parser.add_option("-s", "--start", dest="start", type="int", default="0",
                      help="start index")
    parser.add_option("-e", "--end", dest="end", type="int", default="-1",
                      help="end index")
    parser.add_option("-c", "--check-level", dest="check_level", type="int",
                      default="1",
                      help="0: no check, 1: check file existence, "
                           "2: check file size")
    parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run",
                      help="show what would have been done")
    # Pass argv through: a bare parse_args() always reads sys.argv, which
    # silently discards whatever the caller handed to main().
    (options, args) = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Validate both selectors explicitly.  Indexing alone accepts 0 and
    # negative numbers (they wrap to the end of the sequence) and lets
    # too-large --kind values surface as an IndexError traceback.
    kind_count = len(KINDS)
    if options.kind < 1 or options.kind > kind_count:
        parser.error("kind should be an integer between 1 and {}".format(
            kind_count))
    detail_count = len(DETAIL_KINDS)
    if options.detail_kind < 1 or options.detail_kind > detail_count:
        parser.error(
            "detail_kind should be an integer between 1 and {}".format(
                detail_count))

    kind = options.kind - 1
    kind_str = KINDS[kind]
    detail_kind = DETAIL_KINDS[options.detail_kind - 1]

    # Handlers are module-level functions named "<detail_kind>_params" and
    # "<detail_kind>_parse"; report a missing one by name.
    module_names = globals()
    try:
        get_params = module_names[detail_kind + "_params"]
        parse = module_names[detail_kind + "_parse"]
    except KeyError as err:
        parser.error("missing handler {} for detail kind {!r}".format(
            err, detail_kind))

    input_dir = options.input_dir
    output_dir = options.output_dir
    start = options.start
    end = options.end
    dry_run = options.dry_run
    # Keyword arguments shared by every queued task.
    task_options = dict(timeout=options.timeout,
                        check_level=options.check_level,
                        dry_run=dry_run)

    # A dry run uses a single worker regardless of --threads.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        if len(args[0]) == 4:
            # Four-character first argument: every argument is taken as a year.
            for year in args:
                dirname = os.path.join(output_dir, year)
                # Single-argument parenthesised form prints identically as a
                # Python 2 statement and as a Python 3 function call.
                print("start on patents' {}(kind: {}) in year {}".format(
                    detail_kind, kind_str, year))
                with open(os.path.join(input_dir, year)) as id_file:
                    for line_no, line in enumerate(id_file, 1):
                        selected = line_no >= start and (
                            end < 0 or line_no < end)
                        if selected:
                            job_queue.add_task(query, get_params, parse, kind,
                                               line.strip(), dirname,
                                               **task_options)
        else:
            # Otherwise every argument is taken as a patent id.
            for patent_id in args:
                print("start on patent {}'s {}(kind: {})".format(
                    patent_id, detail_kind, kind_str))
                job_queue.add_task(query, get_params, parse, kind, patent_id,
                                   output_dir, **task_options)
    return 0