예제 #1
0
def main(argv=None):
    """Queue one ``query`` task per result page of a given year.

    Parses the command line, resolves the patent kind, obtains the query
    parameters and the page count from ``init_params`` and then adds a
    ``query`` task to a ``JobQueue`` for every page in the requested range.

    Args:
        argv: Argument list handed to ``OptionParser.parse_args``.  ``None``
            makes optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every task has been added and the ``threaded`` block exits.

    Raises:
        SystemExit: Raised by ``parser.error`` when the positional ``year``
            is missing, a numeric option is not an integer, or ``--kind`` is
            outside ``1..len(KINDS)``.
    """
    usage = "usage: %prog [options] year"
    parser = OptionParser(usage)

    # Numeric options are declared with type="int" so optparse rejects bad
    # input with a usage error instead of a ValueError traceback later on.
    # String defaults are converted by optparse using the declared type.
    parser.add_option("-k", "--kind", dest="kind", type="int", default="1",
                      help="patent type(1-4)")
    parser.add_option("-i", "--input-dir", dest="input_dir", default="input",
                      help="input directory(save downloaded pages)")
    parser.add_option("-o", "--output-dir",
                      dest="output_dir", default="output",
                      help="output directory")
    parser.add_option("-t", "--threads", dest="threads", type="int",
                      default="20",
                      help="number of threads")
    parser.add_option("-T", "--timeout", dest="timeout", type="int",
                      default="5",
                      help="connection timeout")
    parser.add_option("-s", "--start", dest="start", type="int", default="1",
                      help="start page")
    parser.add_option("-e", "--end", dest="end", type="int", default="-1",
                      help="end page")
    parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run",
                      help="show what would have been done")
    options, args = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Check the range explicitly: plain indexing would accept 0 and negative
    # values through Python's negative indexing and pick the wrong kind.
    kind_index = options.kind - 1
    if not 0 <= kind_index < len(KINDS):
        parser.error("kind should be an integer between 1 and {}".format(
            len(KINDS)))
    kind = KINDS[kind_index]

    year = args[0]
    input_dir = options.input_dir
    output_dir = options.output_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    dry_run = options.dry_run
    timeout = options.timeout
    params, pages = init_params(year, kind, input_dir)
    start = options.start
    end = options.end
    if end < 0:
        # A negative end means "up to the last page reported by init_params".
        end = pages
    # A dry run uses a single worker.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        for page in range(start, end + 1):
            job_queue.add_task(query, params, year, page, dirname=output_dir,
                               timeout=timeout, dry_run=dry_run)
    return 0
예제 #2
0
def main(argv=None):
    """Queue one ``query`` task per patent ID, read from year files or args.

    If the first positional argument is four characters long, every
    positional argument is treated as a year: the file ``<input_dir>/<year>``
    is read line by line and each stripped line is passed to ``query`` as a
    patent ID, with ``<output_dir>/<year>`` as the target directory.
    Otherwise every positional argument is itself passed as a patent ID and
    ``output_dir`` is the target directory.

    Args:
        argv: Argument list handed to ``OptionParser.parse_args``.  ``None``
            makes optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every task has been added and the ``threaded`` block exits.

    Raises:
        SystemExit: Raised by ``parser.error`` when no positional argument
            is given, or ``--kind`` / ``--detail-kind`` is out of range.
    """
    usage = "usage: %prog [options] yearOrId1 [yearOrId2 ...]"
    parser = OptionParser(usage)
    parser.add_option("-k", "--kind", dest="kind", type="int", default="1",
                      help="patent type(1-4)")
    parser.add_option("-K", "--detail-kind", dest="detail_kind", type="int",
                      default="1",
                      help="1: {} 2: {}".format(*DETAIL_KINDS))
    parser.add_option("-i", "--input-dir", dest="input_dir", default="input",
                      help="input directory(contains ID files)")
    parser.add_option("-o", "--output-dir",
                      dest="output_dir", default="output",
                      help="output directory")
    parser.add_option("-t", "--threads", dest="threads", type="int",
                      default="20",
                      help="number of threads")
    parser.add_option("-T", "--timeout", dest="timeout", type="int",
                      default="5",
                      help="connection timeout")
    parser.add_option("-s", "--start", dest="start", type="int", default="0",
                      help="start index")
    parser.add_option("-e", "--end", dest="end", type="int", default="-1",
                      help="end index")
    parser.add_option("-c", "--check-level", dest="check_level", type="int",
                      default="1",
                      help="0: no check, 1: check file existence, "
                      "2: check file size")
    parser.add_option("-n", "--dry-run", action="store_true", dest="dry_run",
                      help="show what would have been done")
    # Pass argv through so callers can supply their own argument list.
    options, args = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Check both selectors explicitly: plain indexing would accept 0 and
    # negative values through Python's negative indexing.
    kind = options.kind - 1  # zero-based index, passed on to query
    if not 0 <= kind < len(KINDS):
        parser.error("kind should be an integer between 1 and {}".format(
            len(KINDS)))
    kind_str = KINDS[kind]

    detail_error = "detail_kind should be an integer between 1 and {}".format(
        len(DETAIL_KINDS))
    detail_index = options.detail_kind - 1
    if not 0 <= detail_index < len(DETAIL_KINDS):
        parser.error(detail_error)
    detail_kind = DETAIL_KINDS[detail_index]
    try:
        # Handlers are looked up by name among the module globals:
        # "<detail_kind>_params" and "<detail_kind>_parse".
        get_params = globals()[detail_kind + "_params"]
        parse = globals()[detail_kind + "_parse"]
    except KeyError:
        parser.error(detail_error)

    input_dir = options.input_dir
    output_dir = options.output_dir
    timeout = options.timeout
    start = options.start
    end = options.end
    check_level = options.check_level
    dry_run = options.dry_run

    # A dry run uses a single worker.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        if len(args[0]) == 4:  # assumed years
            for year in args:
                dirname = os.path.join(output_dir, year)
                print("start on patents' {}(kind: {}) in year {}".format(
                    detail_kind, kind_str, year))
                with open(os.path.join(input_dir, year)) as f:
                    # The counter is 2 for the first line, so the window
                    # below selects 1-based line numbers n with
                    # start <= n < end (a negative end means no upper limit).
                    for i, line in enumerate(f, 2):
                        if i > start and (end < 0 or i <= end):
                            job_queue.add_task(query,
                                               get_params, parse, kind,
                                               line.strip(), dirname,
                                               timeout=timeout,
                                               check_level=check_level,
                                               dry_run=dry_run)
        else:  # assumed ids
            for patent_id in args:
                print("start on patent {}'s {}(kind: {})".format(
                    patent_id, detail_kind, kind_str))
                dirname = output_dir
                job_queue.add_task(query, get_params, parse, kind,
                                   patent_id, dirname, timeout=timeout,
                                   check_level=check_level, dry_run=dry_run)
    return 0
예제 #3
0
def main(argv=None):
    """Queue one ``query`` task per result page of a given year.

    Parses the command line, resolves the patent kind, obtains the query
    parameters and the page count from ``init_params`` and then adds a
    ``query`` task to a ``JobQueue`` for every page in the requested range.

    Args:
        argv: Argument list handed to ``OptionParser.parse_args``.  ``None``
            makes optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every task has been added and the ``threaded`` block exits.

    Raises:
        SystemExit: Raised by ``parser.error`` when the positional ``year``
            is missing, a numeric option is not an integer, or ``--kind`` is
            outside ``1..len(KINDS)``.
    """
    usage = "usage: %prog [options] year"
    parser = OptionParser(usage)

    # Numeric options are declared with type="int" so optparse rejects bad
    # input with a usage error instead of a ValueError traceback later on.
    # String defaults are converted by optparse using the declared type.
    parser.add_option("-k",
                      "--kind",
                      dest="kind",
                      type="int",
                      default="1",
                      help="patent type(1-4)")
    parser.add_option("-i",
                      "--input-dir",
                      dest="input_dir",
                      default="input",
                      help="input directory(save downloaded pages)")
    parser.add_option("-o",
                      "--output-dir",
                      dest="output_dir",
                      default="output",
                      help="output directory")
    parser.add_option("-t",
                      "--threads",
                      dest="threads",
                      type="int",
                      default="20",
                      help="number of threads")
    parser.add_option("-T",
                      "--timeout",
                      dest="timeout",
                      type="int",
                      default="5",
                      help="connection timeout")
    parser.add_option("-s",
                      "--start",
                      dest="start",
                      type="int",
                      default="1",
                      help="start page")
    parser.add_option("-e",
                      "--end",
                      dest="end",
                      type="int",
                      default="-1",
                      help="end page")
    parser.add_option("-n",
                      "--dry-run",
                      action="store_true",
                      dest="dry_run",
                      help="show what would have been done")
    options, args = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Check the range explicitly: plain indexing would accept 0 and negative
    # values through Python's negative indexing and pick the wrong kind.
    kind_index = options.kind - 1
    if not 0 <= kind_index < len(KINDS):
        parser.error("kind should be an integer between 1 and {}".format(
            len(KINDS)))
    kind = KINDS[kind_index]

    year = args[0]
    input_dir = options.input_dir
    output_dir = options.output_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    dry_run = options.dry_run
    timeout = options.timeout
    params, pages = init_params(year, kind, input_dir)
    start = options.start
    end = options.end
    if end < 0:
        # A negative end means "up to the last page reported by init_params".
        end = pages
    # A dry run uses a single worker.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        for page in range(start, end + 1):
            job_queue.add_task(query,
                               params,
                               year,
                               page,
                               dirname=output_dir,
                               timeout=timeout,
                               dry_run=dry_run)
    return 0
예제 #4
0
def main(argv=None):
    """Queue one ``query`` task per patent ID, read from year files or args.

    If the first positional argument is four characters long, every
    positional argument is treated as a year: the file ``<input_dir>/<year>``
    is read line by line and each stripped line is passed to ``query`` as a
    patent ID, with ``<output_dir>/<year>`` as the target directory.
    Otherwise every positional argument is itself passed as a patent ID and
    ``output_dir`` is the target directory.

    Args:
        argv: Argument list handed to ``OptionParser.parse_args``.  ``None``
            makes optparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once every task has been added and the ``threaded`` block exits.

    Raises:
        SystemExit: Raised by ``parser.error`` when no positional argument
            is given, or ``--kind`` / ``--detail-kind`` is out of range.
    """
    usage = "usage: %prog [options] yearOrId1 [yearOrId2 ...]"
    parser = OptionParser(usage)
    parser.add_option("-k",
                      "--kind",
                      dest="kind",
                      type="int",
                      default="1",
                      help="patent type(1-4)")
    parser.add_option("-K",
                      "--detail-kind",
                      dest="detail_kind",
                      type="int",
                      default="1",
                      help="1: {} 2: {}".format(*DETAIL_KINDS))
    parser.add_option("-i",
                      "--input-dir",
                      dest="input_dir",
                      default="input",
                      help="input directory(contains ID files)")
    parser.add_option("-o",
                      "--output-dir",
                      dest="output_dir",
                      default="output",
                      help="output directory")
    parser.add_option("-t",
                      "--threads",
                      dest="threads",
                      type="int",
                      default="20",
                      help="number of threads")
    parser.add_option("-T",
                      "--timeout",
                      dest="timeout",
                      type="int",
                      default="5",
                      help="connection timeout")
    parser.add_option("-s",
                      "--start",
                      dest="start",
                      type="int",
                      default="0",
                      help="start index")
    parser.add_option("-e",
                      "--end",
                      dest="end",
                      type="int",
                      default="-1",
                      help="end index")
    parser.add_option("-c",
                      "--check-level",
                      dest="check_level",
                      type="int",
                      default="1",
                      help="0: no check, 1: check file existence, "
                      "2: check file size")
    parser.add_option("-n",
                      "--dry-run",
                      action="store_true",
                      dest="dry_run",
                      help="show what would have been done")
    # Pass argv through so callers can supply their own argument list.
    options, args = parser.parse_args(argv)
    if not args:
        parser.error("missing arguments")

    # Check both selectors explicitly: plain indexing would accept 0 and
    # negative values through Python's negative indexing.
    kind = options.kind - 1  # zero-based index, passed on to query
    if not 0 <= kind < len(KINDS):
        parser.error("kind should be an integer between 1 and {}".format(
            len(KINDS)))
    kind_str = KINDS[kind]

    detail_error = "detail_kind should be an integer between 1 and {}".format(
        len(DETAIL_KINDS))
    detail_index = options.detail_kind - 1
    if not 0 <= detail_index < len(DETAIL_KINDS):
        parser.error(detail_error)
    detail_kind = DETAIL_KINDS[detail_index]
    try:
        # Handlers are looked up by name among the module globals:
        # "<detail_kind>_params" and "<detail_kind>_parse".
        get_params = globals()[detail_kind + "_params"]
        parse = globals()[detail_kind + "_parse"]
    except KeyError:
        parser.error(detail_error)

    input_dir = options.input_dir
    output_dir = options.output_dir
    timeout = options.timeout
    start = options.start
    end = options.end
    check_level = options.check_level
    dry_run = options.dry_run

    # A dry run uses a single worker.
    job_queue = JobQueue(1 if dry_run else options.threads)
    with threaded(job_queue):
        if len(args[0]) == 4:  # assumed years
            for year in args:
                dirname = os.path.join(output_dir, year)
                print("start on patents' {}(kind: {}) in year {}".format(
                    detail_kind, kind_str, year))
                with open(os.path.join(input_dir, year)) as f:
                    # The counter is 2 for the first line, so the window
                    # below selects 1-based line numbers n with
                    # start <= n < end (a negative end means no upper limit).
                    for i, line in enumerate(f, 2):
                        if i > start and (end < 0 or i <= end):
                            job_queue.add_task(query,
                                               get_params,
                                               parse,
                                               kind,
                                               line.strip(),
                                               dirname,
                                               timeout=timeout,
                                               check_level=check_level,
                                               dry_run=dry_run)
        else:  # assumed ids
            for patent_id in args:
                print("start on patent {}'s {}(kind: {})".format(
                    patent_id, detail_kind, kind_str))
                dirname = output_dir
                job_queue.add_task(query,
                                   get_params,
                                   parse,
                                   kind,
                                   patent_id,
                                   dirname,
                                   timeout=timeout,
                                   check_level=check_level,
                                   dry_run=dry_run)
    return 0