Example #1
0
def parser_main(): 
    """Interactively parse the queued files and collect their transactions.

    Prints the paths returned as ``files_to_ignore`` by
    ``find_files_to_process()``, reports how many files and bytes are
    queued, and then reads one line from stdin. Any answer that is a
    case-insensitive prefix of ``'yes'`` (an empty line included) continues;
    anything else returns immediately.

    Each queued file is handed to ``parse_file()``; the first transaction
    seen for an award id is stored in the local ``transactions`` dict.

    NOTE(review): this listing appears to stop inside the file loop.
    ``begin_time``, ``bytes_processed``, ``failed_lines`` and
    ``failed_files`` are assigned but never read here, and the two file
    handles are not closed in the visible code. Confirm against the full
    function.
    """
    (bytes_to_process,
     files_to_process,
     files_to_ignore) = find_files_to_process()

    # List every path that was reported as unparseable.
    for path in files_to_ignore:
        print "Unparseable filename: {0}".format(os.path.basename(path))

    print "Files to process: {0}".format(len(files_to_process))
    print "Bytes to process: {0}".format(pretty_bytes(bytes_to_process))
    print "Continue?"
    user_input = raw_input()
    # The empty string is a prefix of 'yes', so pressing Enter continues too.
    if not 'yes'.startswith(user_input.lower()):
       return

    # Maps award_id -> the first transaction parsed for that id.
    transactions = {}

    # NOTE(review): file() is the Python 2-only spelling of open(); neither
    # handle is written to or closed in the visible part of this function.
    failed_lines = file(os.path.join(DATA_DIR, 'failed_lines.out'), 'w')
    failed_files = file(os.path.join(DATA_DIR, 'failed_files.out'), 'w')
    
    begin_time = time.time()
    # Each entry unpacks as ((filepath, import_date, filesize),
    # bytes_processed); files_processed counts from 1.
    for files_processed, ((filepath, import_date, filesize), bytes_processed) in enumerate(files_to_process, start=1):
        try:
            print
            print "Parsing {0}".format(os.path.basename(filepath))
            file_transactions = parse_file(filepath, import_date)
            for (award_id, t) in file_transactions:
                # First occurrence wins; later duplicates are ignored.
                if award_id not in transactions:
                    transactions[award_id] = t

        # NOTE(review): "except X, name" is Python 2-only syntax.
        except UnicodeDecodeError, error:
            # NOTE(review): db is not defined in this function; presumably a
            # module-level name -- verify.
            log_error(db, filepath, "Unable to parse file: {0}".format(unicode(error)))

        except KeyboardInterrupt:
            # Ctrl-C abandons the remaining files instead of propagating.
            break
Example #2
0
def confirm_download_schedule(schedule):
    """Reports the total number of bytes and total number of files
    to download. Also lists the inaccessible files (based on HEAD
    response). Then asks user to confirm downloading.
    """
    def content_length(tpl):
        return tpl[2][1]

    def status_code(tpl):
        return tpl[2][0]

    def href(tpl):
        return tpl[0]

    def is_OK(tpl):
        return status_code(tpl) == 200

    def not_OK(tpl):
        return status_code(tpl) != 200

    increment = lambda x, _: x + 1
    file_count = (
        schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0))

    bytes_to_download = (
        schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum)

    inaccessible_files = (schedule >> stream.filter(not_OK) >> list)

    if len(inaccessible_files) > 0:
        print
        print "Some files are inaccessible:"
        for (idx, sched) in enumerate(inaccessible_files):
            print "%d: %d %s" % (idx, status_code(sched), href(sched))

    if bytes_to_download > 0:
        print
        print "Need to download %s in %d files." % (
            pretty_bytes(bytes_to_download), file_count)
        print
        print "Are you sure you want to continue? [Y/n]"
        user_input = raw_input("> ")
        return (user_input.upper() in ("", "Y", "YES"))
    else:
        print
        print "Nothing to download."
        return False
Example #3
0
def confirm_download_schedule(schedule):
    """Reports the total number of bytes and total number of files
    to download. Also lists the inaccessible files (based on HEAD
    response). Then asks user to confirm downloading.
    """

    def content_length(tpl):
        return tpl[2][1]

    def status_code(tpl):
        return tpl[2][0]

    def href(tpl):
        return tpl[0]

    def is_OK(tpl):
        return status_code(tpl) == 200

    def not_OK(tpl):
        return status_code(tpl) != 200

    increment = lambda x, _: x + 1
    file_count = schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0)

    bytes_to_download = schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum

    inaccessible_files = schedule >> stream.filter(not_OK) >> list

    if len(inaccessible_files) > 0:
        print
        print "Some files are inaccessible:"
        for (idx, sched) in enumerate(inaccessible_files):
            print "%d: %d %s" % (idx, status_code(sched), href(sched))

    if bytes_to_download > 0:
        print
        print "Need to download %s in %d files." % (pretty_bytes(bytes_to_download), file_count)
        print
        print "Are you sure you want to continue? [Y/n]"
        user_input = raw_input("> ")
        return user_input.upper() in ("", "Y", "YES")
    else:
        print
        print "Nothing to download."
        return False
Example #4
0
def offer_resume():
    schedule_path = schedule_file_path()
    if os.path.exists(schedule_path):
        try:
            schedule = restore_schedule()
        except EOFError:
            print "Deleting corrupt download schedule."
            os.remove(schedule_path)
            return False

        size = os.path.getsize(schedule_path)
        print "A download schedule file exists:"
        print "    %s (%s)" % (schedule_path, pretty_bytes(size))
        print "You can either resume or delete this schedule."
        print "What would you like to do?"
        user_input = raw_input("[r]esume or [d]elete> ")
        if "DELETE".startswith(user_input.upper()):
            os.remove(schedule_path)
            return False
        else:
            return True
    else:
        return False
Example #5
0
def offer_resume():
    schedule_path = schedule_file_path()
    if os.path.exists(schedule_path):
        try:
            schedule = restore_schedule()
        except EOFError:
            print "Deleting corrupt download schedule."
            os.remove(schedule_path)
            return False

        size = os.path.getsize(schedule_path)
        print "A download schedule file exists:"
        print "    %s (%s)" % (schedule_path, pretty_bytes(size))
        print "You can either resume or delete this schedule."
        print "What would you like to do?"
        user_input = raw_input("[r]esume or [d]elete> ")
        if 'DELETE'.startswith(user_input.upper()):
            os.remove(schedule_path)
            return False
        else:
            return True
    else:
        return False
Example #6
0
def parser_main():
    """Confirm with the user, then parse each queued file into a dict.

    ``find_files_to_process()`` supplies the byte total, the files to
    process and the files to ignore. The ignored paths are printed, the
    totals are shown, and one line is read from stdin: the run continues
    only when that line is a case-insensitive prefix of ``'yes'`` (which
    includes an empty line).

    For every queued file the result of ``parse_file()`` is iterated as
    ``(award_id, transaction)`` pairs, and only the first transaction per
    award id is kept.

    NOTE(review): the visible listing ends inside the loop. The locals
    ``begin_time``, ``bytes_processed``, ``failed_lines`` and
    ``failed_files`` are never read here, and the opened files are never
    closed here. Check the complete function.
    """
    (bytes_to_process, files_to_process,
     files_to_ignore) = find_files_to_process()

    # Show which paths were rejected before asking for confirmation.
    for path in files_to_ignore:
        print "Unparseable filename: {0}".format(os.path.basename(path))

    print "Files to process: {0}".format(len(files_to_process))
    print "Bytes to process: {0}".format(pretty_bytes(bytes_to_process))
    print "Continue?"
    user_input = raw_input()
    # Prefix match against 'yes': "", "y", "ye" and "yes" all proceed.
    if not 'yes'.startswith(user_input.lower()):
        return

    # award_id -> first transaction encountered for that id.
    transactions = {}

    # NOTE(review): file() exists only in Python 2 (open() is the usual
    # spelling); these handles are not used or closed in the visible code.
    failed_lines = file(os.path.join(DATA_DIR, 'failed_lines.out'), 'w')
    failed_files = file(os.path.join(DATA_DIR, 'failed_files.out'), 'w')

    begin_time = time.time()
    # files_processed is a 1-based counter; each entry is a pair of
    # (filepath, import_date, filesize) and bytes_processed.
    for files_processed, ((filepath, import_date, filesize),
                          bytes_processed) in enumerate(files_to_process,
                                                        start=1):
        try:
            print
            print "Parsing {0}".format(os.path.basename(filepath))
            file_transactions = parse_file(filepath, import_date)
            for (award_id, t) in file_transactions:
                # Keep the earliest transaction; skip repeats of the same id.
                if award_id not in transactions:
                    transactions[award_id] = t

        # NOTE(review): comma form of "except" is Python 2-only syntax.
        except UnicodeDecodeError, error:
            # NOTE(review): db is not bound inside this function; presumably
            # a module-level name -- verify.
            log_error(db, filepath,
                      "Unable to parse file: {0}".format(unicode(error)))

        except KeyboardInterrupt:
            # Stop processing further files on Ctrl-C rather than crashing.
            break
Example #7
0
                    transactions[award_id] = t

        except UnicodeDecodeError, error:
            log_error(db, filepath,
                      "Unable to parse file: {0}".format(unicode(error)))

        except KeyboardInterrupt:
            break

        now_time = time.time()
        bytes_per_second = bytes_processed / max(now_time - begin_time, 1)
        bytes_processed_pct = bytes_processed * 100 / bytes_to_process
        eta_seconds = (bytes_to_process - bytes_processed) / max(
            bytes_per_second, 1)
        print "{0}/{1} ({2}%), {3}/s, ETA {4}".format(
            pretty_bytes(bytes_processed), pretty_bytes(bytes_to_process),
            bytes_processed_pct, pretty_bytes(bytes_per_second),
            pretty_seconds(eta_seconds))

    failed_lines.close()
    failed_files.close()

    print "Dumping awards dictionary..."
    with file(os.path.join(DATA_DIR, 'cfda_awards.out.bin'), 'wb') as outf:
        pickle.dump(transactions, outf)


def fix_prefix(prefix):
    for stem in [
            'VA', 'DHS', 'HUD', 'USAID', 'DOJ', 'USTREAS', 'DOE', 'DOI',
            'IMLS', 'DOC'
Example #8
0
            for (award_id, t) in file_transactions:
                if award_id not in transactions:
                    transactions[award_id] = t

        except UnicodeDecodeError, error:
            log_error(db, filepath, "Unable to parse file: {0}".format(unicode(error)))

        except KeyboardInterrupt:
            break

        now_time = time.time()
        bytes_per_second = bytes_processed / max(now_time - begin_time, 1)
        bytes_processed_pct = bytes_processed * 100 / bytes_to_process
        eta_seconds = (bytes_to_process - bytes_processed) / max(bytes_per_second, 1)
        print "{0}/{1} ({2}%), {3}/s, ETA {4}".format(
            pretty_bytes(bytes_processed),
            pretty_bytes(bytes_to_process),
            bytes_processed_pct,
            pretty_bytes(bytes_per_second),
            pretty_seconds(eta_seconds))

    failed_lines.close()
    failed_files.close()
   
    print "Dumping awards dictionary..."
    with file(os.path.join(DATA_DIR, 'cfda_awards.out.bin'), 'wb') as outf:
        pickle.dump(transactions, outf)


def fix_prefix(prefix):
    for stem in ['VA', 'DHS', 'HUD', 'USAID', 'DOJ', 'USTREAS', 'DOE', 'DOI', 'IMLS', 'DOC']: