Example #1
0
def get_resource():
    """Yield filtered committee-session rows and archive their parts files.

    Filters the module-level ``resources`` iterable down to Knesset 20
    sessions starting in 2017/2018 that have a ``parts_file_name``, yields
    each matching row, updates the module-level ``stats`` counters, and
    (unless ``dry_run``) downloads each parts file and stores it in the
    ``out_file`` zip archive as ``<CommitteeSessionID>.csv``.

    Uses ``with zipfile.ZipFile(...)`` so the archive is properly closed
    (and its central directory written) even if iteration stops early or
    an exception propagates out of the generator.
    """
    with zipfile.ZipFile(out_file, "w") as z:
        for resource in resources:
            for row in resource:
                parts_file_name = row["parts_file_name"]
                if not parts_file_name:
                    continue
                if row["KnessetNum"] != 20:
                    continue
                if row["StartDate"].year not in (2017, 2018):
                    continue
                # files_limit of 0/None means "no limit"
                if files_limit and stats["num files"] >= files_limit:
                    continue
                yield row
                stats["num files"] += 1
                # parts_file_size may be None/0 - count missing sizes as 0
                stats["total size bytes"] += row["parts_file_size"] or 0
                if not dry_run:
                    content = get_retry_response_content(
                        "https://storage.googleapis.com/knesset-data-pipelines/{}".
                        format(parts_file_name),
                        None,
                        None,
                        None,
                        retry_num=1,
                        num_retries=3,
                        seconds_between_retries=2)
                    with temp_file() as filename:
                        with open(filename, "wb") as f:
                            f.write(content)
                        z.write(filename,
                                "{}.csv".format(row["CommitteeSessionID"]))
Example #2
0
def parse_protocol(row):
    """Parse one committee-session protocol into its output file.

    Resolves filenames via ``get_filenames(row)`` and then:
      * if the output already exists, reuse it (counts toward
        "existing files" in the module-level ``stats``);
      * else if the downloaded source exists, parse it (plain text or
        header/body CSV depending on the module-level ``parse_type``)
        into a temp file and copy it into place atomically-ish;
      * else log a warning and return empty results.

    Returns:
        tuple: ``(ext, output_filename, filesize, crc32c)`` — all
        ``None``/0 when the download file is missing.
    """
    original_filename, ext, output_filename, full_output_filename, download_filename = get_filenames(
        row)
    if os.path.exists(full_output_filename):
        # Output was produced on a previous run - reuse it.
        # (The original code logged this file twice; keep a single log line.)
        stats["existing files"] += 1
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('existing file: {}'.format(full_output_filename))
    elif os.path.exists(download_filename):
        with open(download_filename, "rb") as f:
            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                os.makedirs(os.path.dirname(full_output_filename),
                            exist_ok=True)
                # Write to a temp file first so a failed parse never leaves
                # a partial file at full_output_filename.
                with utils.temp_file() as temp_filename:
                    with open(temp_filename, "w") as of:
                        if parse_type == "text":
                            of.write(protocol.text)
                        else:
                            csv_writer = csv.writer(of)
                            csv_writer.writerow(["header", "body"])
                            for part in protocol.parts:
                                csv_writer.writerow([part.header, part.body])
                    shutil.copy(temp_filename, full_output_filename)
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('parsed file: {}'.format(full_output_filename))
        stats["parsed files"] += 1
    else:
        logging.warning('missing document committee session file: {}'.format(
            download_filename))
        ext, output_filename, filesize, crc32c = None, None, 0, None
    return ext, output_filename, filesize, crc32c
def csv_writer(s3, bucket, object_name, public_bucket=False):
    """Yield a ``csv.writer`` over a temp file, then upload it to S3.

    Generator intended to be used as a context manager (presumably wrapped
    with ``contextlib.contextmanager`` at the call/definition site — TODO
    confirm): the caller writes rows via the yielded writer; on resume the
    file is closed and uploaded via ``write(...)``.

    Args:
        s3: S3 client/handle passed through to ``write``.
        bucket: target bucket name.
        object_name: target object key.
        public_bucket: forwarded to ``write`` unchanged.
    """
    with utils.temp_file() as filename:
        # newline="" is required by the csv module docs; without it the
        # writer emits blank rows on Windows (\r\r\n line endings).
        with open(filename, "w", newline="") as f:
            yield csv.writer(f)
        write(s3,
              bucket,
              object_name,
              file_name=filename,
              public_bucket=public_bucket)
def parse_protocol(output_filename, protocol):
    """Write a parsed protocol to ``out_path``/``output_filename``.

    Behavior depends on the module-level ``parse_type``:
      * ``"text"``  - write ``protocol.text`` directly to the output file;
      * ``"parts"`` - write a header/body CSV via a temp file, then copy it
        into place so a failed parse leaves no partial output;
      * anything else raises ``NotImplementedError``.

    Updates the module-level ``stats["parsed files"]`` counter on success.
    """
    full_output_filename = out_path + "/" + output_filename
    os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
    if parse_type == "text":
        with open(full_output_filename, "w") as of:
            of.write(protocol.text)
            stats["parsed files"] += 1
    elif parse_type == "parts":
        with utils.temp_file() as filename:
            # newline="" per the csv module docs - avoids blank rows on
            # Windows. Local renamed from csv_writer to writer so it no
            # longer shadows the module-level csv_writer() helper.
            with open(filename, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(["header", "body"])
                for part in protocol.parts:
                    writer.writerow([part.header, part.body])
            shutil.copy(filename, full_output_filename)
            stats["parsed files"] += 1
    else:
        raise NotImplementedError
Example #5
0
def process_row(row, row_index, resource_descriptor, resource_index,
                parameters, stats):
    """Datapackage-pipelines row processor: parse a session document.

    For rows of the 'kns_documentcommitteesession' resource that are Word
    documents (GroupTypeID 23, ApplicationDesc 'DOC', .doc/.docx FilePath),
    parses the downloaded protocol into text or parts-CSV output (per
    parameters['type']) and fills in the per-type result columns on the row:
    <type>_protocol_extension, <type>_parsed_filename, <type>_filesize,
    <type>_crc32c, <type>_error. A crc32c-based hash file is used as a cache
    key so unchanged downloads are not re-parsed. Mutates and returns ``row``
    and updates ``stats`` counters.
    """
    if resource_descriptor['name'] == 'kns_documentcommitteesession':
        t = parameters['type']
        # Reset all output columns so stale values never leak through.
        row[t + "_protocol_extension"] = None
        row[t + "_parsed_filename"] = None
        row[t + "_filesize"] = 0
        row[t + "_crc32c"] = None
        row[t + "_error"] = None
        # Only Word protocol documents are parseable here.
        if (row['GroupTypeID'] == 23 and row['ApplicationDesc'] == 'DOC'
                and (row["FilePath"].lower().endswith('.doc')
                     or row["FilePath"].lower().endswith('.docx'))):
            # Stable identifier used only for log messages.
            document_id = "{}-{}-{}".format(row["GroupTypeID"],
                                            row["DocumentCommitteeSessionID"],
                                            row["ApplicationDesc"])
            original_filename, ext, output_filename, full_output_filename, download_filename, full_output_hash_filename = get_filenames(
                row, parameters)
            if os.path.exists(download_filename) and row.get(
                    'download_crc32c'):
                # Cache key: hash of the download's crc32c. If it matches the
                # stored hash file, the existing parse output is still valid.
                m = BASE_HASH_OBJ.copy()
                m.update(row['download_crc32c'].encode())
                new_cache_hash = m.hexdigest()
                if os.path.exists(full_output_filename) and os.path.exists(
                        full_output_hash_filename):
                    with open(full_output_hash_filename) as f:
                        old_cache_hash = f.read()
                else:
                    old_cache_hash = None
                if old_cache_hash and new_cache_hash and new_cache_hash == old_cache_hash:
                    # Cache hit - reuse the previously parsed output.
                    stats[t + ": existing files"] += 1
                    row[t + "_protocol_extension"] = ext
                    row[t + "_parsed_filename"] = output_filename
                    row[t +
                        "_filesize"] = os.path.getsize(full_output_filename)
                    row[t + "_crc32c"] = get_crc32c(full_output_filename)
                elif parameters.get('files-limit') and parameters[
                        'files-limit'] <= stats[t + ": parsed files"]:
                    # Optional cap on how many files get parsed per run.
                    row[t + "_error"] = 'reached files-limit, skipping'
                    stats[t + ": skipped files"] += 1
                else:
                    error_string = None
                    try:
                        with open(download_filename, "rb") as f:
                            with CommitteeMeetingProtocol.get_from_file(
                                    f) as protocol:
                                os.makedirs(
                                    os.path.dirname(full_output_filename),
                                    exist_ok=True)
                                # Parse into a temp file first so a failure
                                # never leaves a partial output file behind.
                                with utils.temp_file() as temp_filename:
                                    with open(temp_filename, "w") as of:
                                        if parameters['type'] == "text":
                                            of.write(protocol.text)
                                        else:
                                            csv_writer = csv.writer(of)
                                            csv_writer.writerow(
                                                ["header", "body"])
                                            for part in protocol.parts:
                                                csv_writer.writerow(
                                                    [part.header, part.body])
                                    shutil.copy(temp_filename,
                                                full_output_filename)
                    except Exception as e:
                        # Parsing failures are recorded on the row, not
                        # raised - the pipeline keeps processing other rows.
                        logging.exception(
                            'exception parsing protocol for {}'.format(
                                document_id))
                        try:
                            error_string = str(e)
                        except Exception:
                            error_string = 'unexpected exception'
                    if error_string:
                        row[t + "_error"] = error_string
                        stats[t + ': errored files'] += 1
                    else:
                        row[t + "_protocol_extension"] = ext
                        row[t + "_parsed_filename"] = output_filename
                        row[t + "_filesize"] = os.path.getsize(
                            full_output_filename)
                        row[t + "_crc32c"] = get_crc32c(full_output_filename)
                        stats[t + ": parsed files"] += 1
                        # Persist the cache key only after a successful parse.
                        with open(full_output_hash_filename, 'w') as f:
                            f.write(new_cache_hash)
            else:
                # NOTE(review): this branch is also taken when the file exists
                # but download_crc32c is empty - the message may be misleading
                # in that case.
                row[t + "_error"] = 'missing download file'
                stats[t + ': missing download files'] += 1
    return row
def temp_download(s3, bucket, object_name):
    """Download an S3 object into a temp file and yield the local path.

    Generator intended for context-manager use: the temp file lives only
    for the duration of the caller's ``with`` body.
    """
    with utils.temp_file() as local_path:
        download(s3, bucket, object_name, local_path)
        yield local_path