Example #1
def main(specific_folder, delete_all, path):
    import_ui = ImporterUI(path, specific_folder, delete_all)
    dir_name = import_ui.source_path()

    # wipe all data first
    if import_ui.is_delete_all():
        truncate_tables(db, (RegisteredVehicle, ))

    importer = DatastoreImporter()
    total = 0
    dir_files = glob.glob("{0}/*.csv".format(dir_name))
    started = datetime.now()
    for fname in dir_files:
        total += importer.import_file(fname)

    db.session.commit()
    db.engine.execute(
        "UPDATE {0} SET city_id = (SELECT id FROM {1} WHERE {0}.search_name = {1}.search_heb) WHERE city_id IS NULL"
        .format(RegisteredVehicle.__tablename__, City.__tablename__))
    logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
Example #2
def main(
    batch_size,
    source,
    load_start_year=None,
):
    try:
        total = 0
        started = datetime.now()
        if source == "s3":
            if load_start_year is None:
                now = datetime.now()
                load_start_year = now.year - 1
            logging.info("Importing data from s3...")
            s3_data_retriever = S3DataRetriever()
            s3_data_retriever.get_files_from_s3(start_year=load_start_year)
            delete_cbs_entries(load_start_year, batch_size)
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                # TODO: make sure the code does not break if the end year does not exist in s3
                for year in range(int(load_start_year), s3_data_retriever.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_data_retriever.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_data_retriever.local_temp_directory)

        elif source == "local_dir_for_tests_only":
            path = "static/data/cbs"
            import_ui = ImporterUI(path)
            dir_name = import_ui.source_path()
            dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all the AccidentMarker, Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                # directory names either start with "H" followed by the year, or start with the year itself
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                parent_directory = os.path.basename(
                    os.path.dirname(os.path.join(os.pardir, directory))
                )
                provider_code = get_provider_code(parent_directory)
                logging.info("Importing Directory " + directory)
                total += import_to_datastore(directory, provider_code, int(year), batch_size)

        fill_db_geo_data()

        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
        create_tables()
    except Exception as ex:
        print("Exception occured while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
Example #3
def main(
    specific_folder,
    delete_all,
    path,
    batch_size,
    delete_start_date,
    load_start_year,
    from_email,
    username="",
    password="",
    email_search_start_date="",
    from_s3=False,
):
    try:
        if not from_email and not from_s3:
            import_ui = ImporterUI(path, specific_folder, delete_all)
            dir_name = import_ui.source_path()

            if specific_folder:
                dir_list = [dir_name]
            else:
                dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all the AccidentMarker, Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            elif delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                # directory names either start with "H" followed by the year, or start with the year itself
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                if int(year) >= int(load_start_year):
                    parent_directory = os.path.basename(
                        os.path.dirname(os.path.join(os.pardir, directory))
                    )
                    provider_code = get_provider_code(parent_directory)
                    logging.info("Importing Directory " + directory)
                    total += import_to_datastore(directory, provider_code, int(year), batch_size)
                else:
                    logging.info(
                        "Importing only starting year {0}. Directory {1} has year {2}".format(
                            load_start_year, directory_name, year
                        )
                    )
        elif from_s3:
            logging.info("Importing data from s3...")
            s3_handler = S3Handler()
            s3_handler.get_files_from_s3(start_year=load_start_year)
            """
            Should be soon implemented as "delete_entries_from_S3"
            """
            # delete_cbs_entries_from_email(provider_code, year, batch_size)
            if delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                for year in range(int(load_start_year), s3_handler.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_handler.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                        cbs_files_dir
                    )
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_handler.local_temp_directory)
        else:
            logging.info("Importing data from mail...")
            temp_dir = tempfile.mkdtemp()
            zip_path = importmail_cbs.main(temp_dir, username, password, email_search_start_date)
            if zip_path is None:
                logging.info("No new cbs files found")
                return
            zip_ref = zipfile.ZipFile(zip_path, "r")
            cbs_files_dir = os.path.join(temp_dir, "cbsfiles")
            if not os.path.exists(cbs_files_dir):
                os.makedirs(cbs_files_dir)
            zip_ref.extractall(cbs_files_dir)
            zip_ref.close()
            preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
            acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(cbs_files_dir)
            provider_code, year = get_file_type_and_year(acc_data_file_path)
            delete_cbs_entries_from_email(provider_code, year, batch_size)
            started = datetime.now()
            total = 0
            logging.info("Importing Directory " + cbs_files_dir)
            total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(temp_dir)

        fill_db_geo_data()

        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))

        create_tables()
    except Exception as ex:
        print("Exception occured while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))