def main(specific_folder, delete_all, path):
    import_ui = ImporterUI(path, specific_folder, delete_all)
    dir_name = import_ui.source_path()

    # wipe all data first
    if import_ui.is_delete_all():
        truncate_tables(db, (RegisteredVehicle,))

    importer = DatastoreImporter()
    total = 0
    dir_files = glob.glob("{0}/*.csv".format(dir_name))
    started = datetime.now()
    for fname in dir_files:
        total += importer.import_file(fname)
    db.session.commit()
    db.engine.execute(
        "UPDATE {0} SET city_id = (SELECT id FROM {1} WHERE {0}.search_name = {1}.search_heb) WHERE city_id IS NULL"
        .format(RegisteredVehicle.__tablename__, City.__tablename__)
    )
    logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
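# The importer above relies on helpers defined elsewhere in the project
# (ImporterUI, DatastoreImporter, truncate_tables, time_delta) and assumes the
# usual module-level imports (glob, logging, datetime) plus the db session and
# models are in scope. A minimal sketch of the two generic helpers, assuming
# SQLAlchemy models; the project's actual implementations may differ:

def truncate_tables_sketch(db, tables):
    # Remove every row from each given model's table, then commit once.
    for table in tables:
        db.session.query(table).delete()
    db.session.commit()


def time_delta_sketch(since):
    # Elapsed wall-clock time since `since`, rendered as a string like "0:01:23".
    return str(datetime.now() - since)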
def main(
    batch_size,
    source,
    load_start_year=None,
):
    try:
        total = 0
        started = datetime.now()
        if source == "s3":
            if load_start_year is None:
                now = datetime.now()
                load_start_year = now.year - 1
            logging.info("Importing data from s3...")
            s3_data_retriever = S3DataRetriever()
            s3_data_retriever.get_files_from_s3(start_year=load_start_year)
            delete_cbs_entries(load_start_year, batch_size)
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                # TODO: make sure that code does not break if end year does not exist in s3
                for year in range(int(load_start_year), s3_data_retriever.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_data_retriever.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_data_retriever.local_temp_directory)
        elif source == "local_dir_for_tests_only":
            path = "static/data/cbs"
            import_ui = ImporterUI(path)
            dir_name = import_ui.source_path()
            dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all AccidentMarker, Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                parent_directory = os.path.basename(
                    os.path.dirname(os.path.join(os.pardir, directory))
                )
                provider_code = get_provider_code(parent_directory)
                logging.info("Importing Directory " + directory)
                total += import_to_datastore(directory, provider_code, int(year), batch_size)

        fill_db_geo_data()
        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
        create_tables()
    except Exception as ex:
        print("Exception occurred while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
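# An illustrative direct call of the loader above; parameter values are
# placeholders only (in practice this entry point is typically wired to the
# project's command-line tooling rather than called inline):

def _example_s3_load():
    main(batch_size=5000, source="s3", load_start_year=2020)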
def main(
    specific_folder,
    delete_all,
    path,
    batch_size,
    delete_start_date,
    load_start_year,
    from_email,
    username="",
    password="",
    email_search_start_date="",
    from_s3=False,
):
    try:
        if not from_email and not from_s3:
            import_ui = ImporterUI(path, specific_folder, delete_all)
            dir_name = import_ui.source_path()

            if specific_folder:
                dir_list = [dir_name]
            else:
                dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all AccidentMarker, Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            elif delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                if int(year) >= int(load_start_year):
                    parent_directory = os.path.basename(
                        os.path.dirname(os.path.join(os.pardir, directory))
                    )
                    provider_code = get_provider_code(parent_directory)
                    logging.info("Importing Directory " + directory)
                    total += import_to_datastore(directory, provider_code, int(year), batch_size)
                else:
                    logging.info(
                        "Importing only starting year {0}. Directory {1} has year {2}".format(
                            load_start_year, directory_name, year
                        )
                    )
        elif from_s3:
            logging.info("Importing data from s3...")
            s3_handler = S3Handler()
            s3_handler.get_files_from_s3(start_year=load_start_year)

            # Should be soon implemented as "delete_entries_from_S3"
            # delete_cbs_entries_from_email(provider_code, year, batch_size)
            if delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                for year in range(int(load_start_year), s3_handler.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_handler.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                        cbs_files_dir
                    )
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_handler.local_temp_directory)
        else:
            logging.info("Importing data from mail...")
            temp_dir = tempfile.mkdtemp()
            zip_path = importmail_cbs.main(temp_dir, username, password, email_search_start_date)
            if zip_path is None:
                logging.info("No new cbs files found")
                return
            zip_ref = zipfile.ZipFile(zip_path, "r")
            cbs_files_dir = os.path.join(temp_dir, "cbsfiles")
            if not os.path.exists(cbs_files_dir):
                os.makedirs(cbs_files_dir)
            zip_ref.extractall(cbs_files_dir)
            zip_ref.close()
            preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
            acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(cbs_files_dir)
            provider_code, year = get_file_type_and_year(acc_data_file_path)
            delete_cbs_entries_from_email(provider_code, year, batch_size)
            started = datetime.now()
            total = 0
            logging.info("Importing Directory " + cbs_files_dir)
            total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(temp_dir)

        fill_db_geo_data()
        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))

        create_tables()
    except Exception as ex:
        print("Exception occurred while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
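# Illustrative invocations of the loader above, one per source branch, based
# only on its signature; all values below are placeholders, not real paths,
# years, or credentials:

def _example_cbs_loads():
    # Local directory import, wiping AccidentMarker/Vehicle/Involved first.
    main(
        specific_folder=False,
        delete_all=True,
        path="static/data/cbs",
        batch_size=5000,
        delete_start_date=None,
        load_start_year=2020,
        from_email=False,
    )
    # S3 import, deleting entries from a start date before reloading.
    main(
        specific_folder=False,
        delete_all=False,
        path="",
        batch_size=5000,
        delete_start_date="2020-01-01",
        load_start_year=2020,
        from_email=False,
        from_s3=True,
    )
    # Email import, pulling the newest CBS zip from the configured mailbox.
    main(
        specific_folder=False,
        delete_all=False,
        path="",
        batch_size=5000,
        delete_start_date=None,
        load_start_year=2020,
        from_email=True,
        username="user@example.com",
        password="app-password",
        email_search_start_date="01.01.2020",
    )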