def import_to_datastore(path, batch_size):
    total = 0
    try:
        assert batch_size > 0
        dir_list = glob.glob("{0}/*".format(path))
        for directory in sorted(dir_list):
            started = datetime.now()
            # Re-importing a directory: drop its existing rows first.
            delete_traffic_volume_of_directory(directory)
            traffic_volume_rows = get_traffic_volume_rows(directory)
            new_items = 0
            logging.info("inserting " + str(len(traffic_volume_rows)) + " new traffic data rows")
            for traffic_volume_chunk in chunks(traffic_volume_rows, batch_size):
                db.session.bulk_insert_mappings(TrafficVolume, traffic_volume_chunk)
                db.session.commit()
                # Count only this chunk; adding the whole list on every
                # iteration would over-report by a factor of the chunk count.
                new_items += len(traffic_volume_chunk)
            total += new_items
            logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        db.session.commit()
        # Return the cumulative count across all directories, so callers can
        # log a meaningful total.
        return total
    except Exception:
        error = "Traffic Volume import succeeded partially with {0} traffic data rows".format(total)
        raise Exception(error)
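# The importers in this module lean on two helpers, ``chunks`` and
# ``time_delta``, defined elsewhere in the project. A minimal sketch of the
# behavior they are assumed to have (the real implementations may differ):
def chunks(items, batch_size):
    """Yield successive batch_size-sized slices of ``items``."""
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]


def time_delta(started):
    """Return a human-readable string of the time elapsed since ``started``."""
    return str(datetime.now() - started)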
def import_to_datastore(directory, provider_code, year, batch_size):
    """Goes through all the files in a given directory, parses and commits them."""
    try:
        assert batch_size > 0
        files_from_cbs = get_files(directory)
        if len(files_from_cbs) == 0:
            return 0
        logging.info("Importing '{}'".format(directory))
        started = datetime.now()

        # Import the dictionary (code-to-description lookup) tables first.
        fill_dictionary_tables(files_from_cbs[DICTIONARY], provider_code, year)

        new_items = 0
        accidents_count = import_accidents(**files_from_cbs)
        new_items += accidents_count
        involved_count = import_involved(**files_from_cbs)
        new_items += involved_count
        vehicles_count = import_vehicles(**files_from_cbs)
        new_items += vehicles_count

        logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        return new_items
    except ValueError as e:
        failed_dirs[directory] = str(e)
        if "Not found" in str(e):
            return 0
        raise e
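# ``get_files`` above is assumed to return a dict mapping logical CBS file
# keys (e.g. DICTIONARY) to parsed file contents; that is why its result can
# be splatted with ** into import_accidents / import_involved /
# import_vehicles as keyword arguments.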
def parse(schools_description_filepath, schools_coordinates_filepath, batch_size):
    started = datetime.now()
    total = import_to_datastore(
        schools_description_filepath=schools_description_filepath,
        schools_coordinates_filepath=schools_coordinates_filepath,
        batch_size=batch_size,
    )
    # Populate the PostGIS geometry column for rows that don't have one yet.
    db.session.execute(
        "UPDATE schools_with_description "
        "SET geom = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) "
        "WHERE geom IS NULL;"
    )
    logging.info("Total: {0} schools in {1}".format(total, time_delta(started)))
def import_to_datastore(filepath, batch_size):
    new_items = 0
    try:
        assert batch_size > 0
        started = datetime.now()
        schools = get_schools(filepath)
        # Skip schools that are already in the database.
        all_existing_schools_ids = set(
            school_id for (school_id,) in db.session.query(School.id).all()
        )
        schools = [school for school in schools if school["id"] not in all_existing_schools_ids]
        logging.info("inserting " + str(len(schools)) + " new schools")
        for schools_chunk in chunks(schools, batch_size):
            db.session.bulk_insert_mappings(School, schools_chunk)
            db.session.commit()
            # Count only this chunk, not the whole list, on each iteration.
            new_items += len(schools_chunk)
        logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        return new_items
    except Exception:
        error = "Schools import succeeded partially with {0} schools".format(new_items)
        raise Exception(error)
def main(specific_folder, delete_all, path):
    import_ui = ImporterUI(path, specific_folder, delete_all)
    dir_name = import_ui.source_path()

    # Wipe all existing data first, if requested.
    if import_ui.is_delete_all():
        truncate_tables(db, (RegisteredVehicle,))

    importer = DatastoreImporter()
    total = 0
    dir_files = glob.glob("{0}/*.csv".format(dir_name))
    started = datetime.now()
    for fname in dir_files:
        total += importer.import_file(fname)
    db.session.commit()
    # Link each registered-vehicle row to its city by matching search names.
    db.engine.execute(
        "UPDATE {0} SET city_id = "
        "(SELECT id FROM {1} WHERE {0}.search_name = {1}.search_heb) "
        "WHERE city_id IS NULL".format(RegisteredVehicle.__tablename__, City.__tablename__)
    )
    logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
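# ``truncate_tables`` above is assumed to wipe the rows of the given model
# tables ahead of a full re-import. A minimal sketch under that assumption
# (the project helper may issue TRUNCATE and reset sequences instead):
def truncate_tables(db, tables):
    for table in tables:
        db.session.query(table).delete()
    db.session.commit()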
def import_to_datastore(schools_description_filepath, schools_coordinates_filepath, batch_size):
    new_items = 0
    try:
        assert batch_size > 0
        started = datetime.now()
        schools = get_schools_with_description(
            schools_description_filepath, schools_coordinates_filepath
        )
        # This import replaces the whole table, so clear it first.
        truncate_schools_with_description()
        logging.info("inserting " + str(len(schools)) + " new schools")
        for schools_chunk in chunks(schools, batch_size):
            db.session.bulk_insert_mappings(SchoolWithDescription, schools_chunk)
            db.session.commit()
            new_items += len(schools_chunk)
        logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        return new_items
    except Exception:
        error = "Schools import succeeded partially with {0} schools".format(new_items)
        raise Exception(error)
def parse(filepath, batch_size):
    started = datetime.now()
    total = import_to_datastore(filepath, batch_size)
    logging.info("Total: {0} schools in {1}".format(total, time_delta(started)))
def main(detach_dir, username=None, password=None, email_search_start_date=""):
    try:
        username = username or os.environ.get("MAILUSER")
        password = password or os.environ.get("MAILPASS")
        if not username:
            logging.error(
                "Username not set. Please set env var MAILUSER or use the --username argument"
            )
        if not password:
            logging.error(
                "Password not set. Please set env var MAILPASS or use the --password argument"
            )
        if not username or not password:
            exit()

        imapsession = imaplib.IMAP4_SSL("imap.gmail.com")
        try:
            imapsession.login(username, password)
        except imaplib.IMAP4.error:
            logging.error("Bad credentials, unable to sign in!")
            exit()

        try:
            imapsession.select(mail_dir)
            if email_search_start_date == "":
                typ, data = imapsession.search(None, "ALL")
            else:
                # IMAP SINCE expects dates in the form "01-Jan-2020".
                search_start_date = datetime.strptime(
                    email_search_start_date, "%d.%m.%Y"
                ).strftime("%d-%b-%Y")
                typ, data = imapsession.search(None, '(SINCE "{0}")'.format(search_start_date))
        except imaplib.IMAP4.error:
            logging.error("Error searching given mailbox: %s" % mail_dir)
            exit()

        file_found = False
        if not os.path.exists(detach_dir):
            os.makedirs(detach_dir)
        total = 0

        # Iterate over all matching emails.
        started = datetime.now()
        logging.info("Login successful! Importing files, please hold...")
        filepath = None
        for msgId in data[0].split():
            typ, message_parts = imapsession.fetch(msgId, "(RFC822)")
            if typ != "OK":
                logging.error("Error fetching mail.")
                raise Exception("Error fetching mail")

            email_body = message_parts[0][1]
            # imaplib returns the raw message as bytes under Python 3, so
            # parse with message_from_bytes rather than message_from_string.
            mail = email.message_from_bytes(email_body)
            # The Date header may carry either a short or a long timezone
            # suffix; try both lengths.
            try:
                mtime = datetime.strptime(mail["Date"][:-6], "%a, %d %b %Y %H:%M:%S")
            except ValueError:
                mtime = datetime.strptime(mail["Date"][:-12], "%a, %d %b %Y %H:%M:%S")

            for part in mail.walk():
                if (part.get_content_maintype() == "multipart"
                        or part.get("Content-Disposition") is None):
                    continue
                filename = part.get_filename()
                if bool(filename) and filename.endswith(".zip"):
                    filename = "{0}-{1}_{2}-{3}.zip".format(
                        "cbs_data", mtime.date(), mtime.hour, mtime.minute
                    )
                    filepath = os.path.join(detach_dir, filename)
                    # Skip attachments that were already downloaded.
                    if os.path.isfile(filepath):
                        break
                    total += 1
                    print("Currently loading: " + filename + " ")
                    sys.stdout.write("\033[F")  # move the cursor back up one line
                    time.sleep(0.1)
                    with open(filepath, "wb") as fp:
                        fp.write(part.get_payload(decode=True))
                    file_found = True
            if file_found:
                break

        logging.info("Imported {0} file(s) in {1}".format(total, time_delta(started)))
        imapsession.close()
        imapsession.logout()
        return filepath
    except Exception:
        # TODO: send an error email to the anyway address instead of
        # swallowing the failure; at least log it for now.
        logging.exception("Error importing mail attachments")
def main(batch_size, source, load_start_year=None):
    try:
        total = 0
        started = datetime.now()
        if source == "s3":
            if load_start_year is None:
                now = datetime.now()
                load_start_year = now.year - 1
            logging.info("Importing data from s3...")
            s3_data_retriever = S3DataRetriever()
            s3_data_retriever.get_files_from_s3(start_year=load_start_year)
            delete_cbs_entries(load_start_year, batch_size)
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                # TODO: make sure that the code does not break if the end
                # year does not exist in s3.
                for year in range(int(load_start_year), s3_data_retriever.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_data_retriever.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_data_retriever.local_temp_directory)
        elif source == "local_dir_for_tests_only":
            path = "static/data/cbs"
            import_ui = ImporterUI(path)
            dir_name = import_ui.source_path()
            dir_list = glob.glob("{0}/*/*".format(dir_name))

            # Wipe all the AccidentMarker, Vehicle and Involved data first.
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))

            for directory in sorted(dir_list):
                directory_name = os.path.basename(os.path.normpath(directory))
                # Directory names are either "YYYY..." or "HYYYY...".
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                parent_directory = os.path.basename(
                    os.path.dirname(os.path.join(os.pardir, directory))
                )
                provider_code = get_provider_code(parent_directory)
                logging.info("Importing Directory " + directory)
                total += import_to_datastore(directory, provider_code, int(year), batch_size)

        fill_db_geo_data()
        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
        create_tables()
    except Exception as ex:
        print("Exception occurred while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
def main(
    specific_folder,
    delete_all,
    path,
    batch_size,
    delete_start_date,
    load_start_year,
    from_email,
    username="",
    password="",
    email_search_start_date="",
    from_s3=False,
):
    try:
        if not from_email and not from_s3:
            import_ui = ImporterUI(path, specific_folder, delete_all)
            dir_name = import_ui.source_path()

            if specific_folder:
                dir_list = [dir_name]
            else:
                dir_list = glob.glob("{0}/*/*".format(dir_name))

            # Wipe all the AccidentMarker, Vehicle and Involved data first.
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            elif delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)

            started = datetime.now()
            total = 0
            for directory in sorted(dir_list):
                directory_name = os.path.basename(os.path.normpath(directory))
                # Directory names are either "YYYY..." or "HYYYY...".
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                if int(year) >= int(load_start_year):
                    parent_directory = os.path.basename(
                        os.path.dirname(os.path.join(os.pardir, directory))
                    )
                    provider_code = get_provider_code(parent_directory)
                    logging.info("Importing Directory " + directory)
                    total += import_to_datastore(directory, provider_code, int(year), batch_size)
                else:
                    logging.info(
                        "Importing only starting year {0}. Directory {1} has year {2}".format(
                            load_start_year, directory_name, year
                        )
                    )
        elif from_s3:
            logging.info("Importing data from s3...")
            s3_handler = S3Handler()
            s3_handler.get_files_from_s3(start_year=load_start_year)
            # Should soon be implemented as "delete_entries_from_S3":
            # delete_cbs_entries_from_email(provider_code, year, batch_size)
            if delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                for year in range(int(load_start_year), s3_handler.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_handler.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_handler.local_temp_directory)
        else:
            logging.info("Importing data from mail...")
            temp_dir = tempfile.mkdtemp()
            zip_path = importmail_cbs.main(temp_dir, username, password, email_search_start_date)
            if zip_path is None:
                logging.info("No new cbs files found")
                return
            zip_ref = zipfile.ZipFile(zip_path, "r")
            cbs_files_dir = os.path.join(temp_dir, "cbsfiles")
            if not os.path.exists(cbs_files_dir):
                os.makedirs(cbs_files_dir)
            zip_ref.extractall(cbs_files_dir)
            zip_ref.close()
            preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
            # Derive the provider code and year from the accidents data file.
            acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(cbs_files_dir)
            provider_code, year = get_file_type_and_year(acc_data_file_path)
            delete_cbs_entries_from_email(provider_code, year, batch_size)
            started = datetime.now()
            total = 0
            logging.info("Importing Directory " + cbs_files_dir)
            total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(temp_dir)

        fill_db_geo_data()
        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
        create_tables()
    except Exception as ex:
        print("Exception occurred while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
def main(path):
    started = datetime.now()
    # Import with a fixed batch size of 100 rows per bulk insert.
    total = import_to_datastore(path, 100)
    logging.info("Total: {0} traffic data rows in {1}".format(total, time_delta(started)))
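# Example invocation of the traffic-volume entry point. The directory path is
# hypothetical; the real CLI wiring lives elsewhere in the project:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main("static/data/traffic_volume")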