Code Example #1
def import_to_datastore(path, batch_size):
    new_items = 0
    try:
        assert batch_size > 0
        dir_list = glob.glob("{0}/*".format(path))
        for directory in sorted(dir_list, reverse=False):
            started = datetime.now()
            delete_traffic_volume_of_directory(directory)
            traffic_volume_rows = get_traffic_volume_rows(directory)
            logging.info("inserting " + str(len(traffic_volume_rows)) +
                         " new traffic data rows")
            # insert in batches and commit each batch separately
            for traffic_volume_chunk in chunks(traffic_volume_rows,
                                               batch_size):
                db.session.bulk_insert_mappings(TrafficVolume,
                                                traffic_volume_chunk)
                db.session.commit()
            new_items += len(traffic_volume_rows)
            logging.info("\t{0} items in {1}".format(len(traffic_volume_rows),
                                                     time_delta(started)))
        db.session.commit()
        return new_items
    except Exception as exc:
        error = ("Traffic Volume import succeeded partially with " +
                 str(new_items) + " traffic data rows")
        raise Exception(error) from exc
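The batched inserts in this and the following examples rely on a `chunks` helper that is not shown in any of the snippets. Below is a minimal sketch of such a helper, assuming it does nothing more than slice a list into fixed-size batches; the name and signature come from the calls above, but the body is an assumption, not the project's actual implementation.

def chunks(items, size):
    # Yield successive batches of at most `size` elements from `items`.
    # Hypothetical helper: the real project version may differ.
    for start in range(0, len(items), size):
        yield items[start:start + size]

Committing after each yielded batch keeps individual transactions small, which is presumably why every importer threads batch_size down to this loop.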
Code Example #2
def import_to_datastore(directory, provider_code, year, batch_size):
    """
    goes through all the files in a given directory, parses and commits them
    """
    try:
        assert batch_size > 0

        files_from_cbs = get_files(directory)
        if len(files_from_cbs) == 0:
            return 0
        logging.info("Importing '{}'".format(directory))
        started = datetime.now()

        # import dictionary
        fill_dictionary_tables(files_from_cbs[DICTIONARY], provider_code, year)

        new_items = 0
        accidents_count = import_accidents(**files_from_cbs)
        new_items += accidents_count
        involved_count = import_involved(**files_from_cbs)
        new_items += involved_count
        vehicles_count = import_vehicles(**files_from_cbs)
        new_items += vehicles_count

        logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        return new_items
    except ValueError as e:
        failed_dirs[directory] = str(e)
        if "Not found" in str(e):
            return 0
        raise
Code Example #3
def parse(schools_description_filepath, schools_coordinates_filepath,
          batch_size):
    started = datetime.now()
    total = import_to_datastore(
        schools_description_filepath=schools_description_filepath,
        schools_coordinates_filepath=schools_coordinates_filepath,
        batch_size=batch_size,
    )
    db.session.execute(
        "UPDATE schools_with_description "
        "SET geom = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) "
        "WHERE geom IS NULL;"
    )
    logging.info("Total: {0} schools in {1}".format(total,
                                                    time_delta(started)))
Code Example #4
def import_to_datastore(filepath, batch_size):
    new_items = 0
    try:
        assert batch_size > 0
        started = datetime.now()
        schools = get_schools(filepath)
        all_existing_schools_ids = set(map(lambda x: x[0], db.session.query(School.id).all()))
        # skip schools that already exist in the database
        schools = [school for school in schools if school["id"] not in all_existing_schools_ids]
        logging.info("inserting " + str(len(schools)) + " new schools")
        for schools_chunk in chunks(schools, batch_size):
            db.session.bulk_insert_mappings(School, schools_chunk)
            db.session.commit()
        new_items += len(schools)
        logging.info("\t{0} items in {1}".format(new_items, time_delta(started)))
        return new_items
    except Exception as exc:
        error = "Schools import succeeded partially with " + str(new_items) + " schools"
        raise Exception(error) from exc
Code Example #5
def main(specific_folder, delete_all, path):
    import_ui = ImporterUI(path, specific_folder, delete_all)
    dir_name = import_ui.source_path()

    # wipe all data first
    if import_ui.is_delete_all():
        truncate_tables(db, (RegisteredVehicle, ))

    importer = DatastoreImporter()
    total = 0
    dir_files = glob.glob("{0}/*.csv".format(dir_name))
    started = datetime.now()
    for fname in dir_files:
        total += importer.import_file(fname)

    db.session.commit()
    db.engine.execute(
        "UPDATE {0} SET city_id = (SELECT id FROM {1} WHERE {0}.search_name = {1}.search_heb) WHERE city_id IS NULL"
        .format(RegisteredVehicle.__tablename__, City.__tablename__))
    logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
Code Example #6
def import_to_datastore(schools_description_filepath,
                        schools_coordinates_filepath, batch_size):
    new_items = 0
    try:
        assert batch_size > 0
        started = datetime.now()
        schools = get_schools_with_description(schools_description_filepath,
                                               schools_coordinates_filepath)
        truncate_schools_with_description()
        logging.info("inserting " + str(len(schools)) + " new schools")
        for schools_chunk in chunks(schools, batch_size):
            db.session.bulk_insert_mappings(SchoolWithDescription,
                                            schools_chunk)
            db.session.commit()
        new_items += len(schools)
        logging.info("\t{0} items in {1}".format(new_items,
                                                 time_delta(started)))
        return new_items
    except Exception as exc:
        error = "Schools import succeeded partially with " + str(new_items) + " schools"
        raise Exception(error) from exc
Code Example #7
def parse(filepath, batch_size):
    started = datetime.now()
    total = import_to_datastore(filepath, batch_size)
    logging.info("Total: {0} schools in {1}".format(total, time_delta(started)))
Code Example #8
File: importmail_cbs.py  Project: rsperer/anyway
def main(detach_dir, username=None, password=None, email_search_start_date=""):
    try:
        username = username or os.environ.get("MAILUSER")
        password = password or os.environ.get("MAILPASS")
        if not username:
            logging.error(
                "Username not set. Please set env var MAILUSER or use the --username argument"
            )
        if not password:
            logging.error(
                "Password not set. Please set env var MAILPASS or use the --password argument"
            )
        if not username or not password:
            exit()

        imapsession = imaplib.IMAP4_SSL("imap.gmail.com")
        try:
            imapsession.login(username, password)
        except imaplib.IMAP4.error:
            logging.error("Bad credentials, unable to sign in!")
            exit()

        try:
            imapsession.select(mail_dir)
            if email_search_start_date == "":
                typ, data = imapsession.search(None, "ALL")
            else:
                search_start_date = datetime.strptime(
                    email_search_start_date, "%d.%m.%Y").strftime("%d-%b-%Y")
                typ, data = imapsession.search(
                    None, '(SINCE "{0}")'.format(search_start_date))
        except imaplib.IMAP4.error:
            logging.error("Error searching given mailbox: %s" % mail_dir)
            exit()

        file_found = False
        if not os.path.exists(detach_dir):
            os.makedirs(detach_dir)
        total = 0

        # Iterating over all emails
        started = datetime.now()
        logging.info("Login successful! Importing files, please hold...")
        filepath = None
        for msgId in data[0].split():
            typ, message_parts = imapsession.fetch(msgId, "(RFC822)")
            if typ != "OK":
                logging.error("Error fetching mail.")
                raise Exception("Error fetching mail")

            email_body = message_parts[0][1]
            # the fetched message body is bytes, so parse it as bytes
            mail = email.message_from_bytes(email_body)
            try:
                mtime = datetime.strptime(mail["Date"][:-6],
                                          "%a, %d %b %Y %H:%M:%S")
            except ValueError:
                mtime = datetime.strptime(mail["Date"][:-12],
                                          "%a, %d %b %Y %H:%M:%S")

            for part in mail.walk():
                if (part.get_content_maintype() == "multipart"
                        or part.get("Content-Disposition") is None):
                    continue
                filename = part.get_filename()

                if bool(filename) and filename.endswith(".zip"):
                    filename = "{0}-{1}_{2}-{3}.zip".format(
                        "cbs_data", mtime.date(), mtime.hour, mtime.minute)
                    filepath = os.path.join(detach_dir, filename)
                    if os.path.isfile(filepath):
                        break
                    total += 1
                    print("Currently loading: " + filename + "       ")
                    sys.stdout.write("\033[F")
                    time.sleep(0.1)
                    with open(filepath, "wb") as fp:
                        fp.write(part.get_payload(decode=True))
                    file_found = True

            if file_found:
                break

        logging.info("Imported {0} file(s) in {1}".format(
            total, time_delta(started)))
        imapsession.close()
        imapsession.logout()
        return filepath
    except Exception:
        # TODO: send an error email to the anyway email address
        logging.exception("Importing CBS files from mail failed")
Code Example #9
def main(
    batch_size,
    source,
    load_start_year=None,
):
    try:
        total = 0
        started = datetime.now()
        if source == "s3":
            if load_start_year is None:
                now = datetime.now()
                load_start_year = now.year - 1
            logging.info("Importing data from s3...")
            s3_data_retriever = S3DataRetriever()
            s3_data_retriever.get_files_from_s3(start_year=load_start_year)
            delete_cbs_entries(load_start_year, batch_size)
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                # TODO: make sure that code does not break if end year does not exist in s3
                for year in range(int(load_start_year), s3_data_retriever.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_data_retriever.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_data_retriever.local_temp_directory)

        elif source == "local_dir_for_tests_only":
            path = "static/data/cbs"
            import_ui = ImporterUI(path)
            dir_name = import_ui.source_path()
            dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all the AccidentMarker and Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                parent_directory = os.path.basename(
                    os.path.dirname(os.path.join(os.pardir, directory))
                )
                provider_code = get_provider_code(parent_directory)
                logging.info("Importing Directory " + directory)
                total += import_to_datastore(directory, provider_code, int(year), batch_size)

        fill_db_geo_data()

        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
        create_tables()
    except Exception as ex:
        print("Exception occured while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
Code Example #10
File: executor.py  Project: benvinert/anyway
def main(
    specific_folder,
    delete_all,
    path,
    batch_size,
    delete_start_date,
    load_start_year,
    from_email,
    username="",
    password="",
    email_search_start_date="",
    from_s3=False,
):
    try:
        if not from_email and not from_s3:
            import_ui = ImporterUI(path, specific_folder, delete_all)
            dir_name = import_ui.source_path()

            if specific_folder:
                dir_list = [dir_name]
            else:
                dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all the AccidentMarker and Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            elif delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                if int(year) >= int(load_start_year):
                    parent_directory = os.path.basename(
                        os.path.dirname(os.path.join(os.pardir, directory))
                    )
                    provider_code = get_provider_code(parent_directory)
                    logging.info("Importing Directory " + directory)
                    total += import_to_datastore(directory, provider_code, int(year), batch_size)
                else:
                    logging.info(
                        "Importing only starting year {0}. Directory {1} has year {2}".format(
                            load_start_year, directory_name, year
                        )
                    )
        elif from_s3:
            logging.info("Importing data from s3...")
            s3_handler = S3Handler()
            s3_handler.get_files_from_s3(start_year=load_start_year)
            """
            Should be soon implemented as "delete_entries_from_S3"
            """
            # delete_cbs_entries_from_email(provider_code, year, batch_size)
            if delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                for year in range(int(load_start_year), s3_handler.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_handler.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                        cbs_files_dir
                    )
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_handler.local_temp_directory)
        else:
            logging.info("Importing data from mail...")
            temp_dir = tempfile.mkdtemp()
            zip_path = importmail_cbs.main(temp_dir, username, password, email_search_start_date)
            if zip_path is None:
                logging.info("No new cbs files found")
                return
            zip_ref = zipfile.ZipFile(zip_path, "r")
            cbs_files_dir = os.path.join(temp_dir, "cbsfiles")
            if not os.path.exists(cbs_files_dir):
                os.makedirs(cbs_files_dir)
            zip_ref.extractall(cbs_files_dir)
            zip_ref.close()
            preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
            acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(cbs_files_dir)
            provider_code, year = get_file_type_and_year(acc_data_file_path)
            delete_cbs_entries_from_email(provider_code, year, batch_size)
            started = datetime.now()
            total = 0
            logging.info("Importing Directory " + cbs_files_dir)
            total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(temp_dir)

        fill_db_geo_data()

        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))

        create_tables()
    except Exception as ex:
        print("Exception occured while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
Code Example #11
def main(path):
    started = datetime.now()
    total = import_to_datastore(path, 100)
    logging.info("Total: {0} traffic data rows in {1}".format(
        total, time_delta(started)))