Example 1
def build_srl(configs):
    client = MongoClient(config.MONGO_IP, config.MONGO_PORT)
    db = client[config.DB]
    wikipedia = db[config.WIKIMERGE_COLLECTION]
    documents_id = list(wikipedia.find({}, {"id": 1, "_id": 0}).sort("id"))
    client.close()
    start_time = time.time()
    total = 0
    total_extracted = 0
    total_skipped = 0
    chunks = get_chunks(documents_id, config.CHUNK_SIZE, 'id')
    if config.NUM_WORKERS == 1:
        for chunk in chunks:
            # single-process path: pass the same configs used by the worker pool below
            build(chunk, configs)
    else:
        pool = multiprocessing.Pool(config.NUM_WORKERS)

        for res in pool.imap(partial(build, configs=configs), chunks):
            total += res['processed']
            if 'extracted' in res:
                total_extracted += res['extracted']
                total_skipped += res['skipped']
                res['total_extracted'] = total_extracted
                res['total_skipped'] = total_skipped
                res['total'] = total
                elapsed = int(time.time() - start_time)
                res['total_elapsed'] = compress(elapsed)
                res['elapsed'] = compress(res['elapsed'])
                logging.info(', '.join("{!s}={!r}".format(key, val) for key, val in res.items()))

        pool.terminate()
    elapsed = int(time.time() - start_time)
    logging.info("Processed {} documents in {} - Total extracted {}".format(total, compress(elapsed), total_extracted))
    return
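
The `compress` used for the elapsed-time strings above is presumably the `natural.date.compress` wrapper shown in Examples 5 and 7. A minimal, self-contained sketch of the same logging pattern, assuming the `natural` package is installed:

# Minimal sketch (assumption: compress is natural.date.compress); the exact
# rendering of the elapsed time depends on the installed version of `natural`.
import logging
import time
from natural.date import compress

logging.basicConfig(level=logging.INFO)
start_time = time.time()
time.sleep(2)  # stand-in for processing a chunk of documents
elapsed = int(time.time() - start_time)
logging.info("Processed %d documents in %s", 0, compress(elapsed))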
Example 2
def run_qa():
    client = MongoClient(config.MONGO_IP, config.MONGO_PORT)
    db = client[config.DB]
    wikipedia = db[config.WIKIPEDIA_COLLECTION]
    wikidocs = list(
        wikipedia.find({}, {
            'wikidata_id': 1,
            '_id': 0
        }).sort('wikidata_id'))
    chunks = get_chunks(wikidocs, config.CHUNK_SIZE, 'wikidata_id')
    del wikidocs
    start_time = time.time()
    total = 0

    pool = multiprocessing.Pool(config.NUM_WORKERS)
    for res in pool.imap(qa, chunks):
        total += res['processed']
        res['total'] = total
        part = int(time.time() - start_time)
        res['elapsed'] = compress(res['elapsed'])
        res['total_elapsed'] = compress(part)
        logging.info(
            "Processed {processed} ({total} in total) documents in {elapsed} "
            "(running time {total_elapsed})".format(**res))

    pool.terminate()

    elapsed = int(time.time() - start_time)
    logging.info("Processed {} documents in {}".format(total,
                                                       compress(elapsed)))
    return
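
Both pipelines above rely on a `get_chunks` helper that is not shown here. One plausible implementation, offered purely as an assumption about its behaviour (split the sorted key list into fixed-size batches):

# Hypothetical get_chunks(); not the project's actual helper, just one way the
# call sites above could be satisfied. `docs` is a list of dicts, `key` names the id field.
def get_chunks(docs, chunk_size, key):
    values = [doc[key] for doc in docs]
    for i in range(0, len(values), chunk_size):
        yield values[i:i + chunk_size]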
Example 3
def aws(sample=False):
    if sample:
        return Section('no AWS environment', 'WHITE', '#333')

    section = Section('no AWS environment', 'WHITE', '#333')
    if 'AWS_PROFILE' not in _env:
        return section

    section = Section('AWS: %s ' % _env['AWS_PROFILE'], 'GREEN', '#333')

    if _env.get('AWS_SESSIONS') and _env['AWS_PROFILE'] in _env['AWS_SESSIONS']:
        try:
            # seconds left on the cached session for the active profile
            remaining = _env['AWS_SESSIONS'][_env['AWS_PROFILE']][_EXPIRATION] - int(time.time())
            if remaining > 300:
                section = Section('AWS: %s(%s) ' % (_env['AWS_PROFILE'], compress(remaining)),
                                  'GREEN', '#333')
            else:
                # less than five minutes left: switch to a red background
                section = Section('AWS: %s(%s) ' % (_env['AWS_PROFILE'], compress(remaining)),
                                  'GREEN', 'RED')
        except Exception:
            pass
    return section
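
The function reads a module-level `_env` mapping and an `_EXPIRATION` key that are not shown. A hypothetical layout that would satisfy the lookups above:

# Assumed module-level state for aws(); names and key layout are guesses based on
# how the function indexes into them, not the real module's definitions.
import time

_EXPIRATION = 'expiration'
_env = {
    'AWS_PROFILE': 'dev',
    'AWS_SESSIONS': {
        'dev': {_EXPIRATION: int(time.time()) + 3600},  # session valid for another hour
    },
}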
Example 4
def validate_data(dataset_path: Path):
    import json
    from datetime import timedelta
    from natural.date import compress

    for mf_type in ["train_manifest.json", "test_manifest.json"]:
        data_file = dataset_path / Path(mf_type)
        print(f"validating {data_file}.")
        with Path(data_file).open("r") as pf:
            data_jsonl = pf.readlines()
        duration = 0
        for (i, s) in enumerate(data_jsonl):
            try:
                d = json.loads(s)
                duration += d["duration"]
                audio_file = data_file.parent / Path(d["audio_filepath"])
                if not audio_file.exists():
                    raise OSError(f"File {audio_file} not found")
            except BaseException as e:
                print(f'failed on {i} with "{e}"')
        duration_str = compress(timedelta(seconds=duration), pad=" ")
        print(
            f"no errors found. seems like a valid {mf_type}. contains {duration_str.strip()} of audio"
        )
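
The validator expects JSON-Lines manifests: one JSON object per line, carrying at least the two fields read in the loop above. An illustrative entry (field names taken from the code; the extra "text" field is an assumption and is ignored by the validator):

# Hedged sketch of a single manifest line for train_manifest.json / test_manifest.json.
import json

entry = {"audio_filepath": "wavs/utt_0001.wav", "duration": 3.27, "text": "hello world"}
print(json.dumps(entry))  # written as one line of the JSONL manifest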
Example 5
def compress(value, sign=False, pad=''):
    '''Wrapper for :func:`natural.date.compress`'''
    return date.compress(value, sign, pad)
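
A short usage sketch of this wrapper; the exact strings returned depend on the installed version of the `natural` package:

# Hedged usage example: the wrapper above forwards to natural.date.compress, which
# accepts a timedelta or a number of seconds and returns a short duration string.
from datetime import timedelta
from natural.date import compress

print(compress(90))                                      # plain seconds
print(compress(timedelta(days=1, hours=2)))              # timedelta input
print(compress(timedelta(hours=2), sign=True, pad=' '))  # optional sign and padding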
Example 6
def kfold_validation(train_config: TrainingConfiguration, input_data,
                     input_labels):
    kf = KFold(n_splits=5, shuffle=True)
    fold_scores = []
    current_fold = 1
    start_time = time.time()

    # there is a bug on RTX GPUs: allow_growth must be set to True to run the CNN
    reset_keras()
    for train_index, test_index in kf.split(input_data):
        x_train, x_test = input_data[train_index], input_data[test_index]
        y_train, y_test = input_labels.values[
            train_index], input_labels.values[test_index]

        x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                          y_train,
                                                          test_size=0.25)

        if train_config.additional_train_data is not None:
            print("Additional data:",
                  train_config.additional_train_data[0].shape)
            x_train = np.concatenate(
                [x_train, train_config.additional_train_data[0]])
            y_train = np.concatenate(
                [y_train, train_config.additional_train_data[1]])

        train_generator = train_config.generator(x_train,
                                                 y_train,
                                                 batch_size=32)
        x, y = train_generator[0]
        input_shape = get_input_shape(x[0])

        num_classes = y[0].shape[0]
        print("Training shape: ", input_shape)
        print("Prediction classes", num_classes)
        print("Trainig data size:", x_train.shape, "Validation data:",
              x_val.shape, "Test data:", x_test.shape)

        x_val = train_config.load_files(x_val)

        # create 2d conv model
        model = train_config.create_model_func(input_shape, num_classes)
        opt = keras.optimizers.Adam(lr=train_config.learning_rate)
        lr_metric = get_lr_metric(opt)
        model.compile(loss=bce_with_logits,
                      optimizer=opt,
                      metrics=[tf_lwlrap, lr_metric])

        log_folder_name = create_log_dir(train_config)

        callbacks = [
            keras.callbacks.TensorBoard(log_dir=log_folder_name,
                                        histogram_freq=0,
                                        batch_size=32,
                                        write_graph=True,
                                        write_grads=False,
                                        write_images=False,
                                        embeddings_freq=0,
                                        embeddings_layer_names=None,
                                        embeddings_metadata=None,
                                        embeddings_data=None,
                                        update_freq='epoch'),
            EarlyStoppingByLWLRAP(validation_data=(x_val, y_val),
                                  patience=20,
                                  restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_tf_lwlrap',
                patience=train_config.reduce_lr_patience,
                min_lr=train_config.min_lr,
                factor=train_config.reduce_lr_factor,
                mode='max'),
            keras.callbacks.ModelCheckpoint('./models/{}_{}.h5'.format(
                train_config.model_name, current_fold),
                                            monitor='val_tf_lwlrap',
                                            verbose=1,
                                            save_best_only=True,
                                            mode='max')
        ]

        model.fit_generator(train_generator,
                            epochs=train_config.num_epoch,
                            callbacks=callbacks,
                            validation_data=(x_val, y_val),
                            verbose=2)

        x_test_data = train_config.load_files(x_test)
        y_pred = model.predict(x_test_data)
        lwlrap = calculate_overall_lwlrap_sklearn(y_test, y_pred)
        print("Fold {} Score: {}".format(current_fold, lwlrap))

        # calculate_and_dump_lwlrap_per_class(x_test, y_test, y_pred, "per_class_lwlrap_fold_{}.csv".format(current_fold))
        current_fold += 1
        fold_scores.append(lwlrap)
        # note: this break stops after the first fold; remove it to run all five folds
        break
    print(fold_scores)
    print("Average Fold Score:", np.mean(fold_scores))
    print("Time taken: {}".format(date.compress(time.time() - start_time)))
Example 7
def compress(value, sign=False, pad=u''):
    '''Wrapper for :func:`natural.date.compress`'''
    return date.compress(value, sign, pad)
Example 8
def run_benchmark(config):
    """ Benchmark script for tiledb-vcf"""

    # Open yaml config file
    with open(config, 'r') as stream:
        try:
            benchmarking_start = time.time()
            results = []
            config = yaml.load(stream, Loader=yaml.SafeLoader)
            base_cmd = config['base_command']
            iterations = config['iterations']
            ingestion_files = config['ingestion_files']
            attribute_results = {}
            suite_index = 0
            suite_names = []

            errors = {}

            # Get the size of the files being ingested
            ingestion_size = 0
            for ingestion_file in ingestion_files:
                ingestion_size += os.path.getsize(ingestion_file) / (1024 * 1024)

            # Loop through each test suite
            for suite_name, test_set in config['suites'].items():
                suite_names.append(suite_name)
                test_results = {}
                # Run each suite the given number of iterations
                iteration_count = 0
                for i in range(iterations):
                    iteration_count += 1

                    array_uri = test_set['array_uri']
                    group_uri = test_set.get('group_uri')
                    # remove the whole group directory when one is configured,
                    # otherwise just the array directory
                    dir_to_rm = group_uri if group_uri is not None else array_uri

                    if dir_to_rm is not None and os.path.isdir(dir_to_rm):
                        shutil.rmtree(dir_to_rm)

                    if group_uri and not os.path.isdir(group_uri):
                        pathlib.Path(group_uri).mkdir(parents=True,
                                                      exist_ok=True)

                    # Run each test in the suite
                    for test in test_set['tests']:

                        # Flush caches
                        flush_caches()

                        test_name = test["name"]
                        logger.info("Starting test %s - %s iteration %d",
                                    suite_name, test_name, i)

                        # Add specified arguments
                        cmd = [base_cmd] + test['args']
                        # Add group uri argument
                        cmd.extend(["-a", array_uri])

                        # If store or register add ingestion files
                        if test_name == "store" or test_name == "register":
                            cmd.append("-f")
                            cmd.extend(ingestion_files)

                        if test_name == "export":
                            export_path = os.path.join(group_uri, "export")
                            if not os.path.isdir(export_path):
                                os.mkdir(export_path)
                            #cmd.extend(["-p",  export_path + os.path.sep])

                        logger.info("Running: %s", list2cmdline(cmd))

                        # Time and run test command
                        t0 = time.time()
                        t1 = None
                        try:
                            ret = call(cmd)
                            t1 = time.time()
                        except Exception:
                            # `call` raised before returning, so there is no return code to record
                            errors.setdefault(suite_name, {}).setdefault(test_name, []).append({
                                "iteration": i,
                                "ret_code": None,
                            })
                            logging.error(traceback.format_exc())
                            continue

                        array_size = 0
                        tiledb_file_sizes = None
                        if test.get('check_array_size'):
                            array_size = get_folder_size(array_uri)
                            tiledb_file_sizes = get_tiledb_file_sizes(array_uri)

                        # Save results
                        if test_name not in test_results:
                            test_results[test_name] = {
                                "time": [],
                                "size": [],
                                "file_sizes": {}
                            }
                        test_results[test_name]["time"].append(t1 - t0)
                        test_results[test_name]["size"].append(array_size)
                        if tiledb_file_sizes is not None:
                            for file_name, size in tiledb_file_sizes.items():
                                file_sizes = test_results[test_name]["file_sizes"]
                                if file_name not in file_sizes:
                                    file_sizes[file_name] = []
                                file_sizes[file_name].append(size)

                # If there was a store test we should save results for printing table at the end
                if 'store' in test_results:
                    ingestion_times = test_results["store"]["time"]
                    ingestion_time_avg = numpy.average(ingestion_times)
                    size_avg = numpy.average(
                        test_results["store"]["size"]) / (1024 * 1024)
                    ingestion_time_std = numpy.std(ingestion_times)
                    export_time_avg = 'N/A'
                    export_time_std = 'N/A'

                    if 'export' in test_results:
                        export_times = test_results["export"]["time"]
                        export_time_avg = numpy.average(export_times)
                        export_time_std = numpy.std(export_times)

                    results.append([
                        suite_name, iteration_count, ingestion_time_avg,
                        ingestion_time_std, size_avg, ingestion_size,
                        export_time_avg, export_time_std
                    ])

                    for file_name, file_sizes in test_results['store']["file_sizes"].items():
                        if file_name not in attribute_results:
                            # one slot per suite so columns line up in the summary table
                            attribute_results[file_name] = [None] * len(config['suites'])

                        file_size_avg = numpy.average(file_sizes) / (1024 * 1024)
                        attribute_results[file_name][suite_index] = file_size_avg

                suite_index += 1

                # Remove directory to save space again
                dir_to_rm = group_uri if group_uri is not None else array_uri

                if dir_to_rm is not None and os.path.isdir(dir_to_rm):
                    shutil.rmtree(dir_to_rm)

            header = [
                'Test', 'Iterations', 'Ingestion Time (seconds)',
                'Ingestion Time (seconds) STDDEV', 'Array Size (MB)',
                'Ingestion Size (MB)', 'Export Time (seconds)',
                'Export Time STDDEV (seconds)'
            ]
            t = PrettyTable(header)
            for result in results:
                t.add_row(result)

            data = ",".join(header) + "\n"
            for result in results:
                data += ",".join(map(str, result)) + "\n"
            logger.info(data)

            print("")
            print(t)

            t = PrettyTable()

            t.add_column("Test", suite_names)

            for file_name, sizes in attribute_results.items():
                t.add_column(file_name, sizes)

            #for result in attribute_results:
            #    print(result)
            #    t.add_row(result)
            #for index in range(len(suite_names)):
            #    results = [] #[None] * len(attribute_results)
            #    for file_name, result in attribute_results.items():
            #        results.append(result[index])
            #    t.add_column(suite_names[index], results)

            # Set file_name column
            #file_name_results = [] #[None] * len(attribute_results)
            #for file_name, result in attribute_results.items():
            #    file_name_results.append(file_name)
            #t.add_column("file_name", file_name_results)

            print("")
            print(t)

            data = ",".join(t.field_names) + "\n"
            for row in t._get_rows(t._get_options({})):
                data += ",".join(map(str, row)) + "\n"
            logger.info(data)

            logger.info("Total time taken to run benchmark was: %s",
                        date.compress(time.time() - benchmarking_start))

            if errors:
                logger.error("Errors detected in run, dumping details:")
                logger.error(errors)

        except yaml.YAMLError as exc:
            print(exc)
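
The YAML file consumed by `run_benchmark` is not shown; the keys read above imply a layout roughly like the following, sketched here as the equivalent Python dict (commands, paths, and file names are placeholders, not real defaults):

# Hedged sketch of the expected benchmark config; keys mirror what run_benchmark() reads.
import yaml

example_config = {
    "base_command": "tiledbvcf",
    "iterations": 3,
    "ingestion_files": ["samples/s1.vcf.gz", "samples/s2.vcf.gz"],
    "suites": {
        "small_suite": {
            "array_uri": "/tmp/bench/group/array",
            "group_uri": "/tmp/bench/group",
            "tests": [
                {"name": "store", "args": ["store"], "check_array_size": True},
                {"name": "export", "args": ["export"]},
            ],
        }
    },
}
print(yaml.safe_dump(example_config, sort_keys=False))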