コード例 #1
0
ファイル: experiments.py プロジェクト: mlazarodominguez/syn
def get_all_experiments():
    """Return a summary of every document in the experiments collection.

    The database and collection names are read from the
    ``TASKS_DATABASE_NAME`` / ``EXPERIMENTS_COLLECTION_NAME`` environment
    variables, falling back to the values in ``DevelopmentConfig``.

    Returns:
        ``response_with(resp.SUCCESS_200, ...)`` wrapping the list of
        projected documents, or ``response_with(resp.SERVER_ERROR_404, ...)``
        wrapping an empty list when the collection yields nothing.
    """
    # Start of the wall-clock measurement that is logged before returning.
    started_at = time.time()

    # Resolve the experiments collection.
    client: MongoClient = get_data_client()
    database = client[os.environ.get('TASKS_DATABASE_NAME',
                                     DevelopmentConfig.TASKS_DATABASE_NAME)]
    experiments = database[os.environ.get(
        'EXPERIMENTS_COLLECTION_NAME',
        DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]

    # Expose nested task fields under flat names; '$...' values are field
    # path expressions evaluated by the server.
    fields = {
        '_id': 0,
        'task': '$task_action.kwargs.dataset.task',
        'corpus': '$task_action.kwargs.dataset.corpus',
        'description': '$task_name',
        'task_id': 1
    }
    documents = list(experiments.find({}, fields))

    finished_at = time.time()
    log.info(f"Total execution time = {finished_at - started_at} seconds")

    # An empty collection is reported with the 404 response code.
    if not documents:
        return response_with(resp.SERVER_ERROR_404,
                             value={"result": documents})
    return response_with(resp.SUCCESS_200, value={"result": documents})
コード例 #2
0
ファイル: experiments.py プロジェクト: mlazarodominguez/syn
def get_experiment_metrics(experiment_id):
    """Return the evaluation metrics stored for one experiment.

    Args:
        experiment_id: Identifier of the experiment; it is converted to
            ``str`` and matched against the ``task_id`` field.

    Returns:
        ``response_with(resp.SUCCESS_200, ...)`` wrapping a list of
        ``{'metrics': ...}`` documents, or
        ``response_with(resp.SERVER_ERROR_404, ...)`` wrapping an empty list
        when no document with that ``task_id`` has evaluation metrics.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # MongoDB data.
    mongodb_client: MongoClient = get_data_client()
    db = mongodb_client[os.environ.get('TASKS_DATABASE_NAME',
                                       DevelopmentConfig.TASKS_DATABASE_NAME)]
    col = db[os.environ.get('EXPERIMENTS_COLLECTION_NAME',
                            DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]

    # Only documents that actually contain evaluation metrics are matched.
    query = {
        'task_id': str(experiment_id),
        'task_action.evaluation.metrics': {
            '$exists': True
        }
    }
    projection = {'_id': 0, 'metrics': '$task_action.evaluation.metrics'}
    result = list(col.find(query, projection))

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    if result:
        return response_with(resp.SUCCESS_200, value={"result": result})
    return response_with(resp.SERVER_ERROR_404, value={"result": result})
コード例 #3
0
ファイル: experiments.py プロジェクト: mlazarodominguez/syn
def get_distinct_experiments_by_task():
    """Return the distinct task types found in the experiments collection.

    Each distinct value of ``task_action.kwargs.dataset.task`` is paired with
    the text returned by ``get_task_type_text(task, 'es')``.

    Returns:
        ``response_with(resp.SUCCESS_200, ...)`` wrapping a list of
        ``{'task_name': ..., 'task': ...}`` entries, or
        ``response_with(resp.SERVER_ERROR_404, ...)`` wrapping an empty list
        when no task value exists.
    """
    # Start of the wall-clock measurement that is logged before returning.
    started_at = time.time()

    # Resolve the experiments collection.
    client: MongoClient = get_data_client()
    database = client[os.environ.get('TASKS_DATABASE_NAME',
                                     DevelopmentConfig.TASKS_DATABASE_NAME)]
    experiments = database[os.environ.get(
        'EXPERIMENTS_COLLECTION_NAME',
        DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]

    task_types = experiments.distinct('task_action.kwargs.dataset.task')
    entries = [
        {
            'task_name': get_task_type_text(task_type, 'es'),
            'task': task_type
        }
        for task_type in task_types
    ]

    finished_at = time.time()
    log.info(f"Total execution time = {finished_at - started_at} seconds")

    # No distinct task values is reported with the 404 response code.
    if not entries:
        return response_with(resp.SERVER_ERROR_404, value={"result": entries})
    return response_with(resp.SUCCESS_200, value={"result": entries})
コード例 #4
0
ファイル: features.py プロジェクト: mlazarodominguez/syn
def get_features_values(feature, corpus):
    """Return the sorted values of one feature for a corpus.

    Args:
        feature: Feature name. It selects the ``"<feature>_codes"``
            collection and is also the field read from each document.
        corpus: Name of the database to read from.

    Returns:
        ``response_with(resp.SUCCESS_200, ...)`` wrapping the sorted list of
        values, or ``response_with(resp.SERVER_ERROR_404, ...)`` wrapping an
        empty list when the collection yields no documents.
    """
    # Start of the wall-clock measurement that is logged before returning.
    started_at = time.time()

    # The corpus is the database; every feature has its own codes collection.
    client: MongoClient = get_data_client()
    codes = client[corpus][f"{feature}_codes"]

    cursor = codes.find({}, {'_id': 0, feature: 1})
    values = sorted(document[feature] for document in cursor)

    finished_at = time.time()
    log.info(f"Total execution time = {finished_at - started_at} seconds")

    # No values is reported with the 404 response code.
    if not values:
        return response_with(resp.SERVER_ERROR_404, value={"result": values})
    return response_with(resp.SUCCESS_200, value={"result": values})
コード例 #5
0
ファイル: experiments.py プロジェクト: mlazarodominguez/syn
def predict(experiment_id):
    """Run the trained model of an experiment on the issue in the request body.

    The experiment document supplies the training ``kwargs`` and the path of
    the saved model metadata. The request JSON supplies the issue:
    ``description`` (required), any of the task's request parameters, and the
    optional ``num_issues_to_compare`` (default 1000) and
    ``max_num_predictions`` (default 5) used by the 'duplicity' and
    'similarity' tasks.

    Args:
        experiment_id: Identifier of the experiment; it is converted to
            ``str`` and matched against the ``task_id`` field.

    Returns:
        The value of ``response_with`` for one of:

        * ``resp.SERVER_ERROR_404`` with an empty result when no trained
          experiment matches, when it has no model metadata file, or when the
          metadata, embeddings or model file cannot be found.
        * ``resp.INVALID_INPUT_422`` with an empty result when no trees can
          be built from the description.
        * ``resp.SUCCESS_200`` with ``[{'pred': <code name>}]`` for
          single-issue tasks, or with up to ``max_num_predictions`` entries
          ``{'bug_id': int, 'predidct_proba': float}`` sorted by descending
          probability for 'duplicity' / 'similarity'.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Load saved task.
    mongodb_client: MongoClient = get_data_client()
    db = mongodb_client[os.environ.get('TASKS_DATABASE_NAME',
                                       DevelopmentConfig.TASKS_DATABASE_NAME)]
    col = db[os.environ.get('EXPERIMENTS_COLLECTION_NAME',
                            DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]
    query = {
        'task_id': str(experiment_id),
        'task_action.kwargs': {
            '$exists': True
        },
        'task_action.train': {
            '$exists': True
        }
    }
    projection = {
        '_id': 0,
        'kwargs': '$task_action.kwargs',
        'model_meta_file': '$task_action.train.model_meta_file'
    }
    data = col.find_one(query, projection)
    if data is None:
        # find_one returns None when nothing matches: unknown or untrained
        # experiment.
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    kwargs = data['kwargs']
    # The projected field is absent when the training record stores no
    # metadata path, so read it defensively.
    model_meta_file = data.get('model_meta_file')
    if model_meta_file is None:
        # Every later step needs the saved hyperparameters; without them
        # there is no model to load.
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    # Change path to Docker container.
    if os.name == 'posix':
        model_meta_file = model_meta_file.replace(
            '/datadrive/host-mounted-volumes/syn/', '/usr/src/')
    log.info(
        f"Loading hyperparameters from: '{str(Path(model_meta_file))}'.")
    try:
        # NOTE(review): allow_pickle=True unpickles the file contents; this
        # is only safe while the metadata files come from a trusted source.
        saved_params = np.load(str(Path(model_meta_file)),
                               allow_pickle=True).item()
    except FileNotFoundError:
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    payload = request.json

    # Get collapsed unary binary trees
    collapsed_unary_binary_trees = get_collapsed_unary_binary_trees(
        normalize_incidence(str(payload['description']),
                            to_lower_case=True))

    if len(collapsed_unary_binary_trees) == 0:
        return response_with(resp.INVALID_INPUT_422, value={"result": []})

    # Get attention vectors
    attention_vectors = get_attention_vectors(collapsed_unary_binary_trees)

    # Data
    task = kwargs['dataset']['task']
    corpus = kwargs['dataset']['corpus']
    is_pair_task = task in ['duplicity', 'similarity']

    data_columns = get_task_features_column_names(task)
    dataset = pd.DataFrame(columns=data_columns)

    # Copy only the request parameters that this task declares.
    for column in get_task_request_params_names(task):
        if column in payload:
            dataset.at[0, column] = payload[column]

    dataset.at[0, 'trees'] = collapsed_unary_binary_trees
    dataset.at[0, 'attention_vectors'] = attention_vectors

    # if task is 'duplicity' or 'similarity' need do more actions.
    if is_pair_task:
        query_limit = 1000
        if 'num_issues_to_compare' in payload:
            query_limit = payload['num_issues_to_compare']
        dataset = get_pairs_dataset(dataset, task, corpus, query_limit).copy()

    # Encode structured info.
    dataset = encode_dataset_structured_data(dataset, corpus, task)

    # Format dataset
    inst = format_dataset(dataset, task, corpus)

    # Word embeddings.
    embeddings_dir = Path(
        os.environ.get('DATA_PATH',
                       DevelopmentConfig.DATA_PATH)) / 'word_embeddings'
    if saved_params['embeddings_pretrained'] or 'glove' == saved_params[
            'embeddings_model']:
        embeddings_filename = \
            get_filtered_word_embeddings_filename(
                model=saved_params['embeddings_model'],
                size=saved_params['embeddings_size']
            )
    else:
        embeddings_filename = f"{saved_params['embeddings_model']}-{corpus}-" \
                              f"{saved_params['embeddings_size']}.txt"

    embeddings_path = Path(embeddings_dir) / saved_params[
        'embeddings_model'] / embeddings_filename
    # Change path to Docker container.
    if os.name == 'posix':
        embeddings_path = str(embeddings_path).replace(
            '/datadrive/host-mounted-volumes/syn/data/', '/usr/src/')
    if not os.path.isfile(embeddings_path):
        # Explicit check instead of an assert: asserts are stripped under -O,
        # and the other missing-file cases here also answer with 404.
        log.error(
            f"No such filtered word embeddings file: '{embeddings_path}'.")
        return response_with(resp.SERVER_ERROR_404, value={"result": []})
    log.info(f"Reading embeddings from '{embeddings_path}' ...")
    word_embed, w2i = get_embeddings(embeddings_path,
                                     saved_params['embeddings_size'])

    # Load model
    model_builder = get_dynet_model(task)

    try:
        model = model_builder(n_classes=saved_params['n_classes'],
                              w2i=w2i,
                              word_embed=word_embed,
                              params=saved_params,
                              model_meta_file=model_meta_file)
    except FileNotFoundError:
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    # build graph for this instance
    dy.renew_cg()
    result = []
    if not is_pair_task:
        # Check data integrity.
        check_data_integrity(inst[0], inst[1])

        # Issue description as Tuple(trees, attention_vectors).
        issue_description = (inst[0], inst[1])

        # Issue structured data.
        issue_structured_data = inst[2]

        pred, _, _ = model.predict(issue_description, issue_structured_data)

        label = get_label_column_name(task, corpus)
        codes = get_task_structured_data_codes(corpus, label)

        # Reverse lookup: report the key whose code equals the prediction.
        result.append(
            {'pred': list(codes.keys())[list(codes.values()).index(pred)]})
    else:
        # NOTE: 'predidct_proba' is misspelled, but it is also the key sent
        # back in the response, so it is kept as-is for existing clients.
        df_result = pd.DataFrame(columns=['bug_id', 'predidct_proba'])
        for i, pair in enumerate(tqdm(inst, total=len(inst), desc='rows')):
            # Tuple(inst[0], inst[1], inst[2], inst[3], inst[4], inst[5], inst[6]) =
            # Tuple(trees_left, trees_right, attention_vectors_left, attention_vectors_right, structured_data_left,
            # structured_data_right, label).

            # build graph for this instance
            dy.renew_cg()

            # Check data integrity.
            check_data_integrity(pair[0], pair[2])
            check_data_integrity(pair[1], pair[3])

            # Issue description as Tuple(trees, attention_vectors).
            issue_description_left = (pair[0], pair[2])
            issue_description_right = (pair[1], pair[3])

            # Issue structured data.
            issue_structured_data_left = pair[4]
            issue_structured_data_right = pair[5]

            pred, predict_proba, _, _ = model.predict(
                issue_description_left, issue_description_right,
                issue_structured_data_left, issue_structured_data_right)

            # Keep only positive pairs; pair[6] is stored as the bug id of
            # the compared issue.
            if pred == 1:
                df_result.at[i, 'bug_id'] = pair[6]
                df_result.at[i, 'predidct_proba'] = predict_proba[pred]

        max_num_predictions = 5
        if 'max_num_predictions' in payload:
            max_num_predictions = payload['max_num_predictions']
        # Most probable matches first, truncated to the requested amount.
        top_matches = df_result.sort_values(
            'predidct_proba', ascending=False).head(max_num_predictions)
        for _, row in top_matches.iterrows():
            result.append({
                'bug_id': int(row['bug_id']),
                'predidct_proba': float(row['predidct_proba'])
            })
        log.info(f"result = {result}")

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    return response_with(resp.SUCCESS_200, value={"result": result})
コード例 #6
0
ファイル: pruebas_debug.py プロジェクト: mlazarodominguez/syn
log = set_logger()

dy = get_dynet()

# task_id = 'RxDOPRfH0qDB'
# task_id = 'UjlhAvfoiita'
# task_id = 'K3+KNvaqbyl5'
# task_id = '52exjMrbQK_Z'
task_id = 'gOi_rcoZknQM'

# Stores the execution start time to calculate the time it takes for the module to execute.
initial_time = time.time()

# MongoDB data.
mongodb_client: MongoClient = get_data_client()
db = mongodb_client[os.environ.get('TASKS_DATABASE_NAME',
                                   DevelopmentConfig.TASKS_DATABASE_NAME)]
col = db[os.environ.get('EXPERIMENTS_COLLECTION_NAME',
                        DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]
query = {
    'task_id': str(task_id),
    'task_action.kwargs': {
        '$exists': True
    },
    'task_action.train': {
        '$exists': True
    }
}
projection = {
    '_id': 0,