def get_all_experiments():
    """List every stored experiment with its task, corpus, description and id.

    The database and collection names are read from the environment, falling
    back to the values declared in ``DevelopmentConfig``. The elapsed time is
    written to the module logger.

    Returns:
        The value produced by ``response_with``: ``resp.SUCCESS_200`` when at
        least one document was found, ``resp.SERVER_ERROR_404`` otherwise. The
        payload is always ``{"result": documents}``.
    """
    # Start of the timed section.
    initial_time = time.time()

    # Resolve the experiments collection.
    client: MongoClient = get_data_client()
    database = client[os.environ.get('TASKS_DATABASE_NAME',
                                     DevelopmentConfig.TASKS_DATABASE_NAME)]
    experiments = database[os.environ.get(
        'EXPERIMENTS_COLLECTION_NAME',
        DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]

    # No filter: every document is returned, reshaped by the projection.
    fields = {
        '_id': 0,
        'task': '$task_action.kwargs.dataset.task',
        'corpus': '$task_action.kwargs.dataset.corpus',
        'description': '$task_name',
        'task_id': 1
    }
    result = list(experiments.find({}, fields))

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    # An empty listing is reported as "not found".
    status = resp.SUCCESS_200 if result else resp.SERVER_ERROR_404
    return response_with(status, value={"result": result})
def get_experiment_metrics(experiment_id):
    """Fetch the evaluation metrics stored for one experiment.

    Only documents whose ``task_id`` equals ``str(experiment_id)`` and that
    contain ``task_action.evaluation.metrics`` are considered. The database
    and collection names are read from the environment, falling back to the
    values declared in ``DevelopmentConfig``.

    Args:
        experiment_id: Identifier of the experiment; converted with ``str``
            before being used in the query.

    Returns:
        The value produced by ``response_with``: ``resp.SUCCESS_200`` when at
        least one matching document exists, ``resp.SERVER_ERROR_404``
        otherwise. The payload is always ``{"result": documents}``, each
        document exposing a single ``metrics`` field.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # MongoDB data.
    mongodb_client: MongoClient = get_data_client()
    db = mongodb_client[os.environ.get('TASKS_DATABASE_NAME',
                                       DevelopmentConfig.TASKS_DATABASE_NAME)]
    col = db[os.environ.get('EXPERIMENTS_COLLECTION_NAME',
                            DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]

    query = {
        'task_id': str(experiment_id),
        'task_action.evaluation.metrics': {'$exists': True}
    }
    projection = {'_id': 0, 'metrics': '$task_action.evaluation.metrics'}
    result = list(col.find(query, projection))

    # Report the match count through the logger instead of a stray print() to
    # stdout, so it follows the same handlers/levels as the rest of the module.
    log.info(f"Number of metrics documents found = {len(result)}")

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    if result:
        return response_with(resp.SUCCESS_200, value={"result": result})
    return response_with(resp.SERVER_ERROR_404, value={"result": result})
def get_distinct_experiments_by_task():
    """List the distinct task identifiers found across stored experiments.

    Each distinct value of ``task_action.kwargs.dataset.task`` is paired with
    the text returned by ``get_task_type_text(task, 'es')``. The database and
    collection names are read from the environment, falling back to the
    values declared in ``DevelopmentConfig``.

    Returns:
        The value produced by ``response_with``: ``resp.SUCCESS_200`` when at
        least one task was found, ``resp.SERVER_ERROR_404`` otherwise. The
        payload is ``{"result": entries}`` where every entry has the keys
        ``task_name`` and ``task``.
    """
    # Start of the timed section.
    initial_time = time.time()

    # Resolve the experiments collection.
    client: MongoClient = get_data_client()
    database = client[os.environ.get('TASKS_DATABASE_NAME',
                                     DevelopmentConfig.TASKS_DATABASE_NAME)]
    experiments = database[os.environ.get(
        'EXPERIMENTS_COLLECTION_NAME',
        DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]

    # One entry per distinct task, in the order the driver returns them.
    distinct_tasks = experiments.distinct('task_action.kwargs.dataset.task')
    result = [
        {'task_name': get_task_type_text(task, 'es'), 'task': task}
        for task in distinct_tasks
    ]

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    # An empty listing is reported as "not found".
    status = resp.SUCCESS_200 if result else resp.SERVER_ERROR_404
    return response_with(status, value={"result": result})
def get_features_values(feature, corpus):
    """Return the sorted values of one feature for a corpus.

    Values are read from the collection named ``"<feature>_codes"`` inside the
    database named ``corpus``; every document is expected to carry a field
    named after ``feature``.

    Args:
        feature: Name of the feature; used both as the collection-name prefix
            and as the field extracted from each document.
        corpus: Name of the database to read from.

    Returns:
        The value produced by ``response_with``: ``resp.SUCCESS_200`` when at
        least one value was found, ``resp.SERVER_ERROR_404`` otherwise. The
        payload is ``{"result": values}`` with the values in ascending order.
    """
    # Start of the timed section.
    initial_time = time.time()

    # Resolve the per-feature codes collection of the requested corpus.
    client: MongoClient = get_data_client()
    codes_collection = client[corpus][f"{feature}_codes"]

    # Fetch only the feature field, then pull it out of every document.
    documents = list(codes_collection.find({}, {'_id': 0, feature: 1}))
    result = sorted(document[feature] for document in documents)

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    # An empty listing is reported as "not found".
    status = resp.SUCCESS_200 if result else resp.SERVER_ERROR_404
    return response_with(status, value={"result": result})
def predict(experiment_id):
    """Run the trained model of an experiment on the issue sent in the request.

    The experiment is looked up by ``task_id``; its stored ``kwargs`` give the
    task and corpus, and its ``model_meta_file`` points at the saved
    hyperparameters. The issue is read from ``request.json``:

    * ``description`` (required): text of the issue.
    * ``num_issues_to_compare`` (optional, default 1000): limit passed to
      ``get_pairs_dataset`` for the ``duplicity``/``similarity`` tasks.
    * ``max_num_predictions`` (optional, default 5): maximum number of
      entries returned for the ``duplicity``/``similarity`` tasks.
    * any name returned by ``get_task_request_params_names`` for the task.

    Args:
        experiment_id: Identifier of the experiment; converted with ``str``
            before being used in the query.

    Returns:
        The value produced by ``response_with``:

        * ``resp.SUCCESS_200`` with ``{"result": [...]}``. For the
          ``duplicity``/``similarity`` tasks the list holds dicts with the
          keys ``bug_id`` and ``predidct_proba`` for the pairs predicted as
          class 1, ordered by descending probability. For any other task it
          holds one dict ``{'pred': label}``, where ``label`` is the key of
          the task's code table whose value equals the predicted class.
        * ``resp.SERVER_ERROR_404`` with an empty result when the experiment,
          its hyperparameters file, the word embeddings file or the model
          file cannot be found.
        * ``resp.INVALID_INPUT_422`` with an empty result when the request
          has no ``description`` or no trees can be built from it.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Load saved task.
    mongodb_client: MongoClient = get_data_client()
    db = mongodb_client[os.environ.get('TASKS_DATABASE_NAME',
                                       DevelopmentConfig.TASKS_DATABASE_NAME)]
    col = db[os.environ.get('EXPERIMENTS_COLLECTION_NAME',
                            DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)]
    query = {
        'task_id': str(experiment_id),
        'task_action.kwargs': {'$exists': True},
        'task_action.train': {'$exists': True}
    }
    projection = {
        '_id': 0,
        'kwargs': '$task_action.kwargs',
        'model_meta_file': '$task_action.train.model_meta_file'
    }
    data = col.find_one(query, projection)

    # find_one() yields None when nothing matches; answer 404 instead of
    # failing on the subscripts below.
    if data is None:
        log.error(f"No trained experiment found for task_id '{experiment_id}'.")
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    kwargs = data['kwargs']
    task = kwargs['dataset']['task']
    corpus = kwargs['dataset']['corpus']
    pairs_task = task in ['duplicity', 'similarity']

    # Without a meta file there are no hyperparameters, and every later step
    # (embeddings, model construction) needs them.
    model_meta_file = data.get('model_meta_file')
    if model_meta_file is None:
        log.error(f"Experiment '{experiment_id}' has no model meta file.")
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    # Change path to Docker container.
    if os.name == 'posix':
        model_meta_file = model_meta_file.replace(
            '/datadrive/host-mounted-volumes/syn/', '/usr/src/')
    log.info(f"Loading hyperparameters from: '{str(Path(model_meta_file))}'.")
    try:
        # NOTE(security): allow_pickle=True unpickles the file, which can run
        # arbitrary code. The path comes from the experiments collection, so
        # this is only safe while that collection and the files are trusted.
        saved_params = np.load(str(Path(model_meta_file)),
                               allow_pickle=True).item()
    except FileNotFoundError:
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    # The description is mandatory; reject the request when it is absent.
    payload = request.json
    if not isinstance(payload, dict) or 'description' not in payload:
        return response_with(resp.INVALID_INPUT_422, value={"result": []})

    # Get collapsed unary binary trees
    collapsed_unary_binary_trees = get_collapsed_unary_binary_trees(
        normalize_incidence(str(payload['description']), to_lower_case=True))
    if len(collapsed_unary_binary_trees) == 0:
        return response_with(resp.INVALID_INPUT_422, value={"result": []})

    # Get attention vectors
    attention_vectors = get_attention_vectors(collapsed_unary_binary_trees)

    # Data: a one-row frame holding the request values the task knows about.
    dataset = pd.DataFrame(columns=get_task_features_column_names(task))
    for column in get_task_request_params_names(task):
        if column in payload:
            dataset.at[0, column] = payload[column]
    dataset.at[0, 'trees'] = collapsed_unary_binary_trees
    dataset.at[0, 'attention_vectors'] = attention_vectors

    # if task is 'duplicity' or 'similarity' need do more actions.
    query_limit = payload.get('num_issues_to_compare', 1000)
    if pairs_task:
        dataset = get_pairs_dataset(dataset, task, corpus, query_limit).copy()

    # Encode structured info.
    dataset = encode_dataset_structured_data(dataset, corpus, task)

    # Format dataset
    inst = format_dataset(dataset, task, corpus)

    # Word embeddings.
    embeddings_dir = Path(
        os.environ.get('DATA_PATH', DevelopmentConfig.DATA_PATH)) / 'word_embeddings'
    if saved_params['embeddings_pretrained'] or 'glove' == saved_params['embeddings_model']:
        embeddings_filename = get_filtered_word_embeddings_filename(
            model=saved_params['embeddings_model'],
            size=saved_params['embeddings_size']
        )
    else:
        embeddings_filename = f"{saved_params['embeddings_model']}-{corpus}-" \
                              f"{saved_params['embeddings_size']}.txt"
    embeddings_path = Path(embeddings_dir) / saved_params['embeddings_model'] / embeddings_filename

    # Change path to Docker container.
    if os.name == 'posix':
        embeddings_path = str(embeddings_path).replace(
            '/datadrive/host-mounted-volumes/syn/data/', '/usr/src/')

    # Explicit check (an assert would vanish under ``python -O``); handled
    # like the other missing-file cases of this function.
    if not os.path.isfile(embeddings_path):
        log.error(f"No such filtered word embeddings file: '{embeddings_path}'.")
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    log.info(f"Reading embeddings from '{embeddings_path}' ...")
    word_embed, w2i = get_embeddings(embeddings_path,
                                     saved_params['embeddings_size'])

    # Load model
    model_builder = get_dynet_model(task)
    try:
        model = model_builder(n_classes=saved_params['n_classes'],
                              w2i=w2i,
                              word_embed=word_embed,
                              params=saved_params,
                              model_meta_file=model_meta_file)
    except FileNotFoundError:
        return response_with(resp.SERVER_ERROR_404, value={"result": []})

    # build graph for this instance
    dy.renew_cg()

    result = []
    if not pairs_task:
        # Check data integrity.
        check_data_integrity(inst[0], inst[1])

        # Issue description as Tuple(trees, attention_vectors).
        issue_description = (inst[0], inst[1])

        # Issue structured data.
        issue_structured_data = inst[2]

        pred, predict_proba, _ = model.predict(issue_description,
                                               issue_structured_data)

        # Map the predicted class back to the key that encodes it.
        label = get_label_column_name(task, corpus)
        codes = get_task_structured_data_codes(corpus, label)
        result.append(
            {'pred': list(codes.keys())[list(codes.values()).index(pred)]})
    else:
        # NOTE: 'predidct_proba' is misspelled but is part of the response
        # payload, so it is kept as-is for existing clients.
        df_result = pd.DataFrame(columns=['bug_id', 'predidct_proba'])

        for i, pair in enumerate(tqdm(inst, total=len(inst), desc='rows')):
            # pair = Tuple(trees_left, trees_right, attention_vectors_left,
            # attention_vectors_right, structured_data_left,
            # structured_data_right, label).

            # build graph for this instance
            dy.renew_cg()

            # Check data integrity.
            check_data_integrity(pair[0], pair[2])
            check_data_integrity(pair[1], pair[3])

            # Issue description as Tuple(trees, attention_vectors).
            issue_description_left = (pair[0], pair[2])
            issue_description_right = (pair[1], pair[3])

            # Issue structured data.
            issue_structured_data_left = pair[4]
            issue_structured_data_right = pair[5]

            pred, predict_proba, _, _ = model.predict(
                issue_description_left, issue_description_right,
                issue_structured_data_left, issue_structured_data_right)

            # Keep only the pairs predicted as class 1.
            if pred == 1:
                df_result.at[i, 'bug_id'] = pair[6]
                df_result.at[i, 'predidct_proba'] = predict_proba[pred]

        # Best candidates first, truncated to the requested amount.
        max_num_predictions = payload.get('max_num_predictions', 5)
        sorted_df_result = df_result.sort_values('predidct_proba',
                                                 ascending=False).copy()
        limited_sorted_df_result = sorted_df_result.head(
            max_num_predictions).copy()
        for _, row in limited_sorted_df_result.iterrows():
            result.append({
                'bug_id': int(row['bug_id']),
                'predidct_proba': float(row['predidct_proba'])
            })

    log.info(f"result = {result}")

    final_time = time.time()
    log.info(f"Total execution time = {final_time - initial_time} seconds")

    return response_with(resp.SUCCESS_200, value={"result": result})
log = set_logger() dy = get_dynet() # task_id = 'RxDOPRfH0qDB' # task_id = 'UjlhAvfoiita' # task_id = 'K3+KNvaqbyl5' # task_id = '52exjMrbQK_Z' task_id = 'gOi_rcoZknQM' # Stores the execution start time to calculate the time it takes for the module to execute. initial_time = time.time() # MongoDB data. mongodb_client: MongoClient = get_data_client() db = mongodb_client[os.environ.get('TASKS_DATABASE_NAME', DevelopmentConfig.TASKS_DATABASE_NAME)] col = db[os.environ.get('EXPERIMENTS_COLLECTION_NAME', DevelopmentConfig.EXPERIMENTS_COLLECTION_NAME)] query = { 'task_id': str(task_id), 'task_action.kwargs': { '$exists': True }, 'task_action.train': { '$exists': True } } projection = { '_id': 0,