Example #1
def build_similarity_dataset(
        dataset: pd.DataFrame = None,
        corpus: str = 'bugzilla',
        collection_name: str = 'similar_pairs'
) -> pd.DataFrame:
    # Load df_similar_pairs dataframe.
    df_similar_pairs = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name=collection_name
    )

    # Convert bug_id to numeric so it matches bug1/bug2 in the joins below.
    dataset['bug_id'] = pd.to_numeric(dataset['bug_id'])

    # Join on columns 'bug1' and 'bug_id'.
    df_pairs_bug1_dataset_bug_id = df_similar_pairs.merge(dataset, left_on='bug1', right_on='bug_id')

    # Join on columns 'bug2' and 'bug_id'.
    result = df_pairs_bug1_dataset_bug_id.merge(dataset, left_on='bug2', right_on='bug_id',
                                                suffixes=('_left', '_right'))

    result.drop(['bug1', 'bug2'], axis=1, inplace=True)

    # Rename column 'dec' to 'label'.
    result.rename(columns={"dec": "label"}, errors="raise", inplace=True)

    return result
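Every example on this page relies on a load_dataframe_from_mongodb helper that is not shown here. A minimal sketch of what such a helper could look like, assuming pymongo and pandas; the connection string, parameter defaults and implementation details below are assumptions for illustration, not the project's actual code.

from typing import Optional

import pandas as pd
from pymongo import MongoClient


def load_dataframe_from_mongodb(database_name: str,
                                collection_name: str,
                                query: Optional[dict] = None,
                                projection: Optional[dict] = None,
                                sort_by: Optional[str] = None,
                                query_limit: int = 0) -> pd.DataFrame:
    # Connect to MongoDB (connection string is an assumption).
    client = MongoClient('mongodb://localhost:27017/')
    collection = client[database_name][collection_name]

    # Filter and project, then optionally sort and limit the cursor.
    cursor = collection.find(query or {}, projection)
    if sort_by:
        cursor = cursor.sort(sort_by, 1)
    if query_limit > 0:
        cursor = cursor.limit(query_limit)

    # Materialize the cursor into a DataFrame with a default RangeIndex,
    # so positional access like df.at[i, column] works as used above.
    return pd.DataFrame(list(cursor))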
Example #2
def read_dataset_from_mongodb(
        database_name: str = 'bugzilla',
        collection_name: str = 'duplicity_task_train_dataset',
        query: dict = None,
        projection: dict = None,
        trees_columns: list = None,
        attention_vectors_columns: list = None,
        structured_data_columns: list = None,
        query_limit: int = 0) -> List[Tuple[Union[list, Any], ...]]:
    # Read MongoDB collection.
    df = load_dataframe_from_mongodb(database_name=database_name,
                                     collection_name=collection_name,
                                     query=query,
                                     projection=projection,
                                     query_limit=query_limit)

    log.info(f"Generating trees ...")
    trees_columns = trees_columns or ['trees']
    attention_vectors_columns = attention_vectors_columns or ['attention_vectors']
    structured_data_columns = structured_data_columns or ['structured_data']
    rows = []
    loop = tqdm(range(df.shape[0]), desc='rows')
    for i in loop:
        row_elements = []
        # Columns with trees.
        for column_name in trees_columns:
            row_elements.append(get_trees_from_mongodb(df.at[i, column_name]))

        # Columns with attention vectors.
        for column_name in attention_vectors_columns:
            row_elements.append(df.at[i, column_name])

        # Columns with structured data vectors.
        for column_name in structured_data_columns:
            row_elements.append(df.at[i, column_name])

        # Column with label.
        row_elements.append(df.at[i, 'label'])

        rows.append(tuple(row_elements))

    return rows
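A hedged usage sketch for read_dataset_from_mongodb: with the default column arguments, every returned tuple holds the parsed trees, the attention vectors, the structured data and the label, in that order. The query below is purely illustrative, not taken from the project.

rows = read_dataset_from_mongodb(
    database_name='bugzilla',
    collection_name='duplicity_task_train_dataset',
    query={'label': {'$in': [0, 1]}},  # hypothetical filter
    query_limit=1000
)

for trees, attention_vectors, structured_data, label in rows:
    # One element per entry in trees_columns, attention_vectors_columns and
    # structured_data_columns, followed by the label.
    ...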
Example #3
def get_similarity_dataset(
        dataset: pd.DataFrame = None,
        corpus: str = '',
        near_issues: bool = False,
        query_limit: int = 0
) -> pd.DataFrame:
    collection_name = 'near_pairs' if near_issues else 'similar_pairs'
    df_similar_pairs = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name=collection_name
    )

    # Sort by creation_ts
    df = df_similar_pairs.sort_values('creation_ts')

    if query_limit > 0:
        df = df.head(query_limit).copy()
    dataset_merged = df.merge(dataset, left_on='bug_id', right_on='bug_id',
                              suffixes=('_left', '_right'))
    return dataset_merged
Example #4
    def get_best_models(self, num_models=1):
        query = self._get_experiments_query()
        projection = self._get_best_models_projection()

        # All results are retrieved so they can be limited after sorting.
        df = load_dataframe_from_mongodb(database_name=self.save_dbname,
                                         collection_name=self.save_collection,
                                         query=query,
                                         projection=projection,
                                         query_limit=0)

        if df.empty:
            log.error(
                f"No metrics saved in '{self.save_dbname}.{self.save_collection}' for task '{self.task}' "
                f"and corpus '{self.corpus}'.")
            raise ValueError(
                'Ensure database name, collection name, task name and corpus are correct.'
            )

        # Explode MongoDB fields.
        df_model_meta_file = df['model_meta_file']
        df_metrics = pd.json_normalize(df['metrics'])
        exploded_df = pd.concat(
            [df['task_id'], df_model_meta_file, df_metrics], axis=1)

        # Check if metric name exists.
        df_metrics_columns = df_metrics.columns
        self._metric_name_exits(self.objective, list(df_metrics_columns))

        # Sort by the objective metric, best first.
        exploded_df = exploded_df.sort_values(by=self.objective,
                                              ascending=False)

        # Keep only the requested number of models.
        if num_models > 0:
            exploded_df = exploded_df.head(num_models)

        # Return id and model_meta_file.
        exploded_df.drop(list(df_metrics_columns), axis=1, inplace=True)
        return exploded_df.to_dict('records')
Example #5
def get_pairs_dataset(
        dataset: pd.DataFrame = None,
        task: str = '',
        corpus: str = '',
        query_limit: int = 0
) -> pd.DataFrame:
    projection = get_task_dataset_projection(task)
    # Query only non-rejected documents.
    query = {'rejected': False}
    df_task_dataset = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name='normalized_clear',
        query=query,
        projection=projection,
        sort_by='creation_ts',
        query_limit=query_limit
    )

    df_task_dataset['bug_id'] = pd.to_numeric(df_task_dataset['bug_id'])
    dataset_merged = dataset.merge(df_task_dataset, how='cross', suffixes=('_left', '_right'))

    return dataset_merged
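For reference, a minimal illustration (toy frames, not project data) of the how='cross' merge used above: every row of the left frame is paired with every row of the right frame, and overlapping columns receive the given suffixes.

import pandas as pd

left = pd.DataFrame({'bug_id': [1, 2]})
right = pd.DataFrame({'bug_id': [10, 20, 30]})
# 2 x 3 = 6 rows, with columns bug_id_left and bug_id_right.
print(left.merge(right, how='cross', suffixes=('_left', '_right')))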
Example #6
def build_duplicity_dataset(dataset: pd.DataFrame = None, corpus: str = 'bugzilla') -> pd.DataFrame:
    # Load pairs dataframe.
    df_pairs = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name='pairs'
    )

    # Check duplicated pairs.
    log.info(f"Looking for duplicates in pair 'bug1' - 'bug2' ...")
    df_pairs['bug1-bug2'] = df_pairs.apply(lambda x: f"{x['bug1']}-{x['bug2']}", axis=1)
    df_pairs['bug2-bug1'] = df_pairs.apply(lambda x: f"{x['bug2']}-{x['bug1']}", axis=1)
    log.info(f"Rows before dropping duplicates: {df_pairs.shape[0]}")
    df_pairs.drop_duplicates(subset='bug1-bug2', keep=False, inplace=True)
    log.info(f"Rows after dropping duplicates: {df_pairs.shape[0]}")

    log.info(f"Looking for pairs where 'bug1'-'bug2' equals 'bug2'-'bug1' ...")
    df_pairs['duplicated'] = df_pairs.apply(lambda x: x['bug1-bug2'] == x['bug2-bug1'], axis=1)
    log.info(f"Rows with duplicated pairs: {df_pairs[df_pairs['duplicated']].shape[0]}")

    df_pairs_final = df_pairs[~df_pairs['duplicated']].copy()
    log.info(f"Rows after dropping all types of duplicates: {df_pairs_final.shape[0]}")

    # Convert bug_id to numeric so it matches bug1/bug2 in the joins below.
    dataset['bug_id'] = pd.to_numeric(dataset['bug_id'])

    # Join on columns 'bug1' and 'bug_id'.
    df_pairs_bug1_dataset_bug_id = df_pairs_final.merge(dataset, left_on='bug1', right_on='bug_id')

    # Join on columns 'bug2' and 'bug_id'.
    result = df_pairs_bug1_dataset_bug_id.merge(dataset, left_on='bug2', right_on='bug_id',
                                                suffixes=('_left', '_right'))

    result.drop(['bug1', 'bug2'], axis=1, inplace=True)

    # Rename column 'dec' to 'label'.
    result.rename(columns={"dec": "label"}, errors="raise", inplace=True)

    return result
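As a possible design note on the apply-based keys above: the same columns can be computed with vectorized pandas string operations, avoiding one Python call per row on large frames. A minimal sketch on a throwaway frame rather than the real 'pairs' collection:

import pandas as pd

df_pairs = pd.DataFrame({'bug1': [1, 2, 3], 'bug2': [2, 1, 3]})
# Vectorized equivalents of the f-string lambdas used above.
df_pairs['bug1-bug2'] = df_pairs['bug1'].astype(str) + '-' + df_pairs['bug2'].astype(str)
df_pairs['bug2-bug1'] = df_pairs['bug2'].astype(str) + '-' + df_pairs['bug1'].astype(str)
df_pairs['duplicated'] = df_pairs['bug1-bug2'] == df_pairs['bug2-bug1']
print(df_pairs)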
Example #7
    log.info(f"Label column name for task '{input_params['task']}' and corpus '{input_params['corpus']}':"
             f" {label_column_name}")

    # Load normalized_clear collection.
    projection = {'_id': 0, 'label': f"${label_column_name}"}

    collection_name = get_label_collection_name(input_params['task'], input_params['corpus'])
    if 'similarity' == input_params['task']:
        if not input_params['near_issues']:
            collection_name = f"similar_{collection_name}"
        else:
            collection_name = f"near_{collection_name}"

    df_labels = load_dataframe_from_mongodb(
        database_name=input_params['corpus'],
        collection_name=collection_name,
        projection=projection
    )

    # Group by label column.
    labels_value_counts = df_labels['label'].value_counts()
    log.info(f"Number of distinct label values: {labels_value_counts.shape[0]}")

    df_distinct_labels = pd.DataFrame(
        data=labels_value_counts.keys().to_list(),
        columns=['label']
    )

    # Convert the label column type to 'category'.
    df_distinct_labels['label'] = df_distinct_labels['label'].astype('category')
Example #8
        'STRUCTURED_DATA_COLUMN_NAMES'].split(',')

    if len(structured_data_column_name) == 0:
        raise ValueError('No structured data column names defined.')
    log.info(f"Structured data column name: {structured_data_column_name}")

    # Load normalized_clear collection.
    projection = {'_id': 0}

    for column in structured_data_column_name:
        projection[column] = 1

    log.info(f"projection:{projection}")

    df_structured_data = load_dataframe_from_mongodb(
        database_name=input_params['corpus'],
        collection_name=input_params['collection_name'],
        projection=projection)

    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[input_params['corpus']]

    # Group by structured data column.
    for column in structured_data_column_name:
        df = pd.DataFrame(columns=[column])
        column_value_counts = df_structured_data[column].value_counts()

        log.info(
            f"Number of distinct values in column '{column}': {column_value_counts.shape[0]}"
        )
Example #9
        "creation_ts": {
            "$gte":
            datetime.datetime(input_params['year'], 1, 1).strftime('%Y-%m-%d'),
            "$lt":
            datetime.datetime(input_params['year'] + 1, 1,
                              1).strftime('%Y-%m-%d')
        }
    }

    if input_params['closed_states']:
        query['bug_status'] = {"$in": ["CLOSED", "RESOLVED", "VERIFIED"]}

    projection = {'_id': 0, 'bug_id': 1, 'assigned_to': 1}
    df_clear = load_dataframe_from_mongodb(
        database_name=input_params['corpus'],
        collection_name='clear',
        query=query,
        projection=projection)

    # Check empty Dataframe.
    if 0 == df_clear.shape[0]:
        raise ValueError(
            f"No documents have been retrieved from '{input_params['corpus']}.clear' collection for the "
            f"year {input_params['year']}")

    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[input_params['corpus']]
    col_name = 'normalized_clear_updated' if input_params[
        'closed_states'] else 'normalized_clear_all_states'
Example #10
    def _aggregate_metrics(self,
                           metric_name: list = None,
                           sort_by: list = None,
                           num_models: int = 0,
                           fields: list = None) -> dict:
        """Aggregate the metrics of all experiments in MongoDB.

        Args:
            metric_name: (list) metric names
            sort_by: (list) columns to sort by
            num_models: (int) maximum number of models kept after sorting
            fields: (list) extra columns to keep in the result

        Example:

            _aggregate_metrics(
                ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro'], ['accuracy'], 1
            )
        """
        query = self._get_experiments_query()
        projection = self._get_experiments_projection()

        # All results are retrieved so they can be limited after sorting.
        df = load_dataframe_from_mongodb(database_name=self.save_dbname,
                                         collection_name=self.save_collection,
                                         query=query,
                                         projection=projection,
                                         query_limit=0)

        if df.empty:
            log.error(
                f"No metrics saved in '{self.save_dbname}.{self.save_collection}' for task '{self.task}' "
                f"and corpus '{self.corpus}'.")
            raise ValueError(
                'Ensure database name, collection name, task name and corpus are correct.'
            )

        # Explode MongoDB fields.
        df_dataset = pd.json_normalize(df['dataset'])
        df_scheduler = pd.json_normalize(df['scheduler'])
        df_model = pd.json_normalize(df['model'])
        df_metrics = pd.json_normalize(df['metrics'])
        exploded_df = pd.concat(
            [df['task_id'], df_dataset, df_scheduler, df_model, df_metrics],
            axis=1)

        # Sort by the requested columns, best first.
        if sort_by is not None and len(sort_by) > 0:
            for col in sort_by:
                assert col in exploded_df.columns, log.error(
                    f"Column '{col}' does not exist in saved experiments.")
            exploded_df = exploded_df.sort_values(by=sort_by, ascending=False)

        # Keep only the requested number of models.
        if num_models > 0:
            exploded_df = exploded_df.head(num_models)

        # Check if metric name exists.
        df_metrics_columns = df_metrics.columns
        self._metric_name_exits(metric_name, list(df_metrics_columns))

        excluded_fields = []
        if fields is not None:
            for column in exploded_df.columns:
                if column not in fields + metric_name:
                    excluded_fields.append(column)

            exploded_df.drop(list(excluded_fields), axis=1, inplace=True)
            metric_name = None

        # Select fields and metrics
        exploded_df_columns = exploded_df.to_dict('records')
        result = self._get_summary_fields(exploded_df_columns, metric_name,
                                          list(df_metrics_columns))

        return result
Example #11
if __name__ == "__main__":
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the parameters.
    input_params = get_input_params()
    assert input_params is not None, f"No params provided."

    log.info(f"Building 'normalized_clear' collection ...")

    # Load clear collection.
    df_clear = load_dataframe_from_mongodb(
        database_name=input_params['db_name'],
        collection_name=input_params['collection_name'])

    # Check empty Dataframe.
    if 0 == df_clear.shape[0]:
        raise ValueError(
            f"No documents have been retrieved from "
            f"'{input_params['db_name']}.{input_params['collection_name']}' collection."
        )

    df_normalized_clear = df_clear.copy()

    # Normalize short description.
    df_normalized_clear['normalized_short_desc'] = df_normalized_clear[
        'short_desc'].apply(
            lambda x: normalize_incidence(x, to_lower_case=True))