Code Example #1
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    for project in os.environ["BUGZILLA_PROJECT_NAME"].split(","):
        for year in range(int(os.environ['BUGZILLA_FIRST_CREATION_YEAR']),
                          int(os.environ['BUGZILLA_LAST_CREATION_YEAR'])):
            cmd = f"python CreateBugzillaMongoDBCollection.py --p {project}" \
                  f" --db {os.environ['BUGZILLA_MONGODB_DATABASE_NAME']}" \
                  f" --c {project.lower()} --y {year} --sm 1 --em 12"

            # Run the command in the Windows console.
            check_output(cmd, shell=True)
            print(cmd)

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
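Every entry point in these examples calls check_same_python_module_already_running before doing any work. Its implementation is not shown on this page; below is a minimal sketch of what it presumably does, assuming a psutil-based scan and exit-on-duplicate behavior (both are assumptions):

import os
import sys

import psutil


def check_same_python_module_already_running(module_path):
    # module_path is the (directory, filename) tuple produced by os.path.split(__file__).
    module_name = module_path[1]
    current_pid = os.getpid()
    for process in psutil.process_iter(['pid', 'cmdline']):
        cmdline = process.info.get('cmdline') or []
        # Another process whose command line mentions this module counts as a duplicate.
        if process.info['pid'] != current_pid and any(module_name in str(arg) for arg in cmdline):
            print(f"Module '{module_name}' is already running (pid {process.info['pid']}). Exiting.")
            sys.exit(1)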
Code Example #2
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the parameters.
    input_params = get_input_params()

    log.info(f"Creating index on all collections ...")

    corpus_list = input_params['corpus_list'] if '' != input_params['corpus_list'] else \
        os.environ["CORPUS_NAME"].split(",")

    for corpus in corpus_list:
        tic = time.time()
        cmd = get_command(os.name, corpus, 'normalized_clear', 'creation_ts',
                          -1)

        # Run command.
        log.info(f"Running command: '{cmd}'")
        os.system(cmd)

    final_time = time.time()
    log.info(
        f"Creating index on all collections total execution time = {((final_time - initial_time) / 60)} minutes"
    )
    log.info(f"MODULE EXECUTED.")
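get_command is not shown on this page. A plausible sketch for the index-creation call above, assuming it wraps the module from Code Example #8 (the script name CreateMongoDBIndex.py and the flag names are assumptions):

def get_command(os_name, db_name, collection_name, field_name, order):
    # 'nt' means Windows; elsewhere assume a python3 binary on the PATH.
    python_exe = 'python' if os_name == 'nt' else 'python3'
    return (f"{python_exe} CreateMongoDBIndex.py --db {db_name}"
            f" --c {collection_name} --f {field_name} --o {order}")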
Code Example #3
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB parameters used to store the statistics.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ["JIRA_MONGODB_DATABASE_NAME"]]
    col = db[os.environ["JIRA_STATISTICS_MONGODB_COLLECTION_NAME"]]

    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        # Initialize the result per project so yearly counts do not leak between projects.
        statistics_dict = {"project": project}
        total_bugs = 0
        for year in range(int(os.environ["JIRA_FIRST_CREATION_YEAR"]), datetime.datetime.now().year + 1):
            statistics_dict[year] = {}
            statistics_dict[year]["_total"] = 0
            total_bugs_year = 0
            # month_statistics_dict = {}
            for month in range(1, 13):
                max_year = year
                max_month = month + 1
                if max_month > 12:
                    max_month = 1
                    max_year += 1
                issues = get_issues_by_date_range(
                    project=project,
                    min_created_date=f"{year}-{str(month).zfill(2)}-01",
                    max_creation_date=f"{max_year}-{str(max_month).zfill(2)}-01",
                    max_results=-1,  # -1 presumably means "no limit"
                    fields="id"
                )
                # Total number of issues for the analyzed month.
                # month_statistics_dict.append({"month": month, "count": len(bugs)})
                # month_statistics_dict.append({month: len(bugs)})
                statistics_dict[year][month] = len(issues)
                total_bugs_year += len(issues)
            # Total number of issues for the analyzed year.
            statistics_dict[year]["_total"] = total_bugs_year
            # Monthly statistics.
            # statistics_dict[year]["bugs"] = month_statistics_dict
            log.info(json.dumps(statistics_dict))
            # Total number of issues for the project.
            total_bugs += total_bugs_year
        statistics_dict["_total"] = total_bugs
        # Store the statistics for the project.
        statistics_json = json.dumps(statistics_dict)
        col.insert_one(json.loads(statistics_json))
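For reference, the document inserted per project has roughly this shape once json.dumps has converted the integer year and month keys to strings (all values below are made up for illustration):

statistics_example = {
    "project": "HIVE",  # hypothetical Jira project
    "2010": {"_total": 18, "1": 2, "2": 0, "3": 1, "4": 3, "5": 2, "6": 1,
             "7": 0, "8": 4, "9": 1, "10": 2, "11": 1, "12": 1},
    "_total": 18
}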
Code Example #4
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            tic = time.time()
            log.info(f"Retrieving Gerrit issues for year '{year}' ...")
            cmd = get_command(os.name, project, year)

            # Run command.
            log.info(f"Running command: '{cmd}'")
            os.system(cmd)
            log.info(
                f"Retrieving Gerrit issues execution time = {((time.time() - tic) / 60)} minutes"
            )

    final_time = time.time()
    log.info(
        f"Retrieving Gerrit issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
Code Example #5
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[os.environ['GERRIT_DB_NAME']]
    log.debug(
        f"Existing collections in '{db.name}': {str(db.list_collection_names())}"
    )

    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            if col.name in db.list_collection_names():
                log.info(f"Dropping collection {col.name} ...")
                db.drop_collection(col.name)

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
Code Example #6
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    log.info(f"START OF EXECUTION")
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()

    dup = TreeLstmDuplicateTrain(corpus='bugzilla',
                                 collection='clear',
                                 attention=False,
                                 attention_size=10,
                                 glove_size=100,
                                 hidden_size=100,
                                 max_input=200,
                                 batch_size=1,
                                 optimizer='ADAM',
                                 learning_rate=0.001,
                                 update_embeddings=True,
                                 patience=5).load_or_run()

    output_dir = 'resultados/dump'
Code Example #7
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the parameters.
    input_params = get_input_params()

    log.info(f"Encoding all pairs labels ...")

    corpus_list = input_params['corpus_list'] if '' != input_params['corpus_list'] else \
        os.environ["CORPUS_NAME"].split(",")

    for corpus in corpus_list:
        log.info(f"Encoding pairs label for corpus: '{corpus}' ...")
        tic = time.time()
        cmd = get_command(os.name, corpus,
                          ','.join(get_pairs_collection_name(corpus)))

        # Run command.
        log.info(f"Running command: '{cmd}'")
        os.system(cmd)
        log.info(
            f"Encoding pairs label total execution time = {((time.time() - tic) / 60)} minutes"
        )

    final_time = time.time()
    log.info(
        f"Encoding all pairs labels total execution time = {((final_time - initial_time) / 60)} minutes"
    )
    log.info(f"MODULE EXECUTED.")
Code Example #8
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    order = 'ascending' if input_params['order'] == 1 else 'descending'
    log.info(
        f"Creating {order} index on field: '{input_params['db_name']}.{input_params['collection_name']}."
        f"{input_params['field_name']}' ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['db_name']]
    col = db[input_params['collection_name']]
    col.create_index([
        (input_params['field_name'],
         DESCENDING if input_params['order'] == -1 else ASCENDING)
    ])

    final_time = time.time()
    log.info(
        f"Creating {order} index on field: '{input_params['field_name']}' total execution time = "
        f"{((final_time - initial_time) / 60)} minutes")
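For context, pymongo's create_index accepts a list of (field, direction) pairs. A standalone equivalent of the call above (the connection string, database, collection, and field names are illustrative):

from pymongo import MongoClient, DESCENDING

client = MongoClient("mongodb://localhost:27017")  # illustrative connection string
col = client["bugzilla"]["eclipse"]
# Same shape as the call in main(): one (field, direction) pair.
col.create_index([("creation_ts", DESCENDING)])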
Code Example #9
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        for year in range(2001, 2021):
            cmd = f"mongoimport --host {os.environ['MONGO_HOST_IP']} --port {os.environ['MONGO_PORT']}" \
                  f" --username {os.environ['MONGO_USERNAME']} --password {os.environ['MONGO_PASSWORD']}" \
                  f" --authenticationDatabase admin --authenticationMechanism SCRAM-SHA-1" \
                  f" --db {os.environ['JIRA_MONGODB_DATABASE_NAME']}" \
                  f" --collection {project.lower()}_all" \
                  f" --file {Path(ROOT_DIR) / 'data' / project.lower()}_{year}_{year + 1}.json"

            # Run the command in the Windows console.
            check_output(cmd, shell=True)
            print(cmd)

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
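With the environment variables filled in, each generated command looks roughly like this (host, credentials, and paths are illustrative; wrapped here for readability):

mongoimport --host 127.0.0.1 --port 27017 --username syn --password secret
    --authenticationDatabase admin --authenticationMechanism SCRAM-SHA-1
    --db jira --collection spark_all --file C:/syn/data/spark_2010_2011.json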
Code Example #10
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[os.environ['JIRA_MONGODB_DATABASE_NAME']]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")

    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        for year in range(2001, 2021):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            if col.name in db.list_collection_names():
                db.drop_collection(col.name)

    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
Code Example #11
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[input_params.db_name]
    source_collection = db[input_params.collection_name]
    target_collection = db[f"{input_params.collection_name}_embeddings"]

    log.debug(
        f"Existing collections in '{db.name}': {str(db.list_collection_names())}"
    )

    if target_collection.name in db.list_collection_names(
    ) and input_params.drop_collection:
        db.drop_collection(target_collection.name)

    cursor = source_collection.find({}, {
        "creation_ts": "$creation_time",
        "short_desc": "$summary",
        "bug_status": "$status",
        "bug_id": "$id",
        "dup_id": "$dupe_of",
        "resolution": 1,
        "version": 1,
        "product": 1,
        "priority": 1,
        "component": 1,
        "delta_ts": 1,
        "bug_severity": "$severity",
        # First element of the comments array as the description.
        "description": {"$arrayElemAt": ["$comments", 0]},
        "normalized_short_desc": 1,
        "normalized_description": 1,
        "comments": {
            "$slice": 1
        }
    })

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
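Aggregation expressions inside a find() projection, as used above, require MongoDB 4.4 or newer. On older servers the same reshaping can be written as an aggregation pipeline; a sketch with the same field names:

# Equivalent reshaping via the aggregation framework; source_collection as above.
cursor = source_collection.aggregate([{
    "$project": {
        "creation_ts": "$creation_time",
        "short_desc": "$summary",
        "bug_status": "$status",
        "bug_id": "$id",
        "dup_id": "$dupe_of",
        "resolution": 1,
        "version": 1,
        "product": 1,
        "priority": 1,
        "component": 1,
        "delta_ts": 1,
        "bug_severity": "$severity",
        "normalized_short_desc": 1,
        "normalized_description": 1,
        "description": {"$arrayElemAt": ["$comments", 0]},  # first comment
        "comments": {"$slice": ["$comments", 1]}  # aggregation form of $slice
    }
}])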
Code Example #12
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[input_params['mongo_params'].db_name]
    col = db[
        f"{input_params['mongo_params'].collection_name}"
        f"_{input_params['bz_api_params'].year}_{input_params['bz_api_params'].year + 1}"]

    log.debug(
        f"Existing collections in '{db.name}': {str(db.list_collection_names())}"
    )

    if col.name in db.list_collection_names(
    ) and input_params['mongo_params'].drop_collection:
        db.drop_collection(col.name)

    # Retrieve the issues month by month using the Bugzilla API.
    max_year = input_params['bz_api_params'].year
    for month in range(input_params['bz_api_params'].start_month,
                       input_params['bz_api_params'].end_month + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        bugs = get_bugzilla_bugs_by_date_range(
            project=input_params['bz_api_params'].project,
            min_creation_ts=f"{input_params['bz_api_params'].year}-{str(month).zfill(2)}-01",
            max_creation_ts=f"{max_year}-{str(max_month).zfill(2)}-01",
            max_results=input_params['bz_api_params'].query_limit,
            include_fields=input_params['bz_api_params'].include_fields,
            get_comments=input_params['bz_api_params'].get_comments)
        save_issues_to_mongodb(mongodb_collection=col, issues=bugs)

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
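save_issues_to_mongodb is not shown on this page; a minimal sketch, assuming it simply bulk-inserts the retrieved issues:

def save_issues_to_mongodb(mongodb_collection, issues):
    # Bulk-insert the issues; guard against an empty list, which would make insert_many raise.
    if not issues:
        return 0
    result = mongodb_collection.insert_many(issues)
    return len(result.inserted_ids)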
Code Example #13
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB client and the source collections.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    eclipse_base_col = db["eclipse_base"]

    # Fields selected for the studies addressed in the research project.
    fields = {"_id": 0}

    eclipse_base_data = eclipse_base_col.find({}, fields)
    # .limit(20)

    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(eclipse_base_data))

    eclipse_base_summary_col = db["clear_description_nlp_2"]
    eclipse_base_summary_data = eclipse_base_summary_col.find({}, fields)
    eclipse_base_summary = pd.DataFrame(list(eclipse_base_summary_data))

    eclipse_base_data_description_col = db["clear_short_desc_nlp"]
    eclipse_base_data_description_data = eclipse_base_data_description_col.find(
        {}, fields)
    eclipse_base_data_description = pd.DataFrame(
        list(eclipse_base_data_description_data))

    # Merge the dataframes.
    eclipse_base_summary_description = pd.merge(eclipse_base_summary,
                                                eclipse_base_data_description,
                                                on='id')
    eclipse_base_clear = pd.merge(eclipse_base,
                                  eclipse_base_summary_description,
                                  on='id')

    # Store the dataframe in MongoDB.
    db["eclipse_base_clear"].insert_many(eclipse_base_clear.to_dict('records'))

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
Code Example #14
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]

    gerrit_collections = db.list_collection_names()
    log.debug(
        f"Existing collections in '{db.name}': {str(db.list_collection_names())}"
    )

    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        df_list = []
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{input_params['collection_name']}"
                     f"_{year}_{year + 1}"]
            if col.name in gerrit_collections:
                tic = time.time()
                log.info(f"Retrieving Gerrit issues for year '{year}' ...")
                data = col.find({}, {'_id': 0})
                df = pd.DataFrame(list(data))
                log.info(f"Gerrit issues for year '{year}': {df.shape[0]}")
                df_list.append(df)
                log.info(
                    f"Retrieving Gerrit issues for year '{year}' execution time = {((time.time() - tic) / 60)} minutes"
                )

        df_concatenated = pd.concat(df_list)
        table_dict = df_concatenated.to_dict("records")
        if project.lower() in gerrit_collections:
            db.drop_collection(project.lower())
        db[project.lower()].insert_many(table_dict)
    final_time = time.time()
    log.info(
        f"Retrieving Gerrit issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
Code Example #15
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Logger.
    log = set_logger()
    log.debug(f"\n[START OF EXECUTION]")

    load_environment_variables()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Input parameters.
    input_params = get_input_params()

    # Defines the java executable.
    java_exe = Path(os.environ['JAVA_HOME']) / 'bin' / 'java.exe'

    # Command.
    cmd = f"{java_exe} -cp {get_java_classpath()} {input_params['nlp_params'].java_class_name} " \
          f"-host {input_params['mongo_params'].host} " \
          f"-port {input_params['mongo_params'].port} " \
          f"-dbName {input_params['mongo_params'].db_name} " \
          f"-collName {input_params['mongo_params'].collection_name} " \
          f"-startYear {input_params['filter_params'].start_year} " \
          f"-endYear {input_params['filter_params'].end_year} " \
          f"-textColumnName {input_params['filter_params'].column_name} " \
          f"-maxNumTokens {input_params['nlp_params'].max_num_tokens} " \
          f"-parserModel {input_params['nlp_params'].parser_model} " \
          f"-createTrees {input_params['nlp_params'].get_trees} " \
          f"-calcEmbeddings {input_params['nlp_params'].get_embeddings} " \
          f"-calcCoherence {input_params['nlp_params'].get_coherence}"

    log.info(f"Running command: '{cmd}'")

    # Run command.
    os.system(cmd)

    log.info(f"\n[END OF EXECUTION]")
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
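get_java_classpath is not shown either. A sketch under the assumption that the Java dependencies are jars in a lib directory (the SYN_JAVA_LIB_DIR variable is hypothetical):

import os
from pathlib import Path


def get_java_classpath():
    # Join every jar in the lib directory with the platform's classpath separator.
    lib_dir = Path(os.environ.get('SYN_JAVA_LIB_DIR', 'lib'))  # hypothetical variable
    return os.pathsep.join(str(jar) for jar in lib_dir.glob('*.jar'))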
Code Example #16
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the parameters.
    input_params = get_input_params()

    log.info(f"Filtering all word embeddings ...")

    corpus_list = input_params['corpus_list'] if '' != input_params['corpus_list'] else \
        os.environ["CORPUS_NAME"].split(",")
    embeddings_model_list = input_params['embeddings_model_list'] if '' != input_params['embeddings_model_list'] else \
        os.environ["EMBEDDINGS_MODEL"].split(",")
    embeddings_size_list = input_params['embeddings_size_list'] if '' != input_params['embeddings_size_list'] else \
        os.environ["EMBEDDINGS_SIZE"].split(",")

    for corpus in corpus_list:
        log.info(f"Filtering word embeddings for corpus: '{corpus}' ...")
        for model in embeddings_model_list:
            log.info(f"Filtering word embeddings for model: '{model}' ...")
            for size in embeddings_size_list:
                tic = time.time()
                log.info(
                    f"Filtering pre-trained word embeddings of size: '{int(size)}' ..."
                )
                cmd = get_command(os.name, corpus, model, int(size), True)

                # Run command.
                log.info(f"Running command: '{cmd}'")
                os.system(cmd)
                log.info(
                    f"Filtering pre-trained word embeddings total execution time "
                    f"= {((time.time() - tic) / 60)} minutes")

    final_time = time.time()
    log.info(
        f"Filtering all word embeddings total execution time = {((final_time - initial_time) / 60)} minutes"
    )
    log.info(f"MODULE EXECUTED.")
Code Example #17
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the parameters.
    input_params = get_input_params()

    log.info(f"Building all datasets ...")

    task_list = input_params['task_list'].split(",") if '' != input_params['task_list'] else \
        os.environ["TASK_NAME"].split(",")
    corpus_list = input_params['corpus_list'].split(",") if '' != input_params['corpus_list'] else \
        os.environ["CORPUS_NAME"].split(",")

    for corpus in corpus_list:
        log.info(f"Building datasets for corpus: '{corpus}' ...")
        for task in task_list:
            tic = time.time()
            log.info(f"Building datasets for task: '{task}' ...")
            cmd1 = get_command(os.name, task, corpus, False)
            cmd2 = get_command(os.name, task, corpus, True)

            # Run command.
            log.info(f"Running command: '{cmd1}'")
            os.system(cmd1)
            log.info(f"Building unbalanced dataset total execution time = {((time.time() - tic) / 60)} minutes")
            tic = time.time()
            log.info(f"Running command: '{cmd2}'")
            os.system(cmd2)
            log.info(f"Building balanced dataset total execution time = {((time.time() - tic) / 60)} minutes")

    final_time = time.time()
    log.info(f"Building all datasets total execution time = {((final_time - initial_time) / 60)} minutes")
    log.info(f"MODULE EXECUTED.")
Code Example #18
File: useful_data.py  Project: mlazarodominguez/syn
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Processing Gerrit issues ...")

    # MongoDB data.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    col = db[input_params['collection_name']]
    data = col.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    df['bug_id'] = -1
    # Search each subject for a bug id matching either pattern.
    bug_id_list = []
    loop = tqdm(range(df.shape[0]))
    for i in loop:
        matched_1 = re.search(r'\[\s*\+?(-?\d+)\s*]', df.loc[i, 'subject'])
        matched_2 = re.search(r'[Bb][Uu][Gg]\s[0-9]+', df.loc[i, 'subject'])
        if matched_1 or matched_2:
            flag = True
            res = re.findall(r'[Bb][Uu][Gg]\s[0-9]+', df.iloc[i]['subject'])
            if not res:
                flag = False
                res = re.findall(r'\[\s*\+?(-?\d+)\s*]', df.iloc[i]['subject'])
            bug_id = int(res[0][4:]) if flag else int(res[0])
            if bug_id in bug_id_list:
                idx = df.index[df['bug_id'] == bug_id].tolist()
                if len(idx) > 1:
                    raise ValueError(
                        f"There are more than one bug with 'bug_id' = {bug_id}"
                    )
                previous_file_list = get_filtered_file_list(
                    df.iloc[idx[0]]['file_list'])
                current_file_list = get_filtered_file_list(
                    df.iloc[i]['file_list'])
                file_list = set(previous_file_list + current_file_list)
                df.at[idx[0], 'file_list'] = list(file_list)
            else:
                bug_id_list.append(bug_id)
                df.at[i, 'bug_id'] = bug_id
                df.at[i, 'file_list'] = get_filtered_file_list(
                    df.iloc[i]["file_list"])

    # Drop collection if already exists.
    if input_params['output_collection_name'] in db.list_collection_names():
        log.info(
            f"Dropping collection {input_params['output_collection_name']} ..."
        )
        db.drop_collection(input_params['output_collection_name'])

    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df.loc[df['bug_id'] != -1], num_batches)

    # Insert documents with bug_id in MongoDB.
    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = db[
            input_params['output_collection_name']].insert_many(
                batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)

    final_time = time.time()
    log.info(
        f"Processing Gerrit issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
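To illustrate the two subject patterns used above (the sample subjects are made up):

import re

samples = ["Bug 565302 - NPE in the editor", "[565302] NPE in the editor"]
for subject in samples:
    by_word = re.findall(r'[Bb][Uu][Gg]\s[0-9]+', subject)
    by_brackets = re.findall(r'\[\s*\+?(-?\d+)\s*]', subject)
    # 'Bug 565302' keeps the 'Bug ' prefix, so the id starts at offset 4.
    bug_id = int(by_word[0][4:]) if by_word else int(by_brackets[0])
    print(f"{subject!r} -> {bug_id}")  # both samples yield 565302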
Code Example #19
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Finding similar issues ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))

    # Check empty Dataframe.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")

    # Initialize empty list column.
    df['sim_bugs'] = [[] for i in range(len(df))]
    df['near_bugs'] = [[] for i in range(len(df))]

    # Iterate over all rows.
    all_sim_bugs = []
    all_near_bugs = []
    for i in tqdm(range(df.shape[0])):
        # Compare each row with all rows.
        for j in range(df.shape[0]):
            if j == i:
                continue
            bug_anc = df.at[i, 'bug_id']
            bug_pos = df.loc[j, 'bug_id']
            jaccard_similarity = jaccard_score(df.loc[i, 'file_list'],
                                               df.loc[j, 'file_list'])
            if jaccard_similarity >= float(
                    input_params['similarity_threshold']):
                if [bug_anc, bug_pos] not in all_sim_bugs and [
                        bug_pos, bug_anc
                ] not in all_sim_bugs:
                    df.at[i, 'sim_bugs'].append(int(bug_pos))
                    all_sim_bugs.append([bug_anc, bug_pos])

            if float(input_params['similarity_threshold']) - 0.25 <= jaccard_similarity < \
                    float(input_params['similarity_threshold']):
                if [bug_anc, bug_pos] not in all_near_bugs and [
                        bug_pos, bug_anc
                ] not in all_near_bugs:
                    df.at[i, 'near_bugs'].append(int(bug_pos))
                    all_near_bugs.append([bug_anc, bug_pos])

    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df, num_batches)

    output_collection = db[input_params['output_collection_name']]

    # Drop collection if already exists.
    if output_collection.name in db.list_collection_names():
        log.info(
            f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)

    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(
            batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)

    log.info(f"Inserted documents: {inserted_docs_number}")

    final_time = time.time()
    log.info(
        f"Finding similar issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
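jaccard_score here takes two file lists, so it is presumably a project helper rather than scikit-learn's label-based metric of the same name; a minimal set-based sketch:

def jaccard_score(file_list_1, file_list_2):
    # |intersection| / |union| of the two file sets.
    set_1, set_2 = set(file_list_1), set(file_list_2)
    union = set_1 | set_2
    return len(set_1 & set_2) / len(union) if union else 0.0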
Code Example #20
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Finding duplicate issues ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    # Load Bugzilla data.
    db = mongodb_client[input_params['input_db_name']]

    # Collection with all issues (duplicates and non-duplicates).
    original_collection = db[input_params['original_collection_name']]
    query = {}
    projection = {'_id': 0, 'bug_id': 1}
    log.info(f"Reading data from '{db.name}.{original_collection.name}' ...")
    original_data = original_collection.find(query, projection)
    df_original = pd.DataFrame(list(original_data))

    # Check empty Dataframe.
    if 0 == df_original.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{original_collection.name}' collection.")

    # List with all bug_id.
    original_bug_id_list = df_original['bug_id'].to_list()

    collection_pairs_first_step = db[input_params['input_collection_name']]
    query = {}
    projection = {'_id': 0}
    data_pairs_first_step = collection_pairs_first_step.find(query, projection)
    df_pairs_first_step = pd.DataFrame(list(data_pairs_first_step))

    # Check empty Dataframe.
    if 0 == df_pairs_first_step.shape[0]:
        raise ValueError(
            f"No documents have been retrieved from "
            f"'{db.name}.{collection_pairs_first_step.name}' collection.")

    bug1_list = df_pairs_first_step['bug1'].to_list()
    bug2_list = df_pairs_first_step['bug2'].to_list()

    no_duplicate_bug_id_list = set(original_bug_id_list) - set(
        bug1_list) - set(bug2_list)

    # For each issue that has similar issues, generate a pair of similar issues and a pair of non-similar issues.
    pairs_from_indirect_relations = check_indirect_relations(
        df_pairs_first_step, list(no_duplicate_bug_id_list))
    log.info(f"Pairs generated: {len(pairs_from_indirect_relations)}")

    # Dataframe pairs.
    df_pairs = pd.DataFrame(pairs_from_indirect_relations)

    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)

    output_collection = db[input_params['output_collection_name']]

    # Drop collection if already exists.
    if output_collection.name in db.list_collection_names():
        log.info(
            f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)

    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(
            batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)

    log.info(f"Inserted documents: {inserted_docs_number}")

    final_time = time.time()
    log.info(
        f"Finding duplicate issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
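check_indirect_relations is not shown on this page. A hypothetical sketch of the pair-generation idea: each first-step pair yields a positive example, balanced with a negative example drawn from the issues that have no duplicates (the 'bug1'/'bug2' column names come from the code above; the 'dec' label name is an assumption):

import random


def check_indirect_relations(df_pairs_first_step, no_duplicate_bug_id_list):
    pairs = []
    for _, row in df_pairs_first_step.iterrows():
        # Positive pair taken from the first step.
        pairs.append({'bug1': row['bug1'], 'bug2': row['bug2'], 'dec': 1})
        # Negative pair against an issue with no known duplicates.
        pairs.append({'bug1': row['bug1'],
                      'bug2': random.choice(no_duplicate_bug_id_list),
                      'dec': 0})
    return pairs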
Code Example #21
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB client and the source collection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    col = db["clear"]

    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "bug_id": 1,
        # "product": 1,
        # "description": 1,
        # "bug_severity": 1,
        # "dup_id": 1,
        "short_desc": 1,
        # "priority": 1,
        # "version": 1,
        # "component": 1,
        # "delta_ts": 1,
        "bug_status": 1
        # "creation_ts": 1,
        # "resolution": 1
    }

    # Only resolved issues will be used.
    query = {
        '$or': [{
            'bug_status': {
                '$eq': 'RESOLVED'
            }
        }, {
            'bug_status': {
                '$eq': 'VERIFIED'
            }
        }]
    }

    data = col.find(query, fields)
    # .limit(20)

    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))

    print(clear_nlp)

    nltk.download("stopwords", quiet=True)

    clear_nlp["short_desc_split_alpha"] = clear_nlp["short_desc"].apply(
        lambda x: clean_doc_split(x))
    clear_nlp["short_desc_lower"] = clear_nlp["short_desc_split_alpha"].apply(
        lambda x: clean_doc_lower(x))
    clear_nlp["short_desc_punctuaction"] = clear_nlp["short_desc_lower"].apply(
        lambda x: clean_doc_punctuaction(x))
    clear_nlp["short_desc_trim"] = clear_nlp["short_desc_punctuaction"].apply(
        lambda x: clean_doc_trim(x))
    clear_nlp["short_desc_isalpha"] = clear_nlp["short_desc_trim"].apply(
        lambda x: clean_doc_isalpha(x))
    clear_nlp["short_desc_stop_words"] = clear_nlp["short_desc_isalpha"].apply(
        lambda x: clean_doc_stopW(x))
    clear_nlp["short_desc_diacritic"] = clear_nlp[
        "short_desc_stop_words"].apply(lambda x: clean_doc_diacri(x))
    clear_nlp["short_desc_lemmatizer"] = clear_nlp[
        "short_desc_diacritic"].apply(lambda x: clean_doc_lem(x))

    # Drop the 'bug_status' column because a merge with the original collection will be performed later.
    clear_nlp.drop('bug_status', axis=1, inplace=True)

    # Store the dataframe in MongoDB.
    db["clear_short_desc_nlp"].insert_many(clear_nlp.to_dict('records'))

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
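The clean_doc_* helpers are not shown on this page. As one example of the chain, a minimal sketch of the stop-word step, assuming the columns hold token lists and NLTK's English stop-word list is used:

from nltk.corpus import stopwords


def clean_doc_stopW(tokens):
    # Drop English stop words; nltk.download("stopwords") must have run beforehand.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]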
Code Example #22
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB client and the source collection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]

    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}

    # Only resolved issues will be used.

    aggregation_match = {"$match": {
        '$and': [
            {
                '$or': [
                    {'status': {'$eq': 'RESOLVED'}},
                    {'status': {'$eq': 'VERIFIED'}},
                    {'status': {'$eq': 'CLOSED'}}
                ]
            },
            {'comments': {'$exists': True}},
            {'comments': {'$ne': None}},
            {'comments': {'$ne': ""}},
            {'comments': {'$not': {'$size': 0}}},
            # In the Bugzilla REST API, the comment with count 0 is the description.
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}
    aggregation_limit = {"$limit": 20}

    # data = col.find(query, fields).limit(20)
    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        # aggregation_limit,
        aggregation_project
    ])

    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))

    # Store the dataframe in MongoDB.
    db["eclipse_base"].insert_many(eclipse_base.to_dict('records'))

    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
Code Example #23
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Logger.
    log = set_logger()

    log.debug(f"\n[START OF EXECUTION]")

    load_environment_variables()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Years range.
    input_params = get_input_params()

    # Databases.
    databases = [input_params['corpus']] if input_params['corpus'] != '' \
        else os.environ["EMBEDDING_MONGODB_DATABASE_NAME"].split(",")

    # Java class.
    java_class_name = "UpdateMongoDBNLPFields"

    # Control params.
    model_param = f"--pm {'corenlp'}" if (input_params['get-coherence'] and input_params['get-trees']) \
        else f"--pm {'srparser'}"
    trees_param = "--get-trees" if input_params['get-trees'] else "--no-get-trees"
    embeddings_param = "--get-embeddings " if (input_params['get-embeddings'] and input_params['get-trees']) \
        else "--no-get-embeddings"
    coherence_param = "--get-coherence" if (input_params['get-coherence'] and input_params['get-trees']) \
        else "--no-get-coherence"

    # Defines Python executable.
    python_exe = os.environ.get('PYTHON_EXECUTABLE', sys.executable)

    # Loop over databases and years to update the NLP fields.
    tokens_initial_time = time.time()
    log.info(f"Updating NLP fields ...")
    for db in databases:
        log.info(f"\nProcessing database: '{db}'.")
        for year in range(input_params['start_year'], input_params['end_year']):
            log.info(f"\n[FOR LOOP] Processing years: {year} - {year + 1}")
            cmd = f"{python_exe} UpdateVectorizedMongoDBCollection.py --jcn {java_class_name}" \
                  f" --mh {os.environ['MONGO_HOST_IP']}" \
                  f" --mp {os.environ['MONGO_PORT']}" \
                  f" --db {db}" \
                  f" --c {os.environ['EMBEDDING_MONGODB_COLLECTION_NAME']}" \
                  f" --cl {os.environ['EMBEDDING_MONGODB_COLUMN_NAME']}" \
                  f" --sy {year}" \
                  f" --ey {year + 1} " \
                  f"--mnt {os.environ['EMBEDDING_MONGODB_MAX_NUM_TOKENS']} " \
                  f"{model_param} {trees_param} {embeddings_param} {coherence_param}"

            # Run command.
            log.info(f"Running command: '{cmd}'.")
            os.system(cmd)
    log.info(f"Updating NLP fields total execution time = {((time.time() - tokens_initial_time) / 60)} minutes")

    log.debug(f"\n[END OF EXECUTION]")
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
Code Example #24
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB client and the source collection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]

    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}

    # Only resolved issues will be used.
    aggregation_match = {"$match": {
        '$and': [
            {
                '$or': [
                    {'status': {'$eq': 'RESOLVED'}},
                    {'status': {'$eq': 'VERIFIED'}},
                    {'status': {'$eq': 'CLOSED'}}
                ]
            },
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}
    aggregation_limit = {"$limit": 20}

    # data = col.find(query, fields).limit(20)
    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        aggregation_limit,
        aggregation_project
    ])

    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))

    print(eclipse_base.head(10))

    # 1. Convert the variable to the categorical type:
    eclipse_base.priority = eclipse_base.priority.astype('category')
    print(eclipse_base.head(10))

    # 2. Encode the categories:
    eclipse_base['priority_cod'] = eclipse_base['priority'].cat.codes

    print(eclipse_base.head(10))

    # 1- First create an instance:
    #    le = sklearn.preprocessing.LabelEncoder()
    # 2- Then fit it to your data:
    #    le.fit(labels)  (in this case, "labels = data[:,0]" was the column with the predictions).
    # 3- Get the column with the transformed values:
    #    labels = le.transform(labels)
    # 4- You can keep the original values to establish a mapping:
    #    class_names = le.classes_


    # df["creation_time_year"] = df["creation_time"].str[:4]
    # df["last_change_time_year"] = df["last_change_time"].str[:4]
    # df["resolution_string"] = df["resolution"].apply(lambda y: "EMPTY_FIELD" if len(y) == 0 else y)

    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
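The commented LabelEncoder steps above correspond to this scikit-learn usage (a sketch on the eclipse_base frame from this example; the priority_le column name is made up):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Equivalent to the .cat.codes encoding above.
eclipse_base['priority_le'] = le.fit_transform(eclipse_base['priority'])
class_names = le.classes_  # original labels, to map the codes back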
Code Example #25
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()

    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    year = input_params['year']
    col = db[f"{input_params['collection_name']}" f"_{year}_{year + 1}"]

    log.debug(
        f"Existing collections in '{db.name}': {str(db.list_collection_names())}"
    )
    if col.name in db.list_collection_names():
        db.drop_collection(col.name)

    # Get Gerrit Issues by month.
    max_year = year
    for month in range(input_params['start_month'],
                       input_params['end_month'] + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1

        after = str(datetime.datetime(input_params['year'], month, 1))
        before = str(datetime.datetime(max_year, max_month, 1))
        s = 0
        change_id, project, status, date, subject, author, committer, commit_msg = [], [], [], [], [], [], [], []
        file_list = []
        skip = ''
        flag = True
        log.info(f"Retrieving issues in range: {after} - {before}")
        while flag:
            issues_by_month = get_data_from_gerrit_rest_api(
                os.environ['GERRIT_API_URL'], after, before, skip=skip)
            log.info(f"Read issues: {len(issues_by_month)}")
            if len(issues_by_month) < 1:
                print('No issues for month {} and year {}'.format(month, year))
                flag = False
            else:
                for issue in issues_by_month:
                    change_id.append(issue['change_id'])
                    project.append(issue['project'])
                    status.append(issue['status'])
                    date.append(issue['updated'])
                    subject.append(issue['subject'])
                    current_rev = issue['current_revision']
                    author.append(issue['revisions'][current_rev]['commit']
                                  ['author']['email'])
                    committer.append(issue['revisions'][current_rev]['commit']
                                     ['committer']['email'])
                    commit_msg.append(
                        issue['revisions'][current_rev]['commit']['message'])
                    file_list.append(
                        list(issue['revisions'][current_rev]['files'].keys()))
                if len(issues_by_month) == 100:
                    last_dict = issues_by_month[99]
                    try:
                        if last_dict['_more_changes']:
                            log.info(f"There are more issues to read")
                            s = s + 100
                            skip = '&S=' + str(s)
                    except KeyError:
                        flag = False
                        print('Issues for month {} and year {}: {}'.format(
                            month, year, len(change_id)))
                else:
                    print('Issues for month {} and year {}: {}'.format(
                        month, year, len(change_id)))
                    flag = False
        dict_of_reg = {
            'change_id': change_id,
            'project': project,
            'status': status,
            'date': date,
            'subject': subject,
            'author': author,
            'committer': committer,
            'commit_msg': commit_msg,
            'file_list': file_list
        }
        df = pd.DataFrame(dict_of_reg)
        issues = df.to_dict("records")
        save_issues_to_mongodb(mongodb_collection=col, issues=issues)

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
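get_data_from_gerrit_rest_api is not shown on this page. A sketch of a plausible implementation against the Gerrit Changes endpoint; the query options are assumptions, and note that Gerrit prefixes its JSON responses with the anti-XSSI line )]}' which must be stripped:

import json

import requests


def get_data_from_gerrit_rest_api(api_url, after, before, skip=''):
    # Changes updated in the date range, 100 at a time (assumed endpoint shape).
    url = (f'{api_url}/changes/?q=after:"{after}" before:"{before}"'
           f'&o=CURRENT_REVISION&o=CURRENT_COMMIT&o=CURRENT_FILES&n=100{skip}')
    response = requests.get(url)
    response.raise_for_status()
    # Strip Gerrit's )]}' prefix before parsing.
    return json.loads(response.text.split('\n', 1)[1])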
Code Example #26
"""Perform hyperparameters fit."""

import os
import time

from syn.helpers.hyperparams import get_input_params
from syn.helpers.logging import set_logger
from syn.helpers.system import check_same_python_module_already_running
from syn.model.build.common.task import ConsensusFit

log = set_logger()

if __name__ == "__main__":
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    log.info(f"Fitting hyperparameters ...")

    # Load parameter space.
    input_param_space = get_input_params()
    assert input_param_space is not None, f"No param space provided."

    fitter = ConsensusFit(
        database_name='tasks',
        collection_name='experiments',
        corpus='openOffice',
        tasks_objectives={
            'duplicity': 'accuracy',
            'prioritization': 'jaccard_micro'
        },
Code Example #27
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Merging Gerrit and Bugzilla issues ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    # Load Gerrit data.
    log.info(f"Loading Gerrit issues ...")
    tic = time.time()
    gerrit_db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['gerrit_db_name'])]
    gerrit_collection = gerrit_db[input_params['gerrit_collection_name']]
    gerrit_data = gerrit_collection.find({}, {'_id': 0})
    df_gerrit = pd.DataFrame(list(gerrit_data))
    log.info(f"Loading Gerrit issues total time: {(time.time() - tic) / 60} minutes.")

    # Check empty Dataframe.
    if 0 == df_gerrit.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{gerrit_db.name}.{gerrit_collection.name}' collection.")

    # Load Bugzilla data.
    log.info(f"Loading Bugzilla issues ...")
    tic = time.time()
    bugzilla_db = mongodb_client[os.environ.get('BUGZILLA_MONGODB_DATABASE_NAME', input_params['bugzilla_db_name'])]
    bugzilla_collection = bugzilla_db[input_params['bugzilla_collection_name']]
    bugzilla_data = bugzilla_collection.find({}, {'_id': 0, 'bug_id': 1})
    df_bugzilla = pd.DataFrame(list(bugzilla_data))
    log.info(f"Loading Bugzilla issues total time: {(time.time() - tic) / 60} minutes.")

    # Check empty Dataframe.
    if 0 == df_bugzilla.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{bugzilla_db.name}.{bugzilla_collection.name}' collection.")

    # Join on column 'bug_id'.
    log.info(f"Joining Gerrit and Bugzilla Dataframes ...")
    tic = time.time()
    df_joined = df_gerrit.merge(df_bugzilla, left_on='bug_id', right_on='bug_id')
    log.info(f"Joining Gerrit and Bugzilla Dataframes total time: {(time.time() - tic) / 60} minutes.")

    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_joined.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_joined, num_batches)

    output_collection = gerrit_db[input_params['output_collection_name']]

    # Drop collection if already exists.
    if input_params['output_collection_name'] in gerrit_db.list_collection_names():
        log.info(f"Dropping collection {input_params['output_collection_name']} ...")
        gerrit_db.drop_collection(input_params['output_collection_name'])

    inserted_docs_number = 0
    tic = time.time()
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)

    log.info(f"Inserted documents: {inserted_docs_number}")
    log.info(f"Inserting documents total time: {(time.time() - tic) / 60} minutes.")

    final_time = time.time()
    log.info(f"Merging Gerrit and Bugzilla issues total execution time = {((final_time - initial_time) / 60)} minutes")
コード例 #28
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB connection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    clear_col = db["clear"]

    # Fields selected for the tasks addressed in the research project.
    fields = {
        "_id": 0,
    }

    # Only resolved issues are used ($in is equivalent to the original $or of
    # two $eq clauses).
    query = {'bug_status': {'$in': ['RESOLVED', 'VERIFIED']}}

    clear_data = clear_col.find(query, fields)
    # Append .limit(20) to the query above to sample a few documents while testing.

    # Expand the cursor and build the DataFrame.
    clear = pd.DataFrame(list(clear_data))

    clear_short_desc_nlp_col = db["clear_short_desc_nlp"]
    clear_short_desc_nlp_data = clear_short_desc_nlp_col.find({}, fields)
    clear_short_desc_nlp = pd.DataFrame(list(clear_short_desc_nlp_data))

    clear_description_nlp_col = db["clear_description_nlp_2"]
    clear_description_nlp_data = clear_description_nlp_col.find({}, fields)
    clear_description_nlp = pd.DataFrame(list(clear_description_nlp_data))

    # Merge the dataframes.
    clear_short_desc_description_nlp = pd.merge(clear_short_desc_nlp,
                                                clear_description_nlp,
                                                on='bug_id')
    clear_nlp = pd.merge(clear, clear_short_desc_description_nlp, on='bug_id')

    # Store the dataframe in MongoDB.
    db["clear_nlp"].insert_many(clear_nlp.to_dict('records'))

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
コード例 #29
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Finding similar issues ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))

    # Check empty Dataframe.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")

    similar_column_name = 'near_bugs' if input_params['near_issues'] else 'sim_bugs'
    # For each issue that has similar issues, generate one pair of similar
    # issues and one pair of non-similar issues (a sketch of generate_pairs
    # appears after this example).
    pairs = generate_pairs(df, similar_column_name)
    log.info(f"Pairs generated: {len(pairs)}")

    # Dataframe pairs.
    df_pairs = pd.DataFrame(pairs)

    # Split the dataframe into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)

    output_db = mongodb_client[input_params['output_db_name']]
    output_collection = output_db[input_params['output_similar_collection_name']] if not input_params['near_issues'] \
        else output_db[input_params['output_near_collection_name']]

    # Drop collection if already exists.
    if output_collection.name in output_db.list_collection_names():
        log.info(
            f"Dropping collection '{output_db.name}.{output_collection.name}' ...")
        output_db.drop_collection(output_collection.name)

    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(
            batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)

    log.info(f"Inserted documents: {inserted_docs_number}")

    final_time = time.time()
    log.info(
        f"Finding similar issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
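generate_pairs is defined elsewhere in the project. Based on the comment in the example above, a minimal sketch consistent with that behavior might look like the following; the output field names and the random negative sampling are assumptions, not taken from the original module.

import random

import pandas as pd


def generate_pairs(df: pd.DataFrame, similar_column_name: str) -> list:
    """Hypothetical sketch: emit one similar and one non-similar pair per
    issue that lists similar issues; the real implementation may differ."""
    pairs = []
    all_ids = df['bug_id'].tolist()
    for _, row in df.iterrows():
        similar_ids = row[similar_column_name]
        # Skip issues with no listed similar issues (assumes a list-valued field).
        if not isinstance(similar_ids, list) or not similar_ids:
            continue
        # Positive pair: the issue with one of its listed similar issues.
        pairs.append({'bug_id_1': row['bug_id'],
                      'bug_id_2': random.choice(similar_ids),
                      'similar': 1})
        # Negative pair: the issue with a random issue not listed as similar.
        candidate = random.choice(all_ids)
        while candidate == row['bug_id'] or candidate in similar_ids:
            candidate = random.choice(all_ids)
        pairs.append({'bug_id_1': row['bug_id'],
                      'bug_id_2': candidate,
                      'similar': 0})
    return pairs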
コード例 #30
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Initialize the MongoDB connection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_base"]

    # Fields selected for the tasks addressed in the research project.
    fields = {
        "_id": 0,
        # "assigned_to": 1,
        # "assigned_to_detail": 1,
        # "classification": 1,
        # "component": 1,
        # "creation_time": 1,
        # "creator": 1,
        # "creator_detail": 1,
        # "dupe_of": 1,
        "id": 1,
        # "op_sys": 1,
        # "platform": 1,
        # "priority": 1,
        # "product": 1,
        # "resolution": 1,
        # "severity": 1,
        # "status": 1,
        "summary": 1
        # "version": 1,
        # "description": 1
    }

    data = col.find({}, fields)
    # Append .limit(20) to the query above to sample a few documents while testing.

    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))

    # Quick look at the loaded data.
    print(clear_nlp)

    nltk.download("stopwords", quiet=True)

    clear_nlp["summary_split_alpha"] = clear_nlp["summary"].apply(
        lambda x: clean_doc_split(x))
    clear_nlp["summary_lower"] = clear_nlp["summary_split_alpha"].apply(
        lambda x: clean_doc_lower(x))
    clear_nlp["summary_punctuaction"] = clear_nlp["summary_lower"].apply(
        lambda x: clean_doc_punctuaction(x))
    clear_nlp["summary_trim"] = clear_nlp["summary_punctuaction"].apply(
        lambda x: clean_doc_trim(x))
    clear_nlp["summary_isalpha"] = clear_nlp["summary_trim"].apply(
        lambda x: clean_doc_isalpha(x))
    clear_nlp["summary_stop_words"] = clear_nlp["summary_isalpha"].apply(
        lambda x: clean_doc_stopW(x))
    clear_nlp["summary_diacritic"] = clear_nlp["summary_stop_words"].apply(
        lambda x: clean_doc_diacri(x))
    clear_nlp["summary_lemmatizer"] = clear_nlp["summary_diacritic"].apply(
        lambda x: clean_doc_lem(x))

    # Store the dataframe in MongoDB.
    db["eclipse_base_summary_clear"].insert_many(clear_nlp.to_dict('records'))

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")