コード例 #1
0
def main():
    """Build the Java NLP command line, run it and log timing and exit status.

    The command is assembled from ``JAVA_HOME``, the classpath returned by
    ``get_java_classpath()`` and the ``mongo_params`` / ``filter_params`` /
    ``nlp_params`` entries of ``get_input_params()``. It is executed through
    the system shell with ``os.system``.

    Raises:
        KeyError: if ``JAVA_HOME`` is not set in the environment or one of the
            expected parameter groups is missing from the input parameters.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Logger.
    log = set_logger()
    log.debug("\n[START OF EXECUTION]")

    load_environment_variables()

    # Check if there is a running process that contains the name of this module.
    # NOTE(review): os.path.split() returns a (head, tail) tuple, not a string;
    # confirm that the helper expects the tuple rather than just the file name.
    check_same_python_module_already_running(os.path.split(__file__))

    # Input parameters, grouped by the component they configure.
    input_params = get_input_params()
    mongo_params = input_params['mongo_params']
    filter_params = input_params['filter_params']
    nlp_params = input_params['nlp_params']

    # Defines java executable (the JVM launcher, not the javac compiler).
    java_exe = Path(os.environ['JAVA_HOME']) / 'bin' / 'java.exe'

    # Command.
    # NOTE(review): values are interpolated without quoting, so any value that
    # contains spaces (e.g. a JAVA_HOME under "Program Files") will be split by
    # the shell. Consider subprocess.run() with an argument list.
    cmd = f"{java_exe} -cp {get_java_classpath()} {nlp_params.java_class_name} " \
          f"-host {mongo_params.host} " \
          f"-port {mongo_params.port} " \
          f"-dbName {mongo_params.db_name} " \
          f"-collName {mongo_params.collection_name} " \
          f"-startYear {filter_params.start_year} " \
          f"-endYear {filter_params.end_year} " \
          f"-textColumnName {filter_params.column_name} " \
          f"-maxNumTokens {nlp_params.max_num_tokens} " \
          f"-parserModel {nlp_params.parser_model} " \
          f"-createTrees {nlp_params.get_trees} " \
          f"-calcEmbeddings {nlp_params.get_embeddings} " \
          f"-calcCoherence {nlp_params.get_coherence}"

    log.info(f"Running command: '{cmd}'")

    # Run command. The status is checked so that a failed Java run is visible
    # in the log instead of being silently reported as a normal execution.
    exit_status = os.system(cmd)
    if exit_status != 0:
        log.error(f"Command finished with non-zero exit status: {exit_status}")

    log.info("\n[END OF EXECUTION]")
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
コード例 #2
0
import os
import random
import time

import numpy as np
import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm

from syn.helpers.environment import load_environment_variables
from syn.helpers.logging import set_logger
from syn.helpers.mongodb import get_default_mongo_client
from syn.helpers.system import check_same_python_module_already_running

# Module-level setup, executed once when this module is imported or run:
# call the project's environment loader first, then create the module-level
# logger `log`.
load_environment_variables()
log = set_logger()


def get_input_params():
    parser = argparse.ArgumentParser(
        description='Generate pairs for similar issues.')

    parser.add_argument('--db_name',
                        default='gerrit',
                        type=str,
                        help='Gerrit database name.')
    parser.add_argument('--collection_name',
                        default='eclipse_similarities',
                        type=str,
                        help='Gerrit similarities collection name.')
    parser.add_argument('--output_db_name',
コード例 #3
0
def main():
    """Run ``UpdateVectorizedMongoDBCollection.py`` for every database and year.

    For each database (the ``corpus`` input parameter, or the comma-separated
    ``EMBEDDING_MONGODB_DATABASE_NAME`` environment variable when ``corpus`` is
    empty) and each one-year window in ``[start_year, end_year)``, a command
    line is built and executed through the system shell with ``os.system``.
    Commands that finish with a non-zero status are logged and counted, and
    the remaining windows are still processed.

    Raises:
        KeyError: if a required environment variable or input parameter is
            missing.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()

    # Logger.
    log = set_logger()

    log.debug("\n[START OF EXECUTION]")

    load_environment_variables()

    # Check if there is a running process that contains the name of this module.
    # NOTE(review): os.path.split() returns a (head, tail) tuple, not a string;
    # confirm that the helper expects the tuple rather than just the file name.
    check_same_python_module_already_running(os.path.split(__file__))

    # Input parameters.
    input_params = get_input_params()

    # Databases.
    databases = [input_params['corpus']] if input_params['corpus'] != '' \
        else os.environ["EMBEDDING_MONGODB_DATABASE_NAME"].split(",")

    # Java class.
    java_class_name = "UpdateMongoDBNLPFields"

    # Control params. Coherence and embeddings are only requested when trees
    # are requested as well; the corenlp parser model is selected only when
    # coherence is effectively enabled.
    coherence_enabled = input_params['get-coherence'] and input_params['get-trees']
    model_param = "--pm corenlp" if coherence_enabled else "--pm srparser"
    trees_param = "--get-trees" if input_params['get-trees'] else "--no-get-trees"
    embeddings_param = "--get-embeddings " if (input_params['get-embeddings'] and input_params['get-trees']) \
        else "--no-get-embeddings"
    coherence_param = "--get-coherence" if coherence_enabled else "--no-get-coherence"

    # Defines Python executable.
    python_exe = os.environ.get('PYTHON_EXECUTABLE', sys.executable)

    # Loop over every database and one-year window, updating the NLP fields.
    update_initial_time = time.time()
    failed_commands = 0
    log.info("Updating NLP fields ...")
    for db in databases:
        log.info(f"\nProcessing database: '{db}'.")
        for year in range(input_params['start_year'], input_params['end_year']):
            log.info(f"\n[FOR LOOP] Processing years: {year} - {year + 1}")
            # NOTE(review): values are interpolated without quoting, so a value
            # containing spaces (e.g. the interpreter path) will be split by
            # the shell. Consider subprocess.run() with an argument list.
            cmd = f"{python_exe} UpdateVectorizedMongoDBCollection.py --jcn {java_class_name}" \
                  f" --mh {os.environ['MONGO_HOST_IP']}" \
                  f" --mp {os.environ['MONGO_PORT']}" \
                  f" --db {db}" \
                  f" --c {os.environ['EMBEDDING_MONGODB_COLLECTION_NAME']}" \
                  f" --cl {os.environ['EMBEDDING_MONGODB_COLUMN_NAME']}" \
                  f" --sy {year}" \
                  f" --ey {year + 1} " \
                  f"--mnt {os.environ['EMBEDDING_MONGODB_MAX_NUM_TOKENS']} " \
                  f"{model_param} {trees_param} {embeddings_param} {coherence_param}"

            # Run command. The status is checked so that a failed window is
            # visible in the log; processing continues with the next window.
            log.info(f"Running command: '{cmd}'.")
            exit_status = os.system(cmd)
            if exit_status != 0:
                failed_commands += 1
                log.error(f"Command finished with non-zero exit status {exit_status} "
                          f"(database '{db}', years {year} - {year + 1}).")
    log.info(f"Updating NLP fields total execution time = {((time.time() - update_initial_time) / 60)} minutes")
    if failed_commands:
        log.error(f"{failed_commands} command(s) finished with a non-zero exit status.")

    log.debug("\n[END OF EXECUTION]")
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")