Example #1
def get_folder_structure(root_path="./", config_fname="config.json"):
    """Gets and checks the paths the execution folder strucutre. The names of the folder
        are a conventiong. Please don't modify them.
    Args:
        root_path (str, optional): root path of the folder stcuture. Defaults to ./
        config_fname (str, optional): config filename. Defaults to config.json
    Returns:
        (str): dir_data, dir_output, dir_tmp, config_path
    """
    if not isinstance(root_path, str):
        raise TypeError("root_path must be str")
    if not isinstance(config_fname, str):
        raise TypeError("config_fname must be str")
    dir_data = os.path.join(root_path, "data/")
    dir_output = os.path.join(root_path, "output/")
    dir_tmp = os.path.join(root_path, "tmp/")
    config_path = os.path.join(root_path, "config/", config_fname)
    dir_data = os.path.normpath(dir_data)
    dir_output = os.path.normpath(dir_output)
    dir_tmp = os.path.normpath(dir_tmp)
    config_path = os.path.normpath(config_path)
    if not os.path.isdir(dir_data):
        raise Exception("path of input data does not exist: {}".format(dir_data))
    if not os.path.isdir(dir_output):
        raise Exception("path of output data does not exist: {}".format(dir_output))
    if not os.path.isdir(dir_tmp):
        raise Exception("path of tmp data does not exist: {}".format(dir_tmp))
    if not os.path.isfile(config_path):
        logger.info("there is no config file")
        config_path = None
    return dir_data, dir_output, dir_tmp, config_path
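A rough usage sketch (the "./project" root is illustrative, not from the source; the real entry point passes ROOT_PATH, see Example #10):

# Expected layout, inferred from the path joins above:
#   <root>/data/   <root>/output/   <root>/tmp/   <root>/config/<config_fname>
dir_data, dir_output, dir_tmp, config_path = get_folder_structure(
    root_path="./project", config_fname="config.json")
if config_path is None:
    logger.info("running without a config file")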
Example #2
def read_config(config_path):
    """Reads the config json file.
    Args:
        config_path (str or None): Path to the config json file.
    Raises:
        Exception: If the config file does not exist.
        Exception: If the config file is corrupted.
    Returns:
        dict: Config file.
    """
    if config_path:
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)
        except FileNotFoundError:
            logger.error("config file does not exist")
            raise Exception(
                "config file {} does not exist".format(config_path))
        except JSONDecodeError:
            logger.error(
                "issue with config file: file is not valid JSON. Check that "
                "strings are enclosed in double quotes and that booleans are "
                "lowercase")
            raise Exception("config file {} has issues".format(config_path))
        except Exception:
            logger.error("issue with config file format")
            raise Exception(
                "could not load config file {}".format(config_path))
    else:
        config = {}
    if not config:
        logger.info("The config file is empty")
    return config
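A brief usage sketch; the "n_top_words" key is a hypothetical example, since the actual schema comes from get_schema() (used in Example #10) and is not shown here:

# Hypothetical key; the real config schema is defined elsewhere.
config = read_config(config_path="config/config.json")
n_top_words = config.get("n_top_words", 10)
# A falsy path (no config file found) yields an empty dict instead of raising.
assert read_config(None) == {}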
Example #3
def get_list_footers():
    """Method to reed the footers file. Please don't modify them.
    Args:
    
    Returns:
        (list): list with possible footers
    """
    try:
        list_of_footers_path = LIST_OF_FOOTERS_PATH
        with open(list_of_footers_path, "r", encoding="utf-8") as f:
            return [x for x in f.read().split("\n") if x]
    except Exception:
        logger.info("No list of footers found, will not remove footers from body of emails")
        return []
Example #4
def train_lda(df, search_params, dir_model):
    '''Method to train an LDA model.
        Parameters:
                -df (DataFrame): The base dataframe.
                -search_params (Dict): Dictionary with LDA params
                -dir_model (String): model folder path
        '''
    logger.debug("Number of rows in df: " + str(len(df)))
    # Remove duplicates
    df = df.drop_duplicates(subset="body_lemma", keep="first")
    logger.debug("Number of rows without duplicates in df: " + str(len(df)))

    logger.info("Getting best lda model...")
    best_lda_model, count_vectorizer, count_data = _best_model(
        df, search_params)

    # Save count vectorizer
    logger.debug("Saving count vectorizer...")
    save_pickle(count_vectorizer, "count_vectorizer", dir_model)

    # Save lda model
    logger.info("Saving lda model...")
    save_pickle(best_lda_model, "lda_model", dir_model)

    logger.info("Finished training LDA model")
Example #5
def predict(n_top_words, dir_model, dir_output):
    '''Method to predict topics with a trained LDA model.
        Parameters:
                -n_top_words (Int): number of words per topic
                -dir_model (String): model folder path
                -dir_output (String): output folder path
        '''
    if not os.path.isdir(dir_model):
        logger.error("The model folder doesn't exist.")
        exit(1)

    if not os.path.isfile(dir_model + '/lda_model.p'):
        logger.error("The lda model doesn't exist.")
        exit(1)

    if not os.path.isfile(dir_model + '/count_vectorizer.p'):
        logger.error("The count vectorizer model doesn't exist.")
        exit(1)

    logger.info("Loading lda model...")
    with open(dir_model + '/lda_model.p', 'rb') as f:
        lda_model = pickle.load(f)
    with open(dir_model + '/count_vectorizer.p', 'rb') as f:
        count_vectorizer = pickle.load(f)

    logger.info("Calculating topics...")
    df_topics = _get_topics(lda_model, count_vectorizer, n_top_words)

    logger.info("Saving topics...")
    save_as_excel(df_topics, "topics", dir_output)
Example #6
def clean_data(dir_tmp='', path_data=''):
    '''Method to extract all relevant information from all eml files.
    Parameters:
        -dir_tmp (String): The temporary folder path.
        -path_data (String): The data path.
    Return:
        -df_cleaned (Dataframe): The cleaned dataframe.
    '''
    logger.info("Loading list of footers..")
    list_footers = get_list_footers()

    # Create the base dataframe
    logger.info(
        "Extracting and cleaning the information from the email bodies..")
    df_cleaned = checkpoint(func=_clean_dataset,
                            func_args=(path_data, list_footers),
                            func_kwargs={},
                            suffix=get_hex_hash_params(dir_tmp),
                            save_checkpoint=True,
                            tmp_path=dir_tmp.encode(),
                            suffix_description='df_base')
    return df_cleaned
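checkpoint and get_hex_hash_params are not part of this listing. Below is a minimal sketch of the disk-caching behavior the call sites suggest; the file naming and every other internal detail are assumptions, not the actual implementation:

import os
import pickle

def checkpoint(func, func_args, func_kwargs, suffix, save_checkpoint,
               tmp_path, suffix_description):
    # Assumed behavior: return a cached pickled result when one exists,
    # otherwise compute it and optionally persist it for the next run.
    # tmp_path arrives as bytes (callers pass dir_tmp.encode()).
    fname = "{}_{}.p".format(suffix_description, suffix)
    path = os.path.join(tmp_path.decode(), fname)
    if os.path.isfile(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    result = func(*func_args, **func_kwargs)
    if save_checkpoint:
        with open(path, "wb") as f:
            pickle.dump(result, f)
    return result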
Example #7
def preprocess_data(df_cleaned, dir_tmp='', path_data=''):
    '''Method to preprocess the cleaned dataframe (body lemmatization).
    Parameters:
        -df_cleaned (Dataframe): The cleaned dataframe.
        -dir_tmp (String): The temporary folder path.
        -path_data (String): The data path.
    Return:
        -df_base (Dataframe): The lemmatized dataframe.
    '''
    time1 = time.time()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time1))
    logger.debug("Execution preprocessing data started at " + current_time)

    logger.info("Loading nlp dictionary..")
    nlp_model = checkpoint(func=get_nlp_model_dict,
                           func_args=(),
                           func_kwargs={},
                           suffix=get_hex_hash_params(dir_tmp),
                           save_checkpoint=True,
                           tmp_path=dir_tmp.encode(),
                           suffix_description='nlp_model')

    # Body lemmatization
    logger.info("Lemmatizing bodies of emails..")
    df_base = checkpoint(func=lemmatize_bodies,
                         func_args=(df_cleaned, nlp_model),
                         func_kwargs={},
                         suffix=get_hex_hash_params(dir_tmp),
                         save_checkpoint=True,
                         tmp_path=dir_tmp.encode(),
                         suffix_description='df_base_lemmatized')

    time2 = time.time()
    duration_time = time.strftime("%Hh %Mm %Ss", time.gmtime(time2 - time1))
    logger.debug("Preprocessing data ended at " + duration_time)

    return df_base
Example #8
def get_readme_file(user_name, repo_name):
    # Get a list of URLs at which the readme content *might* exist
    url_list = [
        get_readme_url(user_name, repo_name, "README.md"),
        get_readme_url(user_name, repo_name, "README.markdown"),
        get_readme_url(user_name, repo_name, "readme.md"),
    ]

    content = get_content_first_url(url_list, get_url_content)

    # Append the result to the CSV ('a' text mode with newline='' for Python 3)
    with open(readmes_csv_path, 'a', newline='') as f:
        csvWriter = csv.writer(f)

        if content is not False:
            csvWriter.writerow([user_name, repo_name, content])
            message = "SUCCESS | user: " + user_name + " | repo: " + repo_name
            logger.info(message)
            return True
        else:
            message = "ERROR | user: " + user_name + " | repo: " + repo_name
            logger.error(message)
            csvWriter.writerow([user_name, repo_name, "NULL"])
            return False
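get_readme_url and get_content_first_url are not included in the listing. A plausible sketch of the URL builder, assuming raw GitHub content on the master branch (both the host and the branch name are assumptions):

def get_readme_url(user_name, repo_name, filename):
    # Assumption: readmes are fetched raw from GitHub; "master" is
    # hard-coded here purely for illustration.
    return "https://raw.githubusercontent.com/{}/{}/master/{}".format(
        user_name, repo_name, filename)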
Example #9
def _best_model(df, search_params):
    '''Method to find the parameters of the best LDA model via grid search.
        Parameters:
                -df (DataFrame): The base dataframe.
                -search_params (Dict): Dictionary with LDA params
        Return:
                -best_lda_model (LDA): The best lda model
                -count_vectorizer (CountVectorizer): The count vectorizer model
                -count_data (sparse matrix): The vectorized corpus
        '''
    # Init model
    lda = LDA()
    # Init GridSearchCV
    model = GridSearchCV(lda, param_grid=search_params, n_jobs=8)

    logger.debug("Initialize the count vectorizer...")
    # Initialize the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the lemmatized email bodies
    count_data = count_vectorizer.fit_transform(df['body_lemma'])

    logger.info("Training model...")
    model.fit(count_data)

    # Best model
    best_lda_model = model.best_estimator_
    # Model parameters
    logger.debug("Best model's params: {}".format(str(model.best_params_)))
    # Log likelihood score
    logger.debug("Best log likelihood score: {}".format(str(
        model.best_score_)))
    # Perplexity
    logger.debug("Model perplexity: {}".format(
        str(best_lda_model.perplexity(count_data))))

    return best_lda_model, count_vectorizer, count_data
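For reference, a sketch of a search_params grid: n_components and learning_decay are real scikit-learn LatentDirichletAllocation hyperparameters, but the values below are illustrative, not taken from the source:

# Illustrative grid, not the project's actual configuration.
search_params = {
    "n_components": [5, 10, 15, 20],
    "learning_decay": [0.5, 0.7, 0.9],
}
best_lda_model, count_vectorizer, count_data = _best_model(df, search_params)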
Example #10
import os
import time

# MODULES
from global_vars import ROOT_PATH, CONFIG_NAME
from config import get_schema, read_config, validate_config
from get_logger import config_logger, logger
from utils import get_folder_structure
from preprocess_data import clean_data, preprocess_data
from lda_model import train_lda, predict

config_logger(ROOT_PATH)
time_start = time.time()
current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time_start))
logger.info("Execution started at " + current_time)

# Get paths
dir_data, dir_output, dir_tmp, config_path = get_folder_structure(root_path=ROOT_PATH, \
                                                                  config_fname=CONFIG_NAME)

logger.info("Validation config file..")
# Load config
schema = get_schema()
config = read_config(config_path=config_path)
config = validate_config(config=config, schema=schema)

# Clean data
df_cleaned = clean_data(dir_tmp=dir_tmp, path_data=dir_data)
df_base = preprocess_data(df_cleaned, dir_tmp=dir_tmp, path_data=dir_data)