def get_folder_structure(root_path="./", config_fname="config.json"):
    """Gets and checks the paths of the execution folder structure.

    The folder names are a convention. Please don't modify them.

    Args:
        root_path (str, optional): Root path of the folder structure. Defaults to "./".
        config_fname (str, optional): Config filename. Defaults to "config.json".

    Returns:
        tuple: (dir_data, dir_output, dir_tmp, config_path); config_path is
            None if the config file does not exist.
    """
    if not isinstance(root_path, str):
        raise TypeError("root_path must be str")
    if not isinstance(config_fname, str):
        raise TypeError("config_fname must be str")

    dir_data = os.path.normpath(os.path.join(root_path, "data/"))
    dir_output = os.path.normpath(os.path.join(root_path, "output/"))
    dir_tmp = os.path.normpath(os.path.join(root_path, "tmp/"))
    config_path = os.path.normpath(os.path.join(root_path, "config/", config_fname))

    if not os.path.isdir(dir_data):
        raise Exception("path of input data does not exist: {}".format(dir_data))
    if not os.path.isdir(dir_output):
        raise Exception("path of output data does not exist: {}".format(dir_output))
    if not os.path.isdir(dir_tmp):
        raise Exception("path of tmp data does not exist: {}".format(dir_tmp))
    if not os.path.isfile(config_path):
        logger.info("there is no config file")
        config_path = None

    return dir_data, dir_output, dir_tmp, config_path
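
# A minimal usage sketch, assuming the conventional layout below already
# exists under a hypothetical root "my_project":
#
#     my_project/
#     ├── data/      input files
#     ├── output/    generated results
#     ├── tmp/       checkpoint files
#     └── config/
#         └── config.json   optional run configuration
#
dir_data, dir_output, dir_tmp, config_path = get_folder_structure(
    root_path="my_project", config_fname="config.json")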
def read_config(config_path):
    """Reads the config json file.

    Args:
        config_path (str or None): Path to the config json file.

    Raises:
        Exception: If the config file does not exist.
        Exception: If the config file is corrupted.

    Returns:
        dict: Parsed config.
    """
    if config_path:
        try:
            with open(config_path, "r") as f:
                config = json.load(f)
        except FileNotFoundError:
            logger.error("config file does not exist")
            raise Exception(
                "config file {} does not exist".format(config_path))
        except json.JSONDecodeError:
            logger.error(
                "issue with config file: file is not in a valid json format. "
                "Check that strings are enclosed in double quotes and that "
                "booleans are in the right format")
            raise Exception("config file {} has issues".format(config_path))
        except Exception:
            logger.error("issue with config file format")
            raise Exception(
                "could not load config file {}".format(config_path))
    else:
        config = {}
    if not config:
        logger.info("The config file is empty")
    return config
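
# A hedged example; the keys below are illustrative, not the project's
# actual schema. With a config/config.json such as
#
#     {"n_top_words": 10, "search_params": {"n_components": [5, 10]}}
#
# the call returns its contents as a dict:
config = read_config(config_path="my_project/config/config.json")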
def get_list_footers():
    """Method to read the footers file. Please don't modify it.

    Returns:
        (list): List of possible footers.
    """
    try:
        with open(LIST_OF_FOOTERS_PATH, "r", encoding="utf-8") as f:
            return [x for x in f.read().split("\n") if x]
    except OSError:
        logger.info("No list of footers found, will not remove footers from body of emails")
        return []
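
# The footers file is plain text with one footer per line; blank lines are
# dropped. A hypothetical example of its contents:
#
#     Sent from my iPhone
#     This message and any attachments are confidential.
#
list_footers = get_list_footers()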
def train_lda(df, search_params, dir_model):
    '''Method to train an LDA model.

    Parameters:
        -df (DataFrame): The base dataframe.
        -search_params (Dict): Dictionary with LDA params.
        -dir_model (String): Model folder path.
    '''
    logger.debug("Number of rows in df: " + str(len(df)))

    # Remove duplicates
    df = df.drop_duplicates(subset="body_lemma", keep="first")
    logger.debug("Number of rows without duplicates in df: " + str(len(df)))

    logger.info("Getting best lda model...")
    best_lda_model, count_vectorizer, count_data = _best_model(
        df, search_params)

    # Save count vectorizer
    logger.debug("Saving count vectorizer...")
    save_pickle(count_vectorizer, "count_vectorizer", dir_model)

    # Save lda model
    logger.info("Saving lda model...")
    save_pickle(best_lda_model, "lda_model", dir_model)
    logger.info("Finished training LDA model")
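
# A hedged example of a search_params grid for train_lda; n_components and
# learning_decay are genuine sklearn LatentDirichletAllocation
# hyperparameters, while the values and the "model/" folder are illustrative:
search_params = {
    "n_components": [5, 10, 15, 20],
    "learning_decay": [0.5, 0.7, 0.9],
}
# train_lda(df_base, search_params, dir_model="model/")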
def predict(n_top_words, dir_model, dir_output):
    '''Method to extract the topics from a trained LDA model.

    Parameters:
        -n_top_words (Int): Number of words per topic.
        -dir_model (String): Model folder path.
        -dir_output (String): Output folder path.
    '''
    if not os.path.isdir(dir_model):
        logger.error("The model folder doesn't exist.")
        sys.exit(1)

    lda_model_path = os.path.join(dir_model, "lda_model.p")
    count_vectorizer_path = os.path.join(dir_model, "count_vectorizer.p")
    if not os.path.isfile(lda_model_path):
        logger.error("The lda model doesn't exist.")
        sys.exit(1)
    if not os.path.isfile(count_vectorizer_path):
        logger.error("The count vectorizer model doesn't exist.")
        sys.exit(1)

    logger.info("Loading lda model...")
    with open(lda_model_path, "rb") as f:
        lda_model = pickle.load(f)
    with open(count_vectorizer_path, "rb") as f:
        count_vectorizer = pickle.load(f)

    logger.info("Calculating topics...")
    df_topics = _get_topics(lda_model, count_vectorizer, n_top_words)

    logger.info("Saving topics...")
    save_as_excel(df_topics, "topics", dir_output)
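
# _get_topics is not shown in this excerpt. A minimal sketch of what such a
# helper might look like, assuming each row of lda_model.components_ holds
# the per-topic word weights (get_feature_names_out requires sklearn >= 1.0):
import pandas as pd

def _get_topics_sketch(lda_model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names_out()
    rows = {}
    for topic_idx, weights in enumerate(lda_model.components_):
        top = weights.argsort()[::-1][:n_top_words]  # indices of heaviest words
        rows["topic_{}".format(topic_idx)] = [words[i] for i in top]
    return pd.DataFrame(rows)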
def clean_data(dir_tmp='', path_data=''):
    '''Method to extract and clean the relevant information from all eml files.

    Parameters:
        -dir_tmp (String): The temporary path.
        -path_data (String): The data path.

    Return:
        -df_cleaned (DataFrame): Clean dataframe.
    '''
    logger.info("Loading list of footers...")
    list_footers = get_list_footers()

    # Create the base dataframe
    logger.info(
        "Extracting and cleaning the information from the email bodies...")
    df_cleaned = checkpoint(func=_clean_dataset,
                            func_args=(path_data, list_footers),
                            func_kwargs={},
                            suffix=get_hex_hash_params(dir_tmp),
                            save_checkpoint=True,
                            tmp_path=dir_tmp.encode(),
                            suffix_description='df_base')
    return df_cleaned
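
# checkpoint is a project helper not shown in this excerpt. A minimal sketch
# of a pickle-based variant, assuming results are cached under tmp_path keyed
# by the suffix (the signature is simplified relative to the calls above):
import os
import pickle

def checkpoint_sketch(func, func_args, suffix, tmp_path, suffix_description):
    cache = os.path.join(tmp_path, "{}_{}.p".format(suffix_description, suffix))
    if os.path.isfile(cache):
        with open(cache, "rb") as f:
            return pickle.load(f)   # reuse the cached result
    result = func(*func_args)       # compute once, then cache
    with open(cache, "wb") as f:
        pickle.dump(result, f)
    return result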
def preprocess_data(df_cleaned, dir_tmp='', path_data=''):
    '''Method to lemmatize the bodies of the cleaned emails.

    Parameters:
        -df_cleaned (DataFrame): The clean dataframe.
        -dir_tmp (String): The temporary path.
        -path_data (String): The data path.

    Return:
        -df_base (DataFrame): Complete dataframe.
    '''
    time1 = time.time()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time1))
    logger.debug("Preprocessing data started at " + current_time)

    logger.info("Loading nlp dictionary...")
    nlp_model = checkpoint(func=get_nlp_model_dict,
                           func_args=(),
                           func_kwargs={},
                           suffix=get_hex_hash_params(dir_tmp),
                           save_checkpoint=True,
                           tmp_path=dir_tmp.encode(),
                           suffix_description='nlp_model')

    # Body lemmatization
    logger.info("Lemmatizing bodies of emails...")
    df_base = checkpoint(func=lemmatize_bodies,
                         func_args=(df_cleaned, nlp_model),
                         func_kwargs={},
                         suffix=get_hex_hash_params(dir_tmp),
                         save_checkpoint=True,
                         tmp_path=dir_tmp.encode(),
                         suffix_description='df_base_lemmatized')

    time2 = time.time()
    duration_time = time.strftime("%Hh %Mm %Ss", time.gmtime(time2 - time1))
    logger.debug("Preprocessing data took " + duration_time)
    return df_base
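
# lemmatize_bodies is not shown in this excerpt. A minimal sketch, assuming
# nlp_model is a word -> lemma lookup dict and the dataframe has a "body"
# column (both assumptions); the "body_lemma" column it produces is the one
# consumed by _best_model:
def lemmatize_bodies_sketch(df, nlp_model):
    def lemmatize(text):
        return " ".join(nlp_model.get(tok, tok) for tok in text.lower().split())
    df = df.copy()
    df["body_lemma"] = df["body"].apply(lemmatize)
    return df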
def get_readme_file(user_name, repo_name):
    # Get a list of URLs at which the readme content *might* exist
    url_list = [
        get_readme_url(user_name, repo_name, "README.md"),
        get_readme_url(user_name, repo_name, "README.markdown"),
        get_readme_url(user_name, repo_name, "readme.md"),
    ]
    content = get_content_first_url(url_list, get_url_content)

    with open(readmes_csv_path, "a", newline="") as f:
        csv_writer = csv.writer(f)
        if content is not False:
            csv_writer.writerow([user_name, repo_name, content])
            message = "SUCCESS | user: " + user_name + " | repo: " + repo_name
            logger.info(message)
            return True
        else:
            message = "ERROR | user: " + user_name + " | repo: " + repo_name
            logger.error(message)
            csv_writer.writerow([user_name, repo_name, "NULL"])
            return False
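
# get_readme_url is not shown in this excerpt. A plausible sketch, assuming
# the readme is fetched raw from a public GitHub repo and that the default
# branch is named "master" (an assumption):
def get_readme_url_sketch(user_name, repo_name, fname):
    return "https://raw.githubusercontent.com/{}/{}/master/{}".format(
        user_name, repo_name, fname)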
def _best_model(df, search_params):
    '''Method to get the parameters of the best LDA model.

    Parameters:
        -df (DataFrame): The base dataframe.
        -search_params (Dict): Dictionary with LDA params.

    Return:
        -best_lda_model (LDA): The best lda model.
        -count_vectorizer (CountVectorizer): The count vectorizer model.
        -count_data (sparse matrix): The document-term counts.
    '''
    # Init model
    lda = LDA()

    # Init GridSearchCV
    model = GridSearchCV(lda, param_grid=search_params, n_jobs=8)

    logger.debug("Initialize the count vectorizer...")
    # Initialize the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')

    # Fit and transform the processed email bodies
    count_data = count_vectorizer.fit_transform(df['body_lemma'])

    logger.info("Training model...")
    model.fit(count_data)

    # Best model
    best_lda_model = model.best_estimator_

    # Model parameters
    logger.debug("Best model's params: {}".format(model.best_params_))

    # Log likelihood score
    logger.debug("Best log likelihood score: {}".format(model.best_score_))

    # Perplexity
    logger.debug("Model perplexity: {}".format(
        best_lda_model.perplexity(count_data)))

    return best_lda_model, count_vectorizer, count_data
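
# GridSearchCV ranks candidates with the estimator's default score(), which
# for sklearn's LatentDirichletAllocation is an approximate log-likelihood on
# the held-out folds. A tiny self-contained run; the data and grid are
# illustrative only:
import pandas as pd

_df_demo = pd.DataFrame({"body_lemma": [
    "invoice payment reminder", "meeting agenda minutes",
    "invoice overdue payment", "project status meeting",
    "payment receipt attached", "weekly status report",
]})
_best, _vec, _counts = _best_model(_df_demo, {"n_components": [2, 3]})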
""" import os import time # MODULES from global_vars import ROOT_PATH, CONFIG_NAME from config import get_schema, read_config, validate_config from get_logger import config_logger, logger from utils import get_folder_structure from preprocess_data import clean_data, preprocess_data from lda_model import train_lda, predict config_logger(ROOT_PATH) time_start = time.time() current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time_start)) logger.info("Execution started at " + current_time) # Get paths dir_data, dir_output, dir_tmp, config_path = get_folder_structure(root_path=ROOT_PATH, \ config_fname=CONFIG_NAME) logger.info("Validation config file..") # Load config schema = get_schema() config = read_config(config_path=config_path) config = validate_config(config=config, schema=schema) # Clean data df_cleaned = clean_data(dir_tmp=dir_tmp, path_data=dir_data) df_base = preprocess_data(df_cleaned, dir_tmp=dir_tmp, path_data=dir_data)