import logging
import os
import re
import time
from glob import glob

import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

# Repo-internal helpers (remove_tags_sender, _get_last_conversation,
# _lemmatize, get_nlp_model_dict, get_hex_hash_params, checkpoint,
# save_pickle, clean_data, predict) are defined elsewhere in the repo.

logger = logging.getLogger(__name__)


def _clean_dataset(path_data='', list_footers=[]):
    '''Method to extract the base information from the emails.

    Parameters:
        -path_data (String): The subfolders of the mailbox.
        -list_footers (List): The list of possible footers.

    Return:
        -df_base (DataFrame): The complete base dataframe.
    '''
    # Load the first CSV file found under path_data
    data_file_path = glob(path_data + "**/*.csv", recursive=True)[0]
    df_enron = pd.read_csv(data_file_path,
                           sep=",",
                           on_bad_lines='skip',  # replaces the deprecated error_bad_lines=False
                           index_col=0)
    # Keep only the columns we need
    df_base = df_enron[['From', 'To', 'content']]
    logger.debug("Cleaning the dataset")
    # Clean NA values
    df_base = df_base.fillna('')
    # Clean From and To
    df_base['from'] = df_base['From'].apply(remove_tags_sender)
    df_base = df_base.drop('From', axis=1)
    df_base['to'] = df_base['To'].apply(remove_tags_sender)
    df_base = df_base.drop('To', axis=1)
    # Get body
    df_base['body'] = df_base['content']
    df_base = df_base.drop('content', axis=1)
    logger.debug("Getting the clean last conversation")
    df_base['body_latest'] = df_base.apply(
        lambda x: _get_last_conversation(x["body"], list_footers), axis=1)
    return df_base
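
# `remove_tags_sender` is one of those repo helpers and is not shown in this
# section. A hypothetical sketch of the assumed behaviour (pulling a bare
# address out of a From/To field) could look like the stand-in below; the
# name `_remove_tags_sender_sketch` is illustrative, not the repo's.
def _remove_tags_sender_sketch(field):
    """Hypothetical stand-in: extract the address from a field such as
    '"John Doe" <john.doe@enron.com>', else just normalise the string."""
    match = re.search(r'<([^>]+)>', field)
    if match:
        return match.group(1).strip().lower()
    return field.strip().lower()
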
def lemmatize_bodies(df_base, nlp_model):
    '''Method to lemmatize the bodies of the emails.

    Parameters:
        -df_base (DataFrame): The base dataframe.
        -nlp_model (Dict): Dictionary of loaded NLP models, keyed by language.

    Return:
        -df_base (DataFrame): The base dataframe with the bodies lemmatized.
    '''
    # Load model
    nlp = nlp_model['english']
    logger.debug("Creating body lemmas...")
    # Lemmatize body
    df_base['body_lemma'] = df_base.apply(
        lambda x: _lemmatize(x["body_latest"], "english", nlp), axis=1)
    return df_base
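
# `_lemmatize` is likewise defined elsewhere. Assuming `nlp` is a spaCy
# pipeline (the per-language dict returned by get_nlp_model_dict suggests
# one model per language), a minimal hypothetical sketch could be:
def _lemmatize_sketch(text, lang, nlp):
    """Hypothetical stand-in: keep alphabetic, non-stopword tokens and join
    their lemmas into one whitespace-separated string."""
    doc = nlp(text)
    return " ".join(tok.lemma_ for tok in doc
                    if tok.is_alpha and not tok.is_stop)
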
def remove_footers(body, list_footers):
    """Method to remove the footers from the email body.

    Args:
        body (String): the string body of the email.
        list_footers (List): the list of possible footers.

    Returns:
        (str): the email body without footers.
    """
    if list_footers:
        for foot in list_footers:
            try:
                match = re.search(re.escape(foot), body, re.IGNORECASE)
                if match and match[0] != "":
                    # Truncate the body at the start of the matched footer
                    (start, _) = match.span()
                    body = body[:start]
            except Exception as exception:
                logger.debug("Something occurred during footer removal")
                logger.debug(exception)
    return body
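
# A quick illustration of remove_footers on hypothetical sample data: the
# body is truncated at the start of the first case-insensitive footer match.
_example_body = ("Hi team, see the attached report.\n\n"
                 "This email and any attachments are confidential.")
_example_footers = ["this email and any attachments are confidential"]
assert remove_footers(_example_body, _example_footers) == \
    "Hi team, see the attached report.\n\n"
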
def train_lda(df, search_params, dir_model):
    '''Method to train an LDA model.

    Parameters:
        -df (DataFrame): The base dataframe.
        -search_params (Dict): Dictionary with LDA params.
        -dir_model (String): Model folder path.
    '''
    logger.debug("Number of rows in df: " + str(len(df)))
    # Remove duplicates
    df = df.drop_duplicates(subset="body_lemma", keep="first")
    logger.debug("Number of rows without duplicates in df: " + str(len(df)))
    logger.info("Getting best lda model...")
    best_lda_model, count_vectorizer, count_data = _best_model(
        df, search_params)
    # Save count vectorizer
    logger.debug("Saving count vectorizer...")
    save_pickle(count_vectorizer, "count_vectorizer", dir_model)
    # Save lda model
    logger.info("Saving lda model...")
    save_pickle(best_lda_model, "lda_model", dir_model)
    logger.info("Finished training LDA model")
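
# `save_pickle` comes from the repo's utilities and is not shown here; a
# minimal sketch of the assumed contract (object, base name, target folder):
import pickle

def _save_pickle_sketch(obj, name, dir_model):
    """Hypothetical stand-in: persist `obj` as <dir_model>/<name>.pkl."""
    with open(os.path.join(dir_model, name + ".pkl"), "wb") as handle:
        pickle.dump(obj, handle)
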
def preprocess_data(df_cleaned, dir_tmp='', path_data=''):
    '''Method to extract all valuable information from the emails.

    Parameters:
        -df_cleaned (DataFrame): The cleaned base dataframe.
        -dir_tmp (String): The temporary path.
        -path_data (String): The data path.

    Return:
        -df_base (DataFrame): The complete dataframe.
    '''
    time1 = time.time()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time1))
    logger.debug("Execution preprocessing data started at " + current_time)
    logger.info("Loading nlp dictionary...")
    nlp_model = checkpoint(func=get_nlp_model_dict,
                           func_args=(),
                           func_kwargs={},
                           suffix=get_hex_hash_params(dir_tmp),
                           save_checkpoint=True,
                           tmp_path=dir_tmp.encode(),
                           suffix_description='nlp_model')
    # Body lemmatization
    logger.info("Lemmatizing bodies of emails...")
    df_base = checkpoint(func=lemmatize_bodies,
                         func_args=(df_cleaned, nlp_model),
                         func_kwargs={},
                         suffix=get_hex_hash_params(dir_tmp),
                         save_checkpoint=True,
                         tmp_path=dir_tmp.encode(),
                         suffix_description='df_base_lemmatized')
    time2 = time.time()
    duration_time = time.strftime("%Hh %Mm %Ss", time.gmtime(time2 - time1))
    logger.debug("Preprocessing data took " + duration_time)
    return df_base
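
# `checkpoint` is also a repo utility; the calls above suggest it caches a
# function's result under tmp_path keyed by suffix. A hypothetical sketch of
# that contract (the names and file layout are assumptions):
def _checkpoint_sketch(func, func_args=(), func_kwargs=None, suffix='',
                       save_checkpoint=True, tmp_path=b'',
                       suffix_description=''):
    """Hypothetical stand-in: reuse a pickled result when one exists for this
    suffix, otherwise call `func` and optionally persist the result."""
    func_kwargs = func_kwargs or {}
    path = os.path.join(tmp_path.decode(),
                        "{}_{}.pkl".format(suffix_description, suffix))
    if os.path.isfile(path):
        with open(path, "rb") as handle:
            return pickle.load(handle)
    result = func(*func_args, **func_kwargs)
    if save_checkpoint:
        with open(path, "wb") as handle:
            pickle.dump(result, handle)
    return result
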
def _best_model(df, search_params):
    '''Method to find the parameters of the best LDA model.

    Parameters:
        -df (DataFrame): The base dataframe.
        -search_params (Dict): Dictionary with LDA params.

    Return:
        -best_lda_model (LDA): The best LDA model.
        -count_vectorizer (CountVectorizer): The fitted count vectorizer.
        -count_data (sparse matrix): The document-term matrix.
    '''
    # Init model
    lda = LDA()
    # Init GridSearchCV
    model = GridSearchCV(lda, param_grid=search_params, n_jobs=8)
    logger.debug("Initialize the count vectorizer...")
    # Initialize the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed bodies
    count_data = count_vectorizer.fit_transform(df['body_lemma'])
    logger.info("Training model...")
    model.fit(count_data)
    # Best model
    best_lda_model = model.best_estimator_
    # Model parameters
    logger.debug("Best model's params: {}".format(str(model.best_params_)))
    # Log likelihood score
    logger.debug("Best log likelihood score: {}".format(
        str(model.best_score_)))
    # Perplexity
    logger.debug("Model perplexity: {}".format(
        str(best_lda_model.perplexity(count_data))))
    return best_lda_model, count_vectorizer, count_data
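
# The trained artefacts can be inspected with standard scikit-learn
# attributes (`components_`, `get_feature_names_out`; older scikit-learn
# versions used `get_feature_names`). The repo's `predict` is not shown in
# this section, so the second function is a hypothetical reading of what it
# might do with n_top_words, dir_model and dir_output.
def show_topics(lda_model, vectorizer, n_top_words=10):
    """List the highest-weighted words for each LDA topic."""
    words = vectorizer.get_feature_names_out()
    topics = []
    for idx, topic in enumerate(lda_model.components_):
        top = [words[i] for i in topic.argsort()[::-1][:n_top_words]]
        topics.append("Topic {}: {}".format(idx, ", ".join(top)))
    return topics


def _predict_sketch(n_top_words, dir_model, dir_output):
    """Hypothetical stand-in for the repo's `predict`: reload the pickled
    artefacts and write the top words per topic to the output folder."""
    with open(os.path.join(dir_model, "lda_model.pkl"), "rb") as handle:
        lda_model = pickle.load(handle)
    with open(os.path.join(dir_model, "count_vectorizer.pkl"), "rb") as handle:
        vectorizer = pickle.load(handle)
    with open(os.path.join(dir_output, "topics.txt"), "w") as handle:
        handle.write("\n".join(
            show_topics(lda_model, vectorizer, n_top_words)))
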
df_cleaned = clean_data(dir_tmp=dir_tmp, path_data=dir_data)
df_base = preprocess_data(df_cleaned, dir_tmp=dir_tmp, path_data=dir_data)

# Create trained model folder
dir_model = os.path.join(dir_output, "model_trained")
logger.debug("Model folder: " + dir_model)
logger.debug("Output folder: " + dir_output)
logger.debug("Number of top words: " + str(config['n_top_words']))

# If it is a training run
if config['train']:
    if not os.path.isdir(dir_model):
        os.makedirs(dir_model)
    logger.debug("Calculating best LDA model...")
    search_params = {
        "n_components": list(range(5, 16)),
        "learning_decay": [.9]
    }
    logger.debug("Search parameters used for GridSearch: " +
                 str(search_params))
    # Train
    train_lda(df_base, search_params, dir_model)

# If it is a prediction run
if config['predict']:
    # Predict
    predict(config['n_top_words'], dir_model, dir_output)

time_end = time.time()