Example no. 1
from glob import glob
import logging

import pandas as pd

logger = logging.getLogger(__name__)


def _clean_dataset(path_data='', list_footers=[]):
    '''Method to extract the base information from the emails.
    Parameters:
        -path_data (string): The subfolders of the mailbox.
        -list_footers (list): The list of possible footers to strip.
    Return:
        -df_base (DataFrame): The complete base dataframe.
    '''
    # Load the first CSV file found under path_data
    data_file_path = glob(path_data + "**/*.csv", recursive=True)[0]
    df_enron = pd.read_csv(data_file_path,
                           sep=",",
                           on_bad_lines='skip',  # replaces error_bad_lines, removed in pandas 2.0
                           index_col=0)
    # Keep only the columns we need; copy to avoid SettingWithCopyWarning
    df_base = df_enron[['From', 'To', 'content']].copy()

    logger.debug("Cleaning the dataset")
    # Clean NA values
    df_base = df_base.fillna('')
    # Clean From and To
    df_base['from'] = df_base['From'].apply(remove_tags_sender)
    df_base = df_base.drop('From', axis=1)
    df_base['to'] = df_base['To'].apply(remove_tags_sender)
    df_base = df_base.drop('To', axis=1)
    # Rename 'content' to 'body'
    df_base['body'] = df_base['content']
    df_base = df_base.drop('content', axis=1)

    logger.debug("Getting the clean last conversation")
    df_base['body_latest'] = df_base['body'].apply(
        lambda body: _get_last_conversation(body, list_footers))

    return df_base
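
A hedged usage sketch (the path and footer string are hypothetical; remove_tags_sender and _get_last_conversation must be importable from the same module):

df = _clean_dataset(path_data="data/enron/", list_footers=["Sent from my iPhone"])
print(df.columns.tolist())  # ['from', 'to', 'body', 'body_latest']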
Example no. 2
def lemmatize_bodies(df_base, nlp_model):
    '''Method to lemmatize the bodies of the emails.
    Parameters:
        -df_base (DataFrame): The base dataframe.
        -nlp_model (dict): Mapping from language name to a loaded NLP pipeline.
    Return:
        -df_base (DataFrame): The base dataframe with the bodies lemmatized.
    '''
    # Load model
    nlp = nlp_model['english']
    logger.debug("Creating body lemma.. ")
    # Lemmatize body
    df_base['body_lemma'] = df_base.apply(lambda x: _lemmatize(
        x["body_latest"],
        "english",
        nlp,
    ),
                                          axis=1)
    return df_base
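
nlp_model is expected to map a language name to a loaded NLP pipeline; a minimal sketch assuming a spaCy model (the model name is an assumption, and _lemmatize must be importable):

import spacy

nlp_model = {"english": spacy.load("en_core_web_sm")}  # assumed spaCy pipeline
df_base = lemmatize_bodies(df_base, nlp_model)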
Example no. 3
import re


def remove_footers(body, list_footers):
    """Method to remove the footers from the email body.
    Args:
        body (String): the string body of the email.
        list_footers (List): the list of possible footers. Please don't modify
            these entries.
    Returns:
        (str): the email body without footers.
    """
    if list_footers:
        for foot in list_footers:
            try:
                match = re.search(re.escape(foot), body, re.IGNORECASE)
                if match and match[0] != "":
                    start, _ = match.span()
                    body = body[:start]
            except Exception as exception:
                logger.debug("Something occurred during footer removal")
                logger.debug(exception)
    return body
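
A quick self-contained check of the behaviour (the footer string is hypothetical):

body = "Hi team,\nreport attached.\n--\nSent from my iPhone"
print(remove_footers(body, ["sent from my iphone"]))
# -> 'Hi team,\nreport attached.\n--\n' (matching is case-insensitive; the body is cut at the footer)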
Example no. 4
def train_lda(df, search_params, dir_model):
    '''Method to train an LDA model.
    Parameters:
        -df (DataFrame): The base dataframe.
        -search_params (Dict): Dictionary with the LDA parameter grid.
        -dir_model (String): The model folder path.
    '''
    logger.debug("Number of rows in df: " + str(len(df)))
    # Remove duplicates
    df = df.drop_duplicates(subset="body_lemma", keep="first")
    logger.debug("Number of rows without duplicates in df: " + str(len(df)))

    logger.info("Getting best lda model...")
    best_lda_model, count_vectorizer, count_data = _best_model(
        df, search_params)

    # Save count vectorizer
    logger.debug("Saving count vectorizer...")
    save_pickle(count_vectorizer, "count_vectorizer", dir_model)

    # Save lda model
    logger.info("Saving lda model...")
    save_pickle(best_lda_model, "lda_model", dir_model)

    logger.info("Finished training LDA model")
Example no. 5
def preprocess_data(df_cleaned, dir_tmp='', path_data=''):
    '''Method to preprocess the cleaned emails (lemmatization with checkpointing).
    Parameters:
        -df_cleaned (DataFrame): The cleaned base dataframe.
        -dir_tmp (String): The temporary checkpoint path.
        -path_data (String): The data path.
    Return:
        -df_base (DataFrame): The preprocessed dataframe.
    '''
    time1 = time.time()
    current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time1))
    logger.debug("Execution preprocessing data started at " + current_time)

    logger.info("Loading nlp dictionary..")
    nlp_model = checkpoint(func=get_nlp_model_dict,
                           func_args=(),
                           func_kwargs={},
                           suffix=get_hex_hash_params(dir_tmp),
                           save_checkpoint=True,
                           tmp_path=dir_tmp.encode(),
                           suffix_description='nlp_model')

    # Body lemmatization
    logger.info("Lemmatizing bodies of emails..")
    df_base = checkpoint(func=lemmatize_bodies,
                         func_args=(df_cleaned, nlp_model),
                         func_kwargs={},
                         suffix=get_hex_hash_params(dir_tmp),
                         save_checkpoint=True,
                         tmp_path=dir_tmp.encode(),
                         suffix_description='df_base_lemmatized')

    time2 = time.time()
    duration_time = time.strftime("%Hh %Mm %Ss", time.gmtime(time2 - time1))
    logger.debug("Preprocessing data ended at " + duration_time)

    return df_base
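
checkpoint caches a function's result on disk so that re-runs can skip expensive steps; a minimal sketch consistent with the call sites above (the signature is inferred and the pickle format is an assumption):

import os
import pickle


def checkpoint(func, func_args=(), func_kwargs=None, suffix='',
               save_checkpoint=True, tmp_path=b'', suffix_description=''):
    """Run func once, caching its pickled result under tmp_path."""
    func_kwargs = func_kwargs or {}
    # The call sites pass tmp_path as bytes (dir_tmp.encode())
    path = os.path.join(tmp_path.decode() or '.',
                        suffix_description + '_' + suffix + '.pkl')
    if os.path.exists(path):
        with open(path, 'rb') as fh:
            return pickle.load(fh)
    result = func(*func_args, **func_kwargs)
    if save_checkpoint:
        with open(path, 'wb') as fh:
            pickle.dump(result, fh)
    return result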
Example no. 6
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV


def _best_model(df, search_params):
    '''Method to find the best LDA model via grid search.
    Parameters:
        -df (DataFrame): The base dataframe.
        -search_params (Dict): Dictionary with the LDA parameter grid.
    Return:
        -best_lda_model (LDA): The best LDA model.
        -count_vectorizer (CountVectorizer): The fitted count vectorizer.
        -count_data (sparse matrix): The document-term count matrix.
    '''
    # Init model
    lda = LDA()
    # Init GridSearchCV
    model = GridSearchCV(lda, param_grid=search_params, n_jobs=8)

    logger.debug("Initialize the count vectorizer...")
    # Initialize the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the lemmatized bodies
    count_data = count_vectorizer.fit_transform(df['body_lemma'])

    logger.info("Training model...")
    model.fit(count_data)

    # Best model
    best_lda_model = model.best_estimator_
    # Model parameters
    logger.debug("Best model's params: {}".format(str(model.best_params_)))
    # Log likelihood score
    logger.debug("Best log likelihood score: {}".format(str(
        model.best_score_)))
    # Perplexity
    logger.debug("Model perplexity: {}".format(
        str(best_lda_model.perplexity(count_data))))

    return best_lda_model, count_vectorizer, count_data
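
Usage sketch (the grid is illustrative). GridSearchCV scores each candidate with LDA's built-in approximate log-likelihood, so no labels are required:

search_params = {"n_components": [5, 10, 15], "learning_decay": [0.9]}
best_lda, vectorizer, counts = _best_model(df_base, search_params)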
Example no. 7
df_cleaned = clean_data(dir_tmp=dir_tmp, path_data=dir_data)
df_base = preprocess_data(df_cleaned, dir_tmp=dir_tmp, path_data=dir_data)

# Create trained model folder
dir_model = os.path.join(dir_output, "model_trained")
logger.debug("Model dir: %s", dir_model)
logger.debug("Output dir: %s", dir_output)
logger.debug("n_top_words: %s", config['n_top_words'])

# If it's a training run
if config['train']:

    if not os.path.isdir(dir_model):
        os.makedirs(dir_model)

    logger.debug("Calculating best LDA model..")
    search_params = {
        "n_components": list(range(5, 16)),
        "learning_decay": [.9]
    }
    logger.debug("Search parameters used for GridSearch: " +
                 str(search_params))
    # Train
    train_lda(df_base, search_params, dir_model)

# If it's a prediction run
if config['predict']:
    # Predict
    predict(config['n_top_words'], dir_model, dir_output)

time_end = time.time()
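
predict is not shown in these examples; for context, a hedged sketch of how the top words per topic are usually read off a fitted sklearn LDA model (the function name and output format are assumptions):

def print_top_words(lda_model, count_vectorizer, n_top_words):
    """Print the n_top_words highest-weight terms of each topic."""
    words = count_vectorizer.get_feature_names_out()  # sklearn >= 1.0
    for topic_idx, topic in enumerate(lda_model.components_):
        top = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic {}: {}".format(topic_idx, " ".join(top)))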