def preprocess_doc(doc):
    # Run a single document through the full cleaning pipeline:
    # tokenize, strip punctuation and numbers, lowercase,
    # remove common stopwords, then apply a final cleanup pass.
    doc = preprocessing.tokenize(doc)
    doc = preprocessing.remove_punctuation(doc)
    doc = preprocessing.remove_numbers(doc)
    doc = preprocessing.lower(doc)
    doc = preprocessing.remove_common_stopwords(doc)
    doc = preprocessing.clean_doc(doc)
    return doc
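The helper functions above come from a project-local preprocessing module that is not shown here. A minimal, self-contained sketch of what this pipeline plausibly does, using only the standard library (the stopword set and regex are illustrative assumptions, not the project's actual implementations):

import re

STOPWORDS = {'the', 'a', 'an', 'and', 'of', 'in', 'to', 'is'}  # illustrative subset

def preprocess_doc_minimal(doc):
    tokens = doc.split()                                # tokenize
    tokens = [re.sub(r'[^\w]', '', t) for t in tokens]  # remove punctuation
    tokens = [t for t in tokens if not t.isdigit()]     # remove numbers
    tokens = [t.lower() for t in tokens]                # lowercase
    tokens = [t for t in tokens if t not in STOPWORDS]  # remove common stopwords
    return [t for t in tokens if t]                     # drop empty tokens

preprocess_doc_minimal('The 3 quick Foxes, jumped!')  # -> ['quick', 'foxes', 'jumped']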
Example #2
def preprocess_corpus(documents):
    # Apply the same pipeline as preprocess_doc to every document,
    # then drop any document that ends up empty.
    documents = list(map(preprocessing.tokenize, documents))
    documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    documents = [preprocessing.remove_common_stopwords(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]
    return documents
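Reusing the preprocess_doc_minimal sketch above, the corpus-level variant amounts to mapping the pipeline over the collection and dropping documents that end up empty:

docs = ['The 3 quick Foxes!', '', '2024']
processed = [preprocess_doc_minimal(d) for d in docs]
processed = [d for d in processed if d]  # drop now-empty documents
# -> [['quick', 'foxes']]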
Example #3
def preprocess_doc(row, context=True):
    # Config-driven variant: every step is toggled through the
    # module-level lda_params dict. Note: the context argument
    # is not used anywhere in the body.
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    # n-grams are collected before stopword removal so phrases stay intact;
    # they are filtered against the cleaned tokens and appended further down.
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(
            citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(
            citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence
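This snippet reads its configuration from a module-level lda_params dict that is not shown. A hypothetical sketch of its shape, inferred from the keys the code accesses (all values below are illustrative):

lda_params = {
    'markers': True,             # strip citation markers
    'tokenize': True,
    'pos_tags': ('NN', 'JJ'),    # () disables POS filtering
    'punctuation': True,
    'numbers': True,
    'bigrams': True,
    'trigrams': False,
    'common_stopwords': True,
    'custom_stopwords': False,
    'lemmatize': True,
    'num_topics': 20,            # used by build_model below
    'model_dir': 'models/lda/',  # trailing slash required: paths are concatenated
}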
Example #4
def build_model(documents):
    # Corpus-wide version of the configurable pipeline above, followed by
    # dictionary/corpus construction, LDA training, and persistence to disk.
    if lda_params['markers']:
        documents = map(preprocessing.remove_markers, documents)
    if lda_params['tokenize']:
        documents = map(preprocessing.tokenize, documents)
    documents = list(documents)
    if lda_params['pos_tags'] != ():
        tags = [
            preprocessing.lower(
                preprocessing.filter_pos_tags(doc,
                                              tags=lda_params['pos_tags']))
            for doc in documents
        ]
    if lda_params['punctuation']:
        documents = [
            preprocessing.remove_punctuation(doc) for doc in documents
        ]
    if lda_params['numbers']:
        documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [preprocessing.get_bigrams(doc) for doc in documents]
    if lda_params['trigrams']:
        trigrams = [preprocessing.get_trigrams(doc) for doc in documents]
    if lda_params['common_stopwords']:
        documents = [
            preprocessing.remove_common_stopwords(doc) for doc in documents
        ]
    if lda_params['custom_stopwords']:
        documents = [
            preprocessing.remove_custom_stopwords(doc) for doc in documents
        ]
    if lda_params['pos_tags'] != ():
        documents = [
            preprocessing.filter_pos(documents[i], tags[i])
            for i in range(0, len(documents))
        ]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [
            preprocessing.filter_n_grams(bigrams[i], documents[i])
            for i in range(0, len(documents))
        ]
    if lda_params['trigrams']:
        trigrams = [
            preprocessing.filter_n_grams(trigrams[i], documents[i])
            for i in range(0, len(documents))
        ]
    if lda_params['bigrams'] and not lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] for i in range(0, len(documents))
        ]
    if lda_params['trigrams'] and not lda_params['bigrams']:
        documents = [
            documents[i] + trigrams[i] for i in range(0, len(documents))
        ]
    if lda_params['bigrams'] and lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] + trigrams[i]
            for i in range(0, len(documents))
        ]
    if lda_params['lemmatize']:
        documents = [preprocessing.lemmatize(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]

    dictionary = generate_dictionary(documents)
    corpus = generate_corpus(documents, dictionary)
    lda_model = generate_lda_model(corpus, dictionary,
                                   lda_params['num_topics'])

    # Persist all artifacts. Paths are built by plain string concatenation,
    # so lda_params['model_dir'] must end with a path separator.
    if not os.path.exists(lda_params['model_dir']):
        os.makedirs(lda_params['model_dir'])
    dictionary.save(lda_params['model_dir'] + 'lda.dict')
    gensim.corpora.MmCorpus.serialize(lda_params['model_dir'] + 'lda.mm',
                                      corpus)
    lda_model.save(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'wb') as docs_file:
        pickle.dump(documents, docs_file, pickle.HIGHEST_PROTOCOL)
    with open(lda_params['model_dir'] + 'lda_params.config',
              'w') as config_file:
        config_file.write(str(lda_params))
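Everything build_model persists can be loaded back with standard gensim and pickle calls. A sketch, assuming generate_lda_model wraps gensim's LdaModel (the model_dir value is illustrative and must match lda_params['model_dir']):

import pickle
import gensim

model_dir = 'models/lda/'
dictionary = gensim.corpora.Dictionary.load(model_dir + 'lda.dict')
corpus = gensim.corpora.MmCorpus(model_dir + 'lda.mm')
lda_model = gensim.models.LdaModel.load(model_dir + 'lda.model')
with open(model_dir + 'lda.docs', 'rb') as docs_file:
    documents = pickle.load(docs_file)
print(lda_model.print_topics(num_topics=5))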
Example #5
def preprocessing(history, current, dollar_df, long, save):
    #End-to-end feature preparation: align the historical and current loan
    #tables, clean datatypes, encode categoricals, then one-hot encode.
    #Relies on the project-local helper module `pr` and a module-level
    #output directory `path`.
    history = pr.underscore(history, 1)
    current = pr.lower(current, 1)
    #Add new column to separate train and test
    history['train_flag'] = 1
    current['train_flag'] = 0
    #Manually map the historical column names to current column names
    history = history.rename(index=str,
                             columns={
                                 'zipcode': 'addrzip',
                                 'loanamnt': 'loanamount',
                                 'fundedamnt': 'fundedamount',
                                 'verificationstatus': 'isincv',
                                 'verificationstatusjoint': 'isincvjoint',
                                 'numacctsever120pd': 'numacctsever120ppd'
                             })
    #Extract the common column names from history and current
    common_columns = list(set(history.columns) & set(current.columns))
    common_columns.extend(['loanstatus'])
    #Only keep data with common columns
    history = history[common_columns]
    current['loanstatus'] = np.nan
    current = current[common_columns]
    #Convert some datatypes in history
    history.intrate = pr.rstring_to_num(history.intrate, '%', 'float')
    history.revolutil = pr.rstring_to_num(history.revolutil, '%', 'float')
    history.earliestcrline = pr.str_dt_num(history.earliestcrline, '%b-%y', 1)
    history.emplength = pr.emplength_num(history.emplength, 1)
    history.term = history.term.str[:3].astype('int')
    #Convert some datatypes in current
    current.earliestcrline = pr.str_dt_num(current.earliestcrline, '%Y-%m-%d',
                                           0)
    current.emplength = pr.emplength_num(current.emplength, 0)
    #Keep only 3-year (36-month) loans; for history, keep only Fully Paid or Charged Off loans
    history = history[(history.term == 36)
                      & ((history.loanstatus == 'Fully Paid')
                         | (history.loanstatus == 'Charged Off'))]
    history['loanstatus'] = history.loanstatus.map({
        'Fully Paid': 0,
        'Charged Off': 1
    })
    current = current[(current.term == 36)]
    #Combine history and current; the remaining conversions apply to both
    history = history.reset_index().drop(columns='id')
    current = current.reset_index().drop(columns='index')
    total = pd.concat([history, current], axis=0)
    #Lowercase the values in lower_col; strip '_' from the values in underscore_col
    lower_col = [
        'applicationtype', 'disbursementmethod', 'initialliststatus', 'isincv',
        'isincvjoint'
    ]
    underscore_col = ['isincv', 'isincvjoint']
    total[lower_col] = pr.lower(total[lower_col], 0)
    total[underscore_col] = pr.underscore(total[underscore_col], 0)
    #Convert description into numeric: with description:1; no description:0
    total.desc = pr.desc_num(total.desc)
    #Convert the first three digits of the zipcode to numeric
    total.addrzip = pr.rstring_to_num(total.addrzip, 'x', 'int')
    #If not time-stringent, lemmatize emptitle
    if long == 1:
        total.emptitle = pr.Lemmatizer(total.emptitle)
    #Unify some spellings
    total.emptitle = total.emptitle.str.replace('tecnician', 'technician')
    total.emptitle = total.emptitle.str.replace('registered ', '')
    #frequency encoding 'emptitle', 'addrzip', 'addrstate'
    total = pr.frequency_encoding(total, 'emptitle')
    total = pr.frequency_encoding(total, 'addrzip')
    total = pr.frequency_encoding(total, 'addrstate')
    #ordinal encoding 'grade', 'subgrade'
    total = pr.ordinal_encoding(total, 'grade', 'subgrade')
    #Feature engineering: absolute income remaining after debt payments
    total['remain_income_abs'] = total['annualinc'] * (100 -
                                                       total['dti']) / 100
    total = pd.merge(total, dollar_df, how='left', on='addrstate')
    #Apply the dollar-value adjustment to 'annualinc' and 'remain_income_abs'
    total['annualinc'] = total['annualinc'] * total['dollar_value']
    total['remain_income_abs'] = total['remain_income_abs'] * total[
        'dollar_value']
    #sort the column names
    total = total.reindex(sorted(total.columns), axis=1)
    #If an output is needed
    if save == 1:
        total.to_csv(os.path.join(path, 'total_bf_one_hot_encoding.csv'))
    #separate features into three types: numerical, categorical and all_null
    all_null_feature_h, num_feature_h, ob_feature_h = pr.feature_separation(
        total, total.columns)
    #Remove 'addrstate', 'emptitle', and 'secappearliestcrline' from categorical feature list
    ob_feature_h.remove('addrstate')
    ob_feature_h.remove('emptitle')
    ob_feature_h.remove('secappearliestcrline')
    #sort the column names
    total = total.reindex(sorted(total.columns), axis=1)
    #If the column counts add up, one-hot encode the remaining categorical features
    if (len(ob_feature_h) + len(num_feature_h) + 4) == len(total.columns):
        total = pd.concat(
            [total[num_feature_h],
             pd.get_dummies(total[ob_feature_h])],
            axis=1)
        if save == 1:
            total.to_csv(os.path.join(path, 'total_one_hot_encoded.csv'))
    return total
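A hypothetical call sketch: the function expects raw historical and current loan tables plus a per-state dollar-value table (the merge implies dollar_df carries addrstate and dollar_value columns; the file names are illustrative). The train_flag column added at the top makes it easy to split the result back apart:

import pandas as pd

history = pd.read_csv('history_loans.csv')
current = pd.read_csv('current_loans.csv')
dollar_df = pd.read_csv('dollar_value_by_state.csv')  # columns: addrstate, dollar_value

total = preprocessing(history, current, dollar_df, long=1, save=0)
train = total[total.train_flag == 1]  # historical, labeled rows
test = total[total.train_flag == 0]   # current, unlabeled rows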