def preprocess_doc(doc):
    doc = preprocessing.tokenize(doc)
    doc = preprocessing.remove_punctuation(doc)
    doc = preprocessing.remove_numbers(doc)
    doc = preprocessing.lower(doc)
    doc = preprocessing.remove_common_stopwords(doc)
    doc = preprocessing.clean_doc(doc)
    return doc
def preprocess_corpus(documents):
    documents = list(map(preprocessing.tokenize, documents))
    documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    documents = [preprocessing.remove_common_stopwords(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]
    return documents
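# Hedged usage sketch (illustrative only, not part of the original module):
# assumes the `preprocessing` helpers used above are importable and that the
# input documents are raw strings; the sample sentences are made up.
def _demo_preprocess_corpus():
    raw_docs = [
        "Topic models need clean, tokenized input.",
        "Numbers such as 42 and punctuation marks are stripped out.",
    ]
    cleaned = preprocess_corpus(raw_docs)  # list of token lists, empties dropped
    single = preprocess_doc("A single document can be cleaned on its own.")
    return cleaned, single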
def preprocess_doc(row, context=True):
    """Preprocess one citation sentence (row['context']) according to the
    options set in lda_params."""
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(
            citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(
            citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    # Keep only the n-grams whose member tokens survived preprocessing,
    # then append them to the token list.
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence
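# Hedged usage sketch (illustrative only): preprocess_doc expects a row-like
# mapping with a 'context' field, so it can be applied per row of a pandas
# DataFrame of citation sentences. The DataFrame below is a made-up example.
def _demo_preprocess_rows():
    import pandas as pd
    df = pd.DataFrame({'context': [
        'Deep learning [1] improves dependency parsing.',
        'We follow the experimental setup of Smith et al. (2010).',
    ]})
    return df.apply(preprocess_doc, axis=1)  # Series of preprocessed sentences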
def build_model(documents):
    """Preprocess the corpus according to lda_params, train an LDA model, and
    save the dictionary, corpus, model, documents, and parameters to
    lda_params['model_dir']."""
    if lda_params['markers']:
        documents = map(preprocessing.remove_markers, documents)
    if lda_params['tokenize']:
        documents = map(preprocessing.tokenize, documents)
    documents = list(documents)
    if lda_params['pos_tags'] != ():
        tags = [
            preprocessing.lower(
                preprocessing.filter_pos_tags(doc, tags=lda_params['pos_tags']))
            for doc in documents
        ]
    if lda_params['punctuation']:
        documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    if lda_params['numbers']:
        documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [preprocessing.get_bigrams(doc) for doc in documents]
    if lda_params['trigrams']:
        trigrams = [preprocessing.get_trigrams(doc) for doc in documents]
    if lda_params['common_stopwords']:
        documents = [
            preprocessing.remove_common_stopwords(doc) for doc in documents
        ]
    if lda_params['custom_stopwords']:
        documents = [
            preprocessing.remove_custom_stopwords(doc) for doc in documents
        ]
    if lda_params['pos_tags'] != ():
        documents = [
            preprocessing.filter_pos(documents[i], tags[i])
            for i in range(len(documents))
        ]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    # Keep only the n-grams whose member tokens survived preprocessing,
    # then append them to each document's token list.
    if lda_params['bigrams']:
        bigrams = [
            preprocessing.filter_n_grams(bigrams[i], documents[i])
            for i in range(len(documents))
        ]
    if lda_params['trigrams']:
        trigrams = [
            preprocessing.filter_n_grams(trigrams[i], documents[i])
            for i in range(len(documents))
        ]
    if lda_params['bigrams'] and not lda_params['trigrams']:
        documents = [documents[i] + bigrams[i] for i in range(len(documents))]
    if lda_params['trigrams'] and not lda_params['bigrams']:
        documents = [documents[i] + trigrams[i] for i in range(len(documents))]
    if lda_params['bigrams'] and lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] + trigrams[i]
            for i in range(len(documents))
        ]
    if lda_params['lemmatize']:
        documents = [preprocessing.lemmatize(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    # Drop documents that ended up empty after preprocessing
    documents = [doc for doc in documents if doc]

    dictionary = generate_dictionary(documents)
    corpus = generate_corpus(documents, dictionary)
    lda_model = generate_lda_model(corpus, dictionary, lda_params['num_topics'])

    if not os.path.exists(lda_params['model_dir']):
        os.makedirs(lda_params['model_dir'])
    dictionary.save(lda_params['model_dir'] + 'lda.dict')
    gensim.corpora.MmCorpus.serialize(lda_params['model_dir'] + 'lda.mm', corpus)
    lda_model.save(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'wb') as docs_file:
        pickle.dump(documents, docs_file, pickle.HIGHEST_PROTOCOL)
    with open(lda_params['model_dir'] + 'lda_params.config', 'w') as config_file:
        config_file.write(str(lda_params))
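# Hedged sketch (not part of the original code): the artifacts written by
# build_model can be reloaded with the standard gensim loaders, assuming
# lda_params['model_dir'] is unchanged and the saved model is a gensim
# LdaModel (or a compatible subclass).
def _load_saved_model():
    dictionary = gensim.corpora.Dictionary.load(lda_params['model_dir'] + 'lda.dict')
    corpus = gensim.corpora.MmCorpus(lda_params['model_dir'] + 'lda.mm')
    lda_model = gensim.models.LdaModel.load(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'rb') as docs_file:
        documents = pickle.load(docs_file)
    return dictionary, corpus, lda_model, documents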
def preprocessing(history, current, dollar_df, long, save):
    """Clean and align the historical and current loan data, engineer
    features, and return one combined, encoded DataFrame."""
    history = pr.underscore(history, 1)
    current = pr.lower(current, 1)

    # Add a new column to separate train and test
    history['train_flag'] = 1
    current['train_flag'] = 0

    # Manually map the historical column names to the current column names
    history = history.rename(index=str,
                             columns={
                                 'zipcode': 'addrzip',
                                 'loanamnt': 'loanamount',
                                 'fundedamnt': 'fundedamount',
                                 'verificationstatus': 'isincv',
                                 'verificationstatusjoint': 'isincvjoint',
                                 'numacctsever120pd': 'numacctsever120ppd'
                             })

    # Extract the column names common to history and current
    common_columns = list(set(history.columns) & set(current.columns))
    common_columns.extend(['loanstatus'])

    # Only keep the common columns
    history = history[common_columns]
    current['loanstatus'] = np.nan
    current = current[common_columns]

    # Convert some datatypes in history
    history.intrate = pr.rstring_to_num(history.intrate, '%', 'float')
    history.revolutil = pr.rstring_to_num(history.revolutil, '%', 'float')
    history.earliestcrline = pr.str_dt_num(history.earliestcrline, '%b-%y', 1)
    history.emplength = pr.emplength_num(history.emplength, 1)
    history.term = history.term.str[:3].astype('int')

    # Convert some datatypes in current
    current.earliestcrline = pr.str_dt_num(current.earliestcrline, '%Y-%m-%d', 0)
    current.emplength = pr.emplength_num(current.emplength, 0)

    # Only select 3-year loans for history and current.
    # Only select fully paid or charged-off loans.
    history = history[(history.term == 36)
                      & ((history.loanstatus == 'Fully Paid')
                         | (history.loanstatus == 'Charged Off'))]
    history['loanstatus'] = history.loanstatus.map({
        'Fully Paid': 0,
        'Charged Off': 1
    })
    current = current[(current.term == 36)]

    # Combine history and current. The remaining conversions are applied to
    # both at once.
    history = history.reset_index().drop(columns='id')
    current = current.reset_index().drop(columns='index')
    total = pd.concat([history, current], axis=0)

    # Lower-case the data in lower_col; strip '_' from the data in underscore_col
    lower_col = [
        'applicationtype', 'disbursementmethod', 'initialliststatus', 'isincv',
        'isincvjoint'
    ]
    underscore_col = ['isincv', 'isincvjoint']
    total[lower_col] = pr.lower(total[lower_col], 0)
    total[underscore_col] = pr.underscore(total[underscore_col], 0)

    # Convert description into numeric: with description: 1; no description: 0
    total.desc = pr.desc_num(total.desc)

    # Convert the first three digits of the zipcode to numeric
    total.addrzip = pr.rstring_to_num(total.addrzip, 'x', 'int')

    # If not time stringent, lemmatize emptitle
    if long == 1:
        total.emptitle = pr.Lemmatizer(total.emptitle)

    # Unify some spellings
    total.emptitle = total.emptitle.str.replace('tecnician', 'technician')
    total.emptitle = total.emptitle.str.replace('registered ', '')

    # Frequency-encode 'emptitle', 'addrzip', 'addrstate'
    total = pr.frequency_encoding(total, 'emptitle')
    total = pr.frequency_encoding(total, 'addrzip')
    total = pr.frequency_encoding(total, 'addrstate')

    # Ordinal-encode 'grade', 'subgrade'
    total = pr.ordinal_encoding(total, 'grade', 'subgrade')

    # Feature engineering: the absolute income remaining after debt is paid
    total['remain_income_abs'] = total['annualinc'] * (100 - total['dti']) / 100
    total = pd.merge(total, dollar_df, how='left', on='addrstate')

    # Apply the dollar-value adjustment to 'annualinc' and 'remain_income_abs'
    total['annualinc'] = total['annualinc'] * total['dollar_value']
    total['remain_income_abs'] = total['remain_income_abs'] * total['dollar_value']

    # Sort the column names
    total = total.reindex(sorted(total.columns), axis=1)

    # Write an intermediate output if requested
    if save == 1:
        total.to_csv(os.path.join(path, 'total_bf_one_hot_encoding.csv'))

    # Separate features into three types: numerical, categorical, and all-null
    all_null_feature_h, num_feature_h, ob_feature_h = pr.feature_separation(
        total, total.columns)

    # Remove 'addrstate', 'emptitle', and 'secappearliestcrline' from the
    # categorical feature list
    ob_feature_h.remove('addrstate')
    ob_feature_h.remove('emptitle')
    ob_feature_h.remove('secappearliestcrline')

    # Sort the column names
    total = total.reindex(sorted(total.columns), axis=1)

    # If the lengths match, one-hot encode the remaining categorical features
    if (len(ob_feature_h) + len(num_feature_h) + 4) == len(total.columns):
        total = pd.concat(
            [total[num_feature_h],
             pd.get_dummies(total[ob_feature_h])], axis=1)

    # Write the encoded output if requested
    if save == 1:
        total.to_csv(os.path.join(path, 'total_one_hot_encoded.csv'))

    return total
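# Hedged usage sketch (illustrative only, not from the original script): the
# combined frame returned above can be split back into train and test sets via
# the train_flag column added at the start of preprocessing(). This assumes
# train_flag and loanstatus survive the one-hot encoding step as numeric columns.
def split_train_test(total):
    train = total[total['train_flag'] == 1].drop(columns='train_flag')
    test = total[total['train_flag'] == 0].drop(columns=['train_flag', 'loanstatus'])
    return train, test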