def build_model():
    """Train the binary sentiment classifier, pickle it and plot its ROC curve.

    Reads the tab-separated file ``lib/data/train.tsv``, keeps only the rows
    whose ``Sentiment`` is 0 or 4, and derives a ``Binary`` target column
    (0 when ``Sentiment`` is 0, otherwise 1). The ``Phrase`` column is used to
    fit the model's vectorizer and is then transformed into the feature
    matrix. The model is trained on the training part of ``train_test_split``
    and evaluated (ROC plot) on the held-out part.

    No ``random_state`` is passed to ``train_test_split``, so the split is
    not pinned by this function.
    NOTE(review): ``NLPModel`` is defined elsewhere; see it for where the
    pickled artefacts are written.
    """
    model = NLPModel()

    with open('lib/data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    # Copy the filtered rows so that adding a column below does not write
    # into a slice of `data` (avoids pandas' SettingWithCopyWarning).
    pos_neg = data[data['Sentiment'].isin([0, 4])].copy()
    # Vectorized replacement for a row-wise apply: 0 stays 0, 4 becomes 1.
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype(int)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')

    # The target column is 'Binary' (the original looked up 'Binaryy').
    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    # Evaluate on the held-out split; features and labels must come from the
    # same partition (the original paired X_train with y_test).
    model.plot_roc(X_test, y_test)
def build_model():
    """Train the binary sentiment classifier, pickle it and plot its ROC curve.

    Reads the tab-separated file ``lib/data/train.tsv``, keeps only the rows
    whose ``Sentiment`` is 0 or 4, and derives a ``Binary`` target column
    (0 when ``Sentiment`` is 0, otherwise 1). The ``Phrase`` column is used to
    fit the model's vectorizer and is then transformed into the feature
    matrix. The model is trained on the training part of ``train_test_split``
    and its ROC curve is plotted for the held-out part with
    ``size_x=12, size_y=12``.

    No ``random_state`` is passed to ``train_test_split``, so the split is
    not pinned by this function.
    """
    model = NLPModel()

    with open('lib/data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    # Copy the filtered rows so that adding a column below does not write
    # into a slice of `data` (avoids pandas' SettingWithCopyWarning).
    pos_neg = data[data['Sentiment'].isin([0, 4])].copy()
    # Vectorized replacement for the row-wise apply: 0 stays 0, 4 becomes 1.
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype(int)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    model.plot_roc(X_test, y_test, size_x=12, size_y=12)
def build_model():
    """Build and pickle both models.

    Runs the sentiment-classifier build first, then the diamond price
    regressor build. Each step prints a completion message.
    """
    _build_sentiment_classifier()
    _build_diamond_regressor()


def _build_sentiment_classifier(train_data_path='lib/data/train.tsv'):
    """Train the sentiment classifier and vectorizer, then pickle both.

    Args:
        train_data_path: Path of the tab-separated training file. It must
            provide the ``Sentiment`` and ``Phrase`` columns.

    Only rows whose ``Sentiment`` is 0 or 4 are used; the ``Binary`` target is
    0 when ``Sentiment`` is 0 and 1 otherwise. Training uses the training part
    of ``train_test_split``; the held-out part is not used here.
    """
    model = NLPModel()

    with open(train_data_path) as f:
        data = pd.read_csv(f, sep='\t')

    # Copy the filtered rows so that adding a column below does not write
    # into a slice of `data` (avoids pandas' SettingWithCopyWarning).
    pos_neg = data[data['Sentiment'].isin([0, 4])].copy()
    # Vectorized replacement for the row-wise apply: 0 stays 0, 4 becomes 1.
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype(int)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    # Only the training partition is needed; the rest is discarded.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    model.pickle_clf()
    model.pickle_vectorizer()
    print('Sentiment Classifier Built')


def _build_diamond_regressor():
    """Train the diamond price predictor on ``carat`` and ``cut`` and pickle it.

    Loads the ``diamonds`` dataset through ``sns.load_dataset``, splits it
    with ``random_state=0``, maps the ``cut`` labels to the ranks 1-5 and
    trains ``DiamondPredictor`` to predict ``price``.
    """
    model_two = DiamondPredictor()

    df = sns.load_dataset('diamonds')
    # Same split as before (random_state=0); the test partition was never
    # used after the split, so it is discarded.
    train, _ = train_test_split(df, random_state=0)
    # Work on an owned copy so the column assignment below is unambiguous.
    train = train.copy()

    cut_ranks = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
    train['cut'] = train['cut'].map(cut_ranks)

    features = ['carat', 'cut']
    target = 'price'

    model_two.train(train[features], train[target])
    model_two.pickle_model()
    print('Diamond Regressor Built')
def build_model():
    """Train the ``is_fitara`` text classifier and pickle it with its vectorizer.

    Prerequisite: unzip the data files into the folder where this file is
    saved, so that ``extract_combined.csv`` and ``labels.csv`` can be read
    from the working directory.

    The two CSV files are joined on ``document_name``. Every character of
    ``text`` that is not an ASCII letter is replaced by a space, the result
    is passed through ``applyLemmatizer``, and ``is_fitara`` is encoded with
    ``LabelEncoder``. The model is trained on the training part of
    ``train_test_split``; the held-out part is not used here.
    """
    model = NLPModel()

    df_extract_combined = pd.read_csv('extract_combined.csv')
    df_labels = pd.read_csv('labels.csv')
    df_final = pd.merge(df_extract_combined, df_labels, on='document_name')

    # Take an explicit copy: the columns are rewritten below, and writing
    # into a selection of `df_final` is unreliable (SettingWithCopyWarning,
    # and a silent no-op for chained `df[col][i] = ...` under copy-on-write).
    df_text_data = df_final[['text', 'is_fitara']].copy()

    # One vectorized pass instead of an index loop with chained assignment.
    letters_only = df_text_data['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
    df_text_data['text'] = letters_only.apply(applyLemmatizer)
    # Stop-word removal (`stopwords`) is currently disabled.

    le = LabelEncoder()
    df_text_data['is_fitara'] = le.fit_transform(df_text_data['is_fitara'])

    texts = df_text_data.loc[:, 'text']
    model.vectorizer_fit(texts)
    X = model.vectorizer_transform(texts)

    y = df_text_data.loc[:, 'is_fitara']

    # Only the training partition is needed; the rest is discarded.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)

    model.pickle_clf()
    model.pickle_vectorizer()
def build_model():
    """Train the ``is_fitara`` text classifier and pickle it with its vectorizer.

    Reads ``extract_combined.csv`` and ``labels.csv`` from the working
    directory and merges them on their shared columns. ``is_fitara`` is
    mapped from ``'Yes'``/``'No'`` to 1/0. The ``text`` column of the merged
    frame is used to fit the vectorizer and build the feature matrix. The
    model is trained on the training part of ``train_test_split``; the
    held-out part is not used here.

    Raises:
        KeyError: If ``is_fitara`` contains a value other than ``'Yes'`` or
            ``'No'``.
    """
    model = NLPModel()

    data = pd.read_csv('extract_combined.csv')
    # `error_bad_lines=False` (with its default warning) was removed in
    # pandas 2.0; `on_bad_lines='warn'` is the equivalent: malformed lines
    # are dropped and a warning is emitted for each of them.
    data2 = pd.read_csv('labels.csv', on_bad_lines='warn')
    merged = pd.merge(data, data2)

    yn = {'Yes': 1, 'No': 0}
    merged['is_fitara'] = [yn[label] for label in merged['is_fitara']]

    # Features and labels must come from the same frame. Taking the text
    # from `data` while the labels come from `merged` misaligns them as soon
    # as the merge drops or duplicates rows.
    texts = merged.loc[:, 'text']
    model.vectorizer_fit(texts)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(texts)
    print('Vectorizer transform complete')

    y = merged.loc[:, 'is_fitara']

    # Only the training partition is needed; the rest is discarded.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
def build_model():
    """Train the binary sentiment classifier and pickle it with its vectorizer.

    Reads the tab-separated file ``./data/train.tsv`` and prints its column
    index. Only the rows whose ``Sentiment`` is 0 or 4 are kept, and a
    ``Binary`` target column is derived (0 when ``Sentiment`` is 0, otherwise
    1). The ``Phrase`` column is used to fit the model's vectorizer and is
    then transformed into the feature matrix. The model is trained on the
    training part of ``train_test_split``; the held-out part is not used
    here.
    """
    model = NLPModel()

    with open('./data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')
    print(data.columns)

    # Copy the filtered rows so that adding a column below does not write
    # into a slice of `data` (avoids pandas' SettingWithCopyWarning).
    pos_neg = data[(data['Sentiment'] == 0) | (data['Sentiment'] == 4)].copy()
    pos_neg['Binary'] = np.where(pos_neg['Sentiment'] == 0, 0, 1)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    # Only the training partition is needed; the rest is discarded.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()