def build_model():
    model = NLPModel()

    # filename = os.path.join(
    #     os.path.dirname(__file__), 'chalicelib', 'all/train.tsv')
    with open('lib/data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    # Keep only the strongly negative (0) and strongly positive (4) phrases
    pos_neg = data[(data['Sentiment'] == 0) | (data['Sentiment'] == 4)]

    pos_neg['Binary'] = pos_neg.apply(
        lambda x: 0 if x['Sentiment'] == 0 else 1, axis=1)

    model.vectorizer_fit(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    model.plot_roc(X_test, y_test, size_x=12, size_y=12)
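These build scripts all rely on an NLPModel class defined elsewhere. The sketch below only reconstructs the interface they call (vectorizer_fit, vectorizer_transform, train, pickle_clf, pickle_vectorizer, plot_roc); the TfidfVectorizer, MultinomialNB, and pickle paths are placeholder assumptions, not the actual implementation.

# Hypothetical sketch of the NLPModel interface assumed by the snippets here.
# The vectorizer/classifier choices and file paths are placeholders.
import pickle
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import RocCurveDisplay

class NLPModel:
    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.clf = MultinomialNB()

    def vectorizer_fit(self, X):
        # Learn the vocabulary from the raw text
        self.vectorizer.fit(X)

    def vectorizer_transform(self, X):
        # Turn raw text into the sparse feature matrix the classifier expects
        return self.vectorizer.transform(X)

    def train(self, X, y):
        self.clf.fit(X, y)

    def pickle_clf(self, path='lib/models/SentimentClassifier.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(self.clf, f)

    def pickle_vectorizer(self, path='lib/models/TFIDFVectorizer.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(self.vectorizer, f)

    def plot_roc(self, X, y, size_x=8, size_y=8):
        # Plot the ROC curve for a held-out split
        fig, ax = plt.subplots(figsize=(size_x, size_y))
        RocCurveDisplay.from_estimator(self.clf, X, y, ax=ax)
        plt.show()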
def build_model():
    model = NLPModel()

    with open('lib/data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    pos_neg = data[(data['Sentiment'] == 0) | (data['Sentiment'] == 4)]

    pos_neg['Binary'] = pos_neg.apply(
        lambda x: 0 if x['Sentiment'] == 0 else 1, axis=1)

    model.vectorizer_fit(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    model.plot_roc(X_test, y_test)
def build_model():
    # builds sentiment classifier and vectorizer
    model = NLPModel()

    train_data_dir = 'lib/data/train.tsv'
    with open(train_data_dir) as f:
        data = pd.read_csv(f, sep='\t')

    pos_neg = data[(data['Sentiment'] == 0) | (data['Sentiment'] == 4)]

    pos_neg['Binary'] = pos_neg.apply(
        lambda x: 0 if x['Sentiment'] == 0 else 1, axis=1)

    model.vectorizer_fit(pos_neg.loc[:, 'Phrase'])

    X = model.vectorizer_transform(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)

    model.pickle_clf()
    model.pickle_vectorizer()
    print('Sentiment Classifier Built')

    # builds diamond price predictor
    model_two = DiamondPredictor()

    df = sns.load_dataset('diamonds')
    train, test = train_test_split(df.copy(), random_state=0)

    cut_ranks = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
    train.cut = train.cut.map(cut_ranks)
    test.cut = test.cut.map(cut_ranks)

    features = ['carat', 'cut']
    target = 'price'

    model_two.train(train[features], train[target])
    model_two.pickle_model()
    print('Diamond Regressor Built')
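The DiamondPredictor used above is likewise not shown. A minimal sketch of the interface it would need (train and pickle_model), assuming a plain scikit-learn regressor and a placeholder pickle path:

# Hypothetical sketch of the DiamondPredictor interface: a thin regressor
# wrapper following the same train/pickle pattern as NLPModel.
import pickle
from sklearn.linear_model import LinearRegression

class DiamondPredictor:
    def __init__(self):
        self.model = LinearRegression()

    def train(self, X, y):
        self.model.fit(X, y)

    def pickle_model(self, path='lib/models/DiamondPredictor.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(self.model, f)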
def build_model():
    model = NLPModel()

    # Unzip the data files into the folder where this file is saved
    # before executing the statements below.
    df_extract_combined = pd.read_csv('extract_combined.csv')
    df_labels = pd.read_csv('labels.csv')

    df_final = pd.merge(df_extract_combined, df_labels, on='document_name')
    df_text_data = df_final[['text', 'is_fitara']].copy()

    # Strip non-alphabetic characters, then lemmatize each document
    df_text_data['text'] = df_text_data['text'].apply(
        lambda t: re.sub('[^a-zA-Z]', ' ', t))
    df_text_data['text'] = df_text_data['text'].apply(applyLemmatizer)
    # df_text_data['text'] = df_text_data['text'].apply(stopwords)

    # Encode the Yes/No labels as 1/0
    le = LabelEncoder()
    df_text_data['is_fitara'] = le.fit_transform(df_text_data['is_fitara'])

    model.vectorizer_fit(df_text_data.loc[:, 'text'])
    # print('Vectorizer fit complete')
    X = model.vectorizer_transform(df_text_data.loc[:, 'text'])
    # print('Vectorizer transform complete')

    y = df_text_data.loc[:, 'is_fitara']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    # print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
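applyLemmatizer is not defined in this snippet; a plausible stand-in, assuming NLTK's WordNetLemmatizer and simple whitespace tokenization, could be:

# Hypothetical helper assumed by the snippet above: lemmatize each token
# with NLTK's WordNetLemmatizer (requires the 'wordnet' corpus,
# e.g. nltk.download('wordnet')).
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def applyLemmatizer(text):
    tokens = text.lower().split()
    return ' '.join(_lemmatizer.lemmatize(token) for token in tokens)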
def train_model():
    path = 'lib/data/imdb_labelled.txt'
    data = pd.read_csv(path, sep='\t', header=None)
    data.columns = ['text', 'score']

    reviews_train, reviews_test, y_train, y_test = train_test_split(
        data['text'],
        data['score'],
        test_size=0.2,
    )

    model = NLPModel()
    model.fit_vectorizer(reviews_train)

    X_train = model.transform_vectorizer(reviews_train)
    X_test = model.transform_vectorizer(reviews_test)

    model.train(X_train, y_train)
    model.report_accuracy(X_test, y_test, 'lib/model/accuracy')

    model.pickle_vectorizer()
    model.pickle_clf()
def build_model():
    model = NLPModel()

    data = pd.read_csv('extract_combined.csv')
    data2 = pd.read_csv('labels.csv', error_bad_lines=False)
    merged = pd.merge(data, data2)

    # Map the Yes/No labels to 1/0
    yn = {'Yes': 1, 'No': 0}
    merged.is_fitara = [yn[i] for i in merged.is_fitara]

    # Fit and transform on the merged frame so X and y stay aligned
    model.vectorizer_fit(merged.loc[:, 'text'])
    print('Vectorizer fit complete')
    X = model.vectorizer_transform(merged.loc[:, 'text'])
    print('Vectorizer transform complete')

    y = merged.loc[:, 'is_fitara']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
def build_model():
    model = NLPModel()

    with open('./data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')
    print(data.columns)

    pos_neg = data[(data['Sentiment'] == 0) | (data['Sentiment'] == 4)]

    pos_neg['Binary'] = np.where(pos_neg['Sentiment'] == 0, 0, 1)

    model.vectorizer_fit(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
def build_model():
    model = NLPModel()

    with open('data/train.csv') as f:
        data = pd.read_csv(f, sep=',', header=0)

    # Use only the clearly negative (<= 2 stars) and clearly positive (>= 4 stars) reviews.
    # For this example, we only predict positive or negative sentiment from the extreme cases.
    pos_neg = data[(data['Rating'] <= 2) | (data['Rating'] >= 4)]

    # Relabel as 0 for negative and 1 for positive
    pos_neg['Binary'] = pos_neg.apply(
        lambda x: 0 if x['Rating'] <= 2 else 1, axis=1)

    # Fit a vectorizer to the vocabulary in the dataset
    pos_neg.dropna(subset=['Review Text'], inplace=True)
    X = model.vectorizer_fit_transform(pos_neg.loc[:, 'Review Text'])
    print('Vectorizer fit transform complete')

    y = pos_neg.loc[:, 'Binary']

    # Split X and y into training and testing sets.
    # By default, train_test_split keeps 75% for training and 25% for test;
    # random_state=1 for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # print(X_train.shape)
    # print(X_test.shape)
    # print(y_train.shape)
    # print(y_test.shape)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
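Each variant is typically run once as a script entry point to produce the pickled vectorizer and classifier that a serving layer loads later; a minimal driver, assuming the chosen function lives in a module such as build_model.py, might be:

# Minimal driver sketch (module and file names are assumptions):
# train once, persist the artifacts, and exit.
if __name__ == '__main__':
    build_model()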