def MakePrediction(nr, title, location, price, nr_pics, description, nr_links,
                   contact, note, coord, car_model, cylinders, drive, fuel,
                   odometer, color, car_size, car_status, transmission,
                   car_type):
    """Score a listing and suggest edits that raise its predicted sale chance.

    Loads the pickled classifier and category dictionaries that belong to the
    car model named in ``car_model``, builds the 19-value feature vector in
    the column order ``trainModel`` fits on, and reads the second column of
    ``predict_proba`` as the sale probability.  It then re-scores variations
    of the listing (picture count, price, description length, sentence/word
    ratio and lexical diversity) and turns the first variation that reaches
    the highest, strictly larger percentage into advice messages.

    Args:
        nr: Day horizon; selects the classifier file
            ``RFLearner<Model>NoTextNYC<nr>.p`` and is quoted in the advice
            header.
        title: Listing title; only its whitespace-separated word count is
            used.
        location: Listing location; lower-cased and looked up in the pickled
            location dictionary (unknown values map to -100).
        price: Asking price (numeric).
        nr_pics: Number of pictures (numeric).
        description: Listing text, as UTF-8 bytes or as text.
        nr_links: Passed through as the ``NrLinks`` feature.
        contact: Passed through as the ``Contact`` feature.
        note: Accepted for interface compatibility; not used.
        coord: Passed through as the ``COORD`` feature.
        car_model: String that starts with the model year and contains one of
            "honda civic", "toyota camry" or "nissan altima" (any case).
        cylinders: Encoded as 0 when it is the empty string, else 1.
        drive: Encoded as 0 when it is the empty string, else 1.
        fuel: Encoded as 1 when equal to ``'gas'``, else 0.
        odometer: Mileage; values that cannot be read as a number become
            -10000, the placeholder ``trainModel`` uses for missing readings.
        color: Looked up in the pickled color dictionary (unknown: -100).
        car_size: Accepted for interface compatibility; not used.
        car_status: Encoded as 1 when equal to ``'clean'``, else 0.
        transmission: Encoded as 1 when equal to ``'automatic'``, else 0.
        car_type: Looked up in the pickled car-type dictionary
            (unknown: -100).

    Returns:
        A tuple ``(prob, messages)``.  ``prob`` is the integer percentage for
        the listing as given.  ``messages`` is a list of advice strings,
        preceded by a header line quoting the best percentage found; it is
        empty when no variation scored strictly higher.

    Raises:
        ValueError: If ``car_model`` names no supported model or does not
            start with an integer year.
    """
    import itertools

    # Phrase searched for in ``car_model`` -> tag used in the pickle names.
    # Order matters: the first matching phrase wins.
    model_tags = (
        ("honda civic", "HondaCivic"),
        ("toyota camry", "ToyotaCamry"),
        ("nissan altima", "NissanAltima"),
    )
    model_key = car_model.lower()
    tag = next((t for phrase, t in model_tags if phrase in model_key), None)
    if tag is None:
        raise ValueError("Unsupported car model: {!r}".format(car_model))

    try:
        # trainModel stores the year as an int; a string here would also
        # break the numeric comparison used for the odometer fix-up below.
        year = int(car_model.split()[0])
    except ValueError:
        raise ValueError(
            "car_model must start with the model year, got {!r}".format(
                car_model))

    def load_pickle(path):
        # NOTE: pickle.load can execute code stored in the file, so these
        # paths must only ever point at trusted, locally produced files.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    learner = load_pickle("RFLearner" + tag + "NoTextNYC" + str(nr) + ".p")
    color_dict = load_pickle("Color_" + tag + "_Dict.p")
    cartype_dict = load_pickle("CarType_" + tag + "_Dict.p")
    location_dict = load_pickle("LocationDict_" + tag + ".p")

    # NOTE(review): trainModel subtracts 1 from the title word count; the
    # count is kept as-is here because the reason for that offset is not
    # visible in this file -- confirm.
    len_title = len(title.split())
    len_description = len(description.split())

    # Categories never seen during training get the sentinel -100.
    color = color_dict.get(color, -100)
    car_type = cartype_dict.get(car_type, -100)
    # trainModel lower-cases locations (and maps missing ones to "") before
    # building the dictionary, so the lookup key is normalised the same way.
    location = location_dict.get((location or "").lower(), -100)

    car_status = 1 if car_status == 'clean' else 0
    cylinders = 0 if cylinders == "" else 1
    drive = 0 if drive == "" else 1
    transmission = 1 if transmission == "automatic" else 0
    fuel = 1 if fuel == 'gas' else 0

    try:
        odometer = float(odometer)
    except (TypeError, ValueError):
        odometer = -10000  # same placeholder trainModel uses
    else:
        if odometer != odometer:  # NaN
            odometer = -10000
        elif odometer < 1000 and year < 2015:
            # Mirrors trainModel: small readings on older cars are treated
            # as thousands (e.g. 178 meaning 178000).
            odometer *= 1000

    if isinstance(description, bytes):
        text = description.decode('utf-8')
    else:
        text = description
    # Divided once, exactly as trainModel computes the training column.
    ratio_sentences_words = nlp.sentence_count(text) / (len_description + 1.)
    lex_diversity = nlp.lexical_diversity(text)

    # The classifier receives bare values, so this order must match the
    # column order trainModel fits on (LexDiversity precedes
    # RatioSentencesWords).
    X1 = pd.Series(
        [contact, nr_pics, price, len_title, len_description, coord,
         cylinders, drive, fuel, odometer, color, car_status, transmission,
         year, car_type, nr_links, lex_diversity, ratio_sentences_words,
         location],
        index=['Contact', 'NrPics', 'Price', 'LenTitle', 'LenDescription',
               'COORD', 'Cylinders', 'Drive', 'Fuel', 'Odometer', 'Color',
               'CarStatus', 'Transmission', 'Year', 'CarType', 'NrLinks',
               'LexDiversity', 'RatioSentencesWords', 'LocationCats'])

    def sale_percent(features):
        # One sample as a (1, n_features) matrix; second column -> percent.
        proba = learner.predict_proba(features.values.reshape(1, -1))
        return int(proba.ravel()[1] * 100)

    prob = sale_percent(X1)

    # Step s scales a quantity by (0.5 * s + 0.5): steps 0, 1, 3 mean half,
    # unchanged, double; steps 0, 1, 2 mean 0.5x, 1x, 1.5x.  Price step p
    # means a discount of 5 * p percent.
    half_or_double = (0, 1, 3)
    # With no pictures every picture variation is identical, so only the
    # neutral step is tried and no picture advice can be produced.
    pic_steps = half_or_double if nr_pics else (1,)

    max_prob = prob
    best_pic_index = 1
    best_price_index = 0  # 0 == no discount
    best_desc_index = 1
    best_ratio_index = 1
    best_lex_index = 1

    X1S = X1.copy()
    grid = itertools.product(pic_steps, range(3), half_or_double, range(3),
                             range(3))
    for ch_pic, pr, len_des, rs, ld in grid:
        X1S['NrPics'] = int((ch_pic * 0.5 + 0.5) * nr_pics)
        X1S['Price'] = (100 - pr * 5) * 0.01 * price
        X1S['LenDescription'] = int((len_des * 0.5 + 0.5) * len_description)
        X1S['RatioSentencesWords'] = (rs * 0.5 + 0.5) * ratio_sentences_words
        X1S['LexDiversity'] = (ld * 0.5 + 0.5) * lex_diversity
        prob1 = sale_percent(X1S)
        if prob1 > max_prob:
            max_prob = prob1
            best_pic_index = ch_pic
            best_price_index = pr
            best_desc_index = len_des
            best_ratio_index = rs
            best_lex_index = ld

    messages = []
    if best_pic_index == 0:
        messages.append("- Include half as many pictures. \n")
    elif best_pic_index == 3:
        messages.append("- Include twice as many pictures. \n")
    if best_price_index == 1:
        messages.append(
            "- Reduce the price by 5 percent to {} dollars. \n".format(
                price * 0.95))
    elif best_price_index == 2:
        messages.append(
            "- Reduce the price by 10 percent to {} dollars. \n".format(
                price * 0.9))
    if best_desc_index == 0:
        messages.append(
            "- Reduce the number of words in the description by 50 percent. \n")
    elif best_desc_index == 3:
        messages.append(
            "- Include twice as many words in the description. \n")
    # A lower sentences-per-word ratio means longer sentences.
    if best_ratio_index == 0:
        messages.append("- Use longer sentences in the description. \n")
    elif best_ratio_index == 2:
        messages.append("- Use shorter sentences in the description. \n")
    if best_lex_index == 0:
        messages.append("- Formulate a less lexically diverse description.\n")
    elif best_lex_index == 2:
        messages.append(
            "- Formulate a more lexically diverse description. \n")
    if messages:
        messages.insert(
            0,
            "Your car will sell within {} days with a probability of {} percent if you \n"
            .format(nr, max_prob))
    return (prob, messages)
def trainModel(car_model="Toyota Camry", save_learners=False):
    """Tune, report and plot random-forest sale classifiers for one car model.

    Reads the listings table for ``car_model`` from the ``LocalClassifieds``
    database, derives the 19 feature columns, pickles the category and
    location dictionaries, grid-searches ``max_features`` / ``max_depth`` by
    10-fold cross-validated ROC AUC for every day horizon n = 1..20, writes
    the best settings to a text file, and saves a mean-ROC plot for
    n = 3, 6, ..., 15.

    Every table and file name is derived from ``car_model`` with its spaces
    removed (``tag`` below), so one run only touches one model's artefacts:
    ``Color_<tag>_Dict.p``, ``CarType_<tag>_Dict.p``,
    ``LocationDict_<tag>.p``, ``BestParams<tag>_NY_NoText.txt``,
    ``ROC_RF_NoText_<tag>_NYC.pdf`` and, optionally,
    ``RFLearner<tag>NoTextNYC<n>.p``.

    Args:
        car_model: Model name made of letters, digits and spaces, e.g.
            "Toyota Camry" or "Nissan Altima".  It is matched
            case-insensitively against the ``CarModel`` column.
        save_learners: When true, fit a 500-tree forest per day horizon on
            all rows and pickle it.  Off by default, which skips those fits.

    Raises:
        ValueError: If ``car_model`` contains anything other than letters,
            digits and spaces (it is spliced into the SQL table name).
    """
    tag = car_model.replace(" ", "")
    if not tag.isalnum():
        raise ValueError(
            "car_model may only contain letters, digits and spaces, "
            "got {!r}".format(car_model))

    def as_text(raw):
        # Descriptions may arrive as UTF-8 bytes or as already decoded text.
        return raw.decode('utf-8') if isinstance(raw, bytes) else raw

    def dump_pickle(obj, path):
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)

    # NOTE(review): database credentials are hard-coded here; consider
    # moving them to configuration.
    con = mdb.connect('localhost', 'charlotte', 'insight', 'LocalClassifieds')
    try:
        # A table name cannot be bound as a query parameter; ``tag`` was
        # checked to be alphanumeric above.
        df = pd.read_sql("SELECT * FROM " + tag, con)
    finally:
        con.close()

    is_model = df['CarModel'].str.contains(car_model, case=False, na=False)
    # reset_index returns a fresh frame, so the column assignments below do
    # not write into a slice of ``df``.
    data = df[(df['SoldDays'] > 0) & is_model].reset_index(drop=True)
    print("Number of training items: {}".format(len(data)))

    data['Year'] = [int(model.split(" ")[0]) for model in data['CarModel']]
    # NOTE(review): convert_objects is deprecated in newer pandas; kept to
    # match the library versions the rest of this file targets.
    data['Odometer'] = data['Odometer'].convert_objects(convert_numeric=True)
    # Often 178000 is entered as 178.
    data.loc[(data['Odometer'] < 1000) & (data['Year'] < 2015),
             'Odometer'] *= 1000
    data.loc[data['Odometer'].isnull(), 'Odometer'] = -10000

    # Text features: title length, description length, sentences per word
    # and lexical diversity.
    # NOTE(review): MakePrediction counts title words without this -1.
    data['LenTitle'] = data['Title'].str.split().apply(len) - 1
    data['LenDescription'] = data['Description'].str.split().apply(len)
    descriptions = [as_text(text) for text in data['Description'].values]
    data['RatioSentencesWords'] = [nlp.sentence_count(text)
                                   for text in descriptions]
    data['RatioSentencesWords'] = (data['RatioSentencesWords'] /
                                   (data['LenDescription'] + 1.))
    data['LexDiversity'] = [nlp.lexical_diversity(text)
                            for text in descriptions]

    # Binary encodings.
    data['Fuel'] = (data['Fuel'] == 'gas').astype(int)
    data['Transmission'] = (data['Transmission'] == 'automatic').astype(int)
    data['Cylinders'] = (data['Cylinders'] != "").astype(int)
    data['Drive'] = (data['Drive'] != "").astype(int)
    data['CarStatus'] = (data['CarStatus'] == "clean").astype(int)

    # Integer codes 1..k per category, pickled for MakePrediction.
    for col in ('Color', 'CarType'):
        cat_dict = {cat: code
                    for code, cat in enumerate(data[col].unique(), 1)}
        data[col] = data[col].map(cat_dict)
        dump_pickle(cat_dict, col + "_" + tag + "_Dict.p")

    # Locations seen fewer than 5 times share code 0.
    data['Location'] = data['Location'].fillna("").str.lower()
    loc_cts = data['Location'].value_counts()
    loc_dict = {}
    for code, loc in enumerate(data['Location'].unique(), 1):
        loc_dict[loc] = code if loc_cts[loc] >= 5 else 0
    data['LocationCats'] = data['Location'].map(loc_dict)
    dump_pickle(loc_dict, "LocationDict_" + tag + ".p")

    feature_names = ['Contact', 'NrPics', 'Price', 'LenTitle',
                     'LenDescription', 'COORD', 'Cylinders', 'Drive', 'Fuel',
                     'Odometer', 'Color', 'CarStatus', 'Transmission', 'Year',
                     'CarType', 'NrLinks', 'LexDiversity',
                     'RatioSentencesWords', 'LocationCats']
    X = data[feature_names].values

    def sold_within(days):
        # 1 when the listing sold within ``days`` days, else 0.
        data['Sold'] = (data['SoldDays'] <= days).astype(int)
        return data['Sold']

    max_scores = np.zeros(31)
    best_feat = np.zeros(31)
    best_depth = np.zeros(31)
    # Feature importances of the best setting, keyed by day horizon.
    best_importances = {}

    with open('BestParams' + tag + '_NY_NoText.txt', 'w+') as params:
        for n in range(1, 21):
            sold = sold_within(n)
            print(sold.value_counts())
            YC = sold.values
            for max_feat in range(2, 16, 2):
                for depth in range(30, 110, 20):
                    n_folds = 10
                    cv = sklearn.cross_validation.StratifiedKFold(
                        YC, n_folds, shuffle=True)
                    scores = np.zeros(n_folds)
                    for f, (train, test) in enumerate(cv):
                        learner = RandomForestClassifier(
                            n_estimators=300, max_depth=depth,
                            max_features=max_feat)
                        learner.fit(X[train, :], YC[train])
                        probs = learner.predict_proba(X[test, :])
                        # A fold trained on a single class has one
                        # probability column and scores 0.
                        if probs.shape[1] > 1:
                            fpr, tpr, _ = sklearn.metrics.roc_curve(
                                YC[test], probs[:, 1])
                            scores[f] = sklearn.metrics.auc(fpr, tpr)
                    if np.mean(scores) > max_scores[n]:
                        max_scores[n] = np.mean(scores)
                        best_feat[n] = max_feat
                        best_depth[n] = depth
                        # Importances come from the last fold's forest.
                        best_importances[n] = learner.feature_importances_
            summary = ("For n = {}, best AUC is {} for max feat = {}, "
                       "max depth = {}").format(n, max_scores[n],
                                                best_feat[n], best_depth[n])
            print(summary)
            params.write(summary + "\n")
            coeff = best_importances.get(n)
            if coeff is not None:
                # Most important feature first.
                for index in np.argsort(coeff)[::-1]:
                    print("{} {}".format(feature_names[index], coeff[index]))

    for n in range(1, 21):
        print("{} {} {}".format(n, best_depth[n], best_feat[n]))
        if not save_learners:
            continue
        if n not in best_importances:
            print("No usable parameters for n = {}; not saving a "
                  "learner".format(n))
            continue
        learner = RandomForestClassifier(n_estimators=500,
                                         max_depth=int(best_depth[n]),
                                         max_features=int(best_feat[n]))
        learner.fit(X, sold_within(n).values)
        dump_pickle(learner, "RFLearner" + tag + "NoTextNYC" + str(n) + ".p")

    for n in range(3, 18, 3):
        if n not in best_importances:
            print("No usable parameters for n = {}; skipping its ROC "
                  "curve".format(n))
            continue
        YC = sold_within(n).values
        n_folds = 5
        mean_fpr = np.linspace(0, 1, 100)
        mean_tpr = np.zeros_like(mean_fpr)
        cv = sklearn.cross_validation.StratifiedKFold(YC, n_folds,
                                                      shuffle=True)
        for train, test in cv:
            learner = RandomForestClassifier(n_estimators=1000,
                                             max_depth=int(best_depth[n]),
                                             max_features=int(best_feat[n]))
            learner.fit(X[train, :], YC[train])
            probs = learner.predict_proba(X[test, :])
            if probs.shape[1] > 1:
                fpr, tpr, _ = sklearn.metrics.roc_curve(YC[test],
                                                        probs[:, 1])
                mean_tpr += interp(mean_fpr, fpr, tpr)
                mean_tpr[0] = 0.0
        mean_tpr /= n_folds
        mean_tpr[-1] = 1.0
        mean_auc = sklearn.metrics.auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr, mean_tpr, '-',
                 label='Mean CV ROC d =%d (AUC = %0.2f) ' % (n, mean_auc),
                 lw=2)

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC ' + car_model + ': Prob. of sale within d days')
    plt.legend(loc="lower right")
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.savefig('ROC_RF_NoText_' + tag + '_NYC.pdf')