# NOTE(review): this chunk is a whitespace-mangled notebook export collapsed
# onto one physical line. It holds the tails of two functions whose `def`
# headers fell outside this chunk (the bare `return`s do not parse at top
# level), followed by top-level site feature engineering. The function
# headers below are reconstructed -- confirm names/signatures against the
# original notebook before merging.


def _print_cv_rmse(model, trainingData):
    """Print the 10-fold cross-validated RMSE of *model* on the training data.

    Reconstructed function tail: uses the IPython ``%time`` magic to time the
    scoring run; returns None.
    """
    get_ipython().magic("time print(np.sqrt(-cross_val_score(model, trainingData, trainingData['isSpam'], cv=10, scoring='mean_squared_error')).mean())")
    return


def _best_random_search(model, paramDistribution, trainingData):
    """Run 20 restarts of RandomizedSearchCV; print and return the best run.

    Each restart draws 10 parameter settings (n_iter=10), scored by 10-fold
    CV accuracy. Returns ``{'score': rounded_best_score, 'params': dict}`` of
    the highest-scoring restart.
    """
    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10,
                                  scoring='accuracy', n_iter=10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model of this restart
        bestRun.append({'score': round(rand.best_score_, 3),
                        'params': rand.best_params_})
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])


# read site
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)
# Drop columns not used as model features.
# NOTE(review): the original list contained 'theme' twice; deduplicated here
# (pandas drops the label once either way, so behavior is unchanged).
siteModified = rawSite.drop(
    ['dismissedOnboarding', 'feedCounter', 'feedToken', 'modules', 'password',
     'theme', 'photoId', 'requestAccess', 'requestPassword', 'bi', 'photo',
     'goFundMe', 'lastName', 'numAmps', 'partner', 'size',
     'createFormSessionId', 'allowList', 'blockList', 'displayEmail',
     'isPhotoOrderingFixed', 'healthCondition', 'spam', 'status', 'firstName',
     'lastInvite', 'isDeleted', 'hasCommentFix', 'age'],
    axis=1)
# Expand the 'cm' sub-document; fillna(-1) marks a missing viewport width
# (vpw), which is taken to mean the client never executed JavaScript.
viewPort = siteModified.cm.apply(pd.Series).fillna(-1)
siteModified['hasJavaScriptOn'] = [0 if vp == -1 else 1 for vp in viewPort.vpw]
siteModified.drop(['cm'], axis=1, inplace=True)
# Replace free-text fields by their lengths.
siteModified['descriptionLen'] = rawSite.description.str.len()
siteModified.drop(['description'], axis=1, inplace=True)
siteModified['nameLen'] = rawSite.name.str.len()
siteModified.drop(['name'], axis=1, inplace=True)
siteModified['titleLen'] = rawSite.title.str.len()
# NOTE(review): mangled notebook chunk. Evaluates the fitted predictor on the
# held-out set, then loads journals and site ids. `test` and `dfWithClass`
# are defined in an earlier (duplicated) chunk of this file.
# NOTE(review): concat aligns on index -- assumes `test` carries a default
# RangeIndex matching dfWithClass; otherwise reset_index first. TODO confirm.
final = pd.concat([test, dfWithClass], axis=1)

# take a look at the confusion matrix
print(pd.crosstab(final.isSpam, final.predictedClass))
print("0s: %d, 1s: %d" % (np.sum((final.isSpam == 0) & (final.predictedClass == 0)),
                          np.sum((final.isSpam == 1) & (final.predictedClass == 1))))
print("Accuracy: %.3f" % float(np.sum(final.isSpam == final.predictedClass) /
                               float(len(test))))
# BUG FIX(review): precision is TP / predicted positives; the original divided
# by actual positives (np.sum(final.isSpam == 1)), which is recall. The
# numerator is cast to float first so the division cannot floor under Py2.
print("Precision: %.3f" % (float(np.sum((final.isSpam == 1) & (final.predictedClass == 1))) /
                           np.sum(final.predictedClass == 1)))

# read journals
rawJournals = read_mongo(db='CB', collection='journal', host='localhost')
journals = pd.DataFrame(list(rawJournals['body']), columns=['content'])
journals['siteId'] = rawJournals['siteId']
# Model text = title + body.
journals['text'] = rawJournals['title'].astype(str) + ' ' + journals['content']
journals.drop(['content'], inplace=True, axis=1)

# read siteIds
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)
siteIds = pd.DataFrame(list(rawSite['_id']), columns=['siteId'])
siteIds['isSpam'] = rawSite['isSpam']
# Unlabelled sites are treated as non-spam.
siteIds.isSpam.fillna(0, inplace=True)
siteIds.rename(columns={'isSpam': 'isSiteSpam'}, inplace=True)

# spam data from file
# NOTE(review): the original chunk was truncated after the path argument;
# completed with `usecols` as in the duplicate chunk later in this file.
octSiteProfileSpam = pd.read_csv(
    "/Users/dmurali/Documents/spamlist_round25_from_20150809_to_20151015.csv",
    usecols=['siteId', 'isSpam'])
# NOTE(review): mangled chunk that begins mid-expression -- the original first
# token continues `rand = RandomizedSearchCV(model,` inside a function whose
# `def` header and opening statements (`bestRun = []`, `for _ in range(20):`)
# were lost. Reconstructed below from the duplicate chunk earlier in this
# file; confirm the original signature before merging.


def _best_random_search_run(model, paramDistribution, trainingData):
    """Run 20 restarts of RandomizedSearchCV; print and return the best run.

    Each restart draws 10 parameter settings (n_iter=10), scored by 10-fold
    CV accuracy. Returns ``{'score': rounded_best_score, 'params': dict}`` of
    the highest-scoring restart.
    """
    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10,
                                  scoring='accuracy', n_iter=10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model of this restart
        bestRun.append({'score': round(rand.best_score_, 3),
                        'params': rand.best_params_})
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])


# read site
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)
# Drop columns not used as model features ('theme' was listed twice in the
# original; deduplicated -- pandas drops the label once either way).
siteModified = rawSite.drop(
    ['dismissedOnboarding', 'feedCounter', 'feedToken', 'modules', 'password',
     'theme', 'photoId', 'requestAccess', 'requestPassword', 'bi', 'photo',
     'goFundMe', 'lastName', 'numAmps', 'partner', 'size',
     'createFormSessionId', 'allowList', 'blockList', 'displayEmail',
     'isPhotoOrderingFixed', 'healthCondition', 'spam', 'status', 'firstName',
     'lastInvite', 'isDeleted', 'hasCommentFix', 'age'],
    axis=1)
# Expand the 'cm' sub-document; fillna(-1) marks a missing viewport width
# (vpw), which is taken to mean the client never executed JavaScript.
viewPort = siteModified.cm.apply(pd.Series).fillna(-1)
siteModified['hasJavaScriptOn'] = [0 if vp == -1 else 1 for vp in viewPort.vpw]
siteModified.drop(['cm'], axis=1, inplace=True)
# Replace free-text fields by their lengths.
siteModified['descriptionLen'] = rawSite.description.str.len()
siteModified.drop(['description'], axis=1, inplace=True)
siteModified['nameLen'] = rawSite.name.str.len()
# NOTE(review): mangled notebook chunk. Fits the predictor, evaluates it on
# the held-out set, then loads journals, site ids and the labelled spam CSV.
# `predictor`, `train` and `test` are defined outside this chunk.
predictor.fit(train, train['isSpam'])
predicted = predictor.predict(test)
dfWithClass = pd.DataFrame(predicted, columns=['predictedClass'])
# NOTE(review): concat aligns on index -- assumes `test` carries a default
# RangeIndex matching dfWithClass; otherwise reset_index first. TODO confirm.
final = pd.concat([test, dfWithClass], axis=1)

# take a look at the confusion matrix
print(pd.crosstab(final.isSpam, final.predictedClass))
print("0s: %d, 1s: %d" % (np.sum((final.isSpam == 0) & (final.predictedClass == 0)),
                          np.sum((final.isSpam == 1) & (final.predictedClass == 1))))
print("Accuracy: %.3f" % float(np.sum(final.isSpam == final.predictedClass) /
                               float(len(test))))
# BUG FIX(review): precision is TP / predicted positives; the original divided
# by actual positives (np.sum(final.isSpam == 1)), which is recall. The
# numerator is cast to float first so the division cannot floor under Py2.
print("Precision: %.3f" % (float(np.sum((final.isSpam == 1) & (final.predictedClass == 1))) /
                           np.sum(final.predictedClass == 1)))

# read journals
rawJournals = read_mongo(db='CB', collection='journal', host='localhost')
journals = pd.DataFrame(list(rawJournals['body']), columns=['content'])
journals['siteId'] = rawJournals['siteId']
# Model text = title + body.
journals['text'] = rawJournals['title'].astype(str) + ' ' + journals['content']
journals.drop(['content'], inplace=True, axis=1)

# read siteIds
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)
siteIds = pd.DataFrame(list(rawSite['_id']), columns=['siteId'])
siteIds['isSpam'] = rawSite['isSpam']
# Unlabelled sites are treated as non-spam.
siteIds.isSpam.fillna(0, inplace=True)
siteIds.rename(columns={'isSpam': 'isSiteSpam'}, inplace=True)

# spam data from file
octSiteProfileSpam = pd.read_csv(
    "/Users/dmurali/Documents/spamlist_round25_from_20150809_to_20151015.csv",
    usecols=['siteId', 'isSpam'])