def __CVD( data, exp ):
    kn = Evaluation.Evaluator()
    X, Y, y_raw = Features.getSamples( kn, data )
    data.maxWords = 10000
    kf = StratifiedKFold( n_splits=10, shuffle=True )
    k = 0
    for train, test in kf.split( X, y_raw ):
        print( "K-Fold: " + str( k + 1 ) )
        x_train_raw, x_test_raw = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        x_train, x_test = Features.getCVvectors( x_train_raw, x_test_raw, data )
        denseSizes = [512, 1024]
        batches = [64, 128]
        dropouts = [2, 3]
        param_grid = dict( batch_size=batches, denseSize=denseSizes, dropout=dropouts,
                           input_length=[len(x_train[0])], output_length=[len(y_train[0])] )
        model = KerasClassifier(build_fn=Models.create_ann_model, epochs=30, verbose=2)
        y_ints = [y.argmax() for y in y_train]
        cweights = class_weight.compute_class_weight( 'balanced', np.unique( y_ints ), y_ints )
        grid = GridSearchCV(estimator=model, param_grid=param_grid)
        grid_result = grid.fit(x_train, y_train, class_weight=cweights)
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        model, scores = kn.evaluateModel( x_test, y_test, grid.best_estimator_.model, data, k )
        k = k + 1
    kn.saveResults( exp )
def feat5(train, test):
    train_valence, test_valence = feat1(train, test)
    puncter, train_punct = Features.punctuation(train)
    _, test_punct = Features.punctuation(test, vectorizer=puncter)
    train_matrix = Features.append_features([train_valence, train_punct])
    test_matrix = Features.append_features([test_valence, test_punct])
    return train_matrix, test_matrix
def __EmbeddedRNN( data, exp, filepath, network ):
    kn = Evaluation.Evaluator()
    X, Y, y_raw = Features.getSamples( kn, data )
    data.maxWords = 10000
    kf = StratifiedKFold( n_splits=10, shuffle=True )
    k = 0
    for train, test in kf.split( X, y_raw ):
        print( "K-Fold: " + str( k + 1 ) )
        x_train_raw, x_test_raw = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        Models.embedding, x_train, x_test = Features.getEmbedded( x_train_raw, x_test_raw,
                                                                  y_train, y_test, y_raw, filepath, kn )
        batches = [64, 218]
        neurons = [100, 200]
        dropouts = [2, 3]
        param_grid = dict( batch_size=batches, neuron=neurons, dropout=dropouts,
                           output_size=[len(y_train[0])] )
        if network == 'lstm':
            model = KerasClassifier(build_fn=Models.create_lstm_model, epochs=30, verbose=2)
        else:
            model = KerasClassifier(build_fn=Models.create_gru_model, epochs=30, verbose=2)
        y_ints = [y.argmax() for y in y_train]
        cweights = class_weight.compute_class_weight( 'balanced', np.unique( y_ints ), y_ints )
        grid = GridSearchCV(estimator=model, param_grid=param_grid)
        grid_result = grid.fit(x_train, y_train, class_weight=cweights)
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        model, scores = kn.evaluateModel( x_test, y_test, grid.best_estimator_.model, data, k )
        k = k + 1
    kn.saveResults( exp )
def feat2(train, test):
    state_info, train_matrix = Features.tfIdfSkLearn(train, stop_words='english')
    _, test_matrix = Features.tfIdfSkLearn(test, vectorizer=state_info, stop_words='english')
    return train_matrix, test_matrix
def fromContracts(cls, names, lookback, interval):
    adjustedContracts = AdjustedContracts.initialize(names)
    DATA = [[[]]]
    TARGETS = [[[]]]
    for adjustedContract in adjustedContracts:
        dataPoints, targets = Features.concatenateDataPoints(
            Features.computeFeatures(adjustedContract), lookback)
        DATA.append(dataPoints)
        TARGETS.append(targets)
        print(adjustedContract.name)
    DATA.pop(0)
    TARGETS.pop(0)
    Export.exportTXT(
        DATA,
        "C:/Users/dream/Desktop/4th Year/Final Project/DATA BACKUP/MLDataSet/nonsplitDataPoints_"
        + str(lookback) + ".txt")
    Export.exportTXT(
        TARGETS,
        "C:/Users/dream/Desktop/4th Year/Final Project/DATA BACKUP/MLDataSet/nonsplitTargets_"
        + str(lookback) + ".txt")
    return cls(DATA, TARGETS, interval)
def Process1(df):
    pri_id = "企业名称"
    res = pd.DataFrame()
    res[pri_id] = df[pri_id].unique()
    # convert currency
    df = prep.Convert_money(df)
    # extract registered-capital features (max, min, mean, variance)
    res = pd.merge(res, fea.GetValAvg(df, pri_id, "注册资金(元)"), on=pri_id)
    res = pd.merge(res, fea.GetValMaxMin(df, pri_id, "注册资金(元)"), on=pri_id)
    res = pd.merge(res, fea.GetValVar(df, pri_id, "注册资金(元)"), on=pri_id)
    # extract categorical features
    num_fea = ['注册资金(元)', "出资比例"]
    cat_fea = [col for col in df.columns if col != pri_id and col not in num_fea]
    for col in cat_fea:
        res = pd.merge(res, fea.GetCategroicalCount(df, pri_id, col), on=pri_id)
    # count rows with missing legal-representative and chief-representative flags
    res = pd.merge(res, fea.GetValNaCount(df, pri_id, "法定代表人标志", "姓名"), on=pri_id)
    res = pd.merge(res, fea.GetValNaCount(df, pri_id, "首席代表标志", "姓名"), on=pri_id)
    # count the number of people holding each position
    res = pd.merge(res, fea.CatRowsToCols(df, pri_id, "职务", "姓名"))
    # extract capital-contribution-ratio features (max, min, mean, variance)
    res = pd.merge(res, fea.GetValAvg(df, pri_id, "出资比例"), on=pri_id)
    res = pd.merge(res, fea.GetValMaxMin(df, pri_id, "出资比例"), on=pri_id)
    res = pd.merge(res, fea.GetValVar(df, pri_id, "出资比例"), on=pri_id)
    return res
def _F_LabelCount():
    if os.path.exists(data_path + "data/_F_d_label_count.feather"):
        df = feather.read_dataframe(data_path + "data/_F_d_label_count.feather")
        return df
    temp = pd.DataFrame()
    temp[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))
    # trans_type1
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'trans_type1', 'ced62357ad496957', 0.8),
                      on=pri_id, how='left')
    # trans_type2
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'trans_type2', 104, 0.8),
                      on=pri_id, how='left')
    # amt_src1
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'amt_src1', 'c4ec9622cf5c6e55', 0.8),
                      on=pri_id, how='left')
    # amt_src2
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'amt_src2', 'c4ec9622cf5c6e55', 0.8),
                      on=pri_id, how='left')
    feather.write_dataframe(temp, data_path + "data/_F_d_label_count.feather")
    return temp
def user_features():
    """
    Controller which handles features like similar users, sentiment analysis, associated tags
    :return: corresponding page according to user's selection of feature
    """
    cnx = Connection.connectToDatabase(config.db_config)
    if request.method == 'POST':
        if 'SU' in request.form:
            users_list = Features.similarUsers(cnx, session.get("user", None), session.get("domain", None))
            return render_template('similarusers.html', users_list=users_list)
        elif 'AT' in request.form:
            tags_list = Features.tagsAssoPerson(cnx, session.get("user", None), session.get("domain", None))
            return render_template('associatedtags.html', tags_list=tags_list)
        elif 'PS' in request.form:
            mentions, sentiment = Features.peopleSaying(cnx, session.get("user", None))
            influence = Features.whatsMyInfluence(cnx, session.get("user", None))
            viral = Features.viralUserTweets(cnx, session.get("user", None))
            return render_template("sentiment.html", mentions=mentions, sentiment=sentiment,
                                   influence=influence, viral=viral)
def feat6_generic(train, test, train_pos, test_pos):
    train_f5, test_f5 = feat5(train, test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"], tf_idf=True)
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"], vectorizer=cter, tf_idf=True)
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
def WStack():
    import Wesleyan
    data = Wesleyan.Wesleyan()
    kn = Evaluation.Evaluator()
    exp = "WStack"
    filepath = 'enwiki_20180420_300d.txt'
    X, Y, y_raw = Features.getSamples( kn, data )
    data.maxWords = 10000
    kf = StratifiedKFold( n_splits=10, shuffle=True )
    k = 0
    for train, test in kf.split( X, y_raw ):
        print( "K-Fold: " + str( k + 1 ) )
        x_train_raw, x_test_raw = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        y_ints = [y.argmax() for y in y_train]
        cweights = class_weight.compute_class_weight( 'balanced', np.unique( y_ints ), y_ints )
        Models.embedding, x_train, x_test = Features.getEmbedded( x_train_raw, x_test_raw,
                                                                  y_train, y_test, y_raw, filepath, kn )

        # level-0 model 1: CNN on the embedded sequences; return the predictions so they
        # are visible outside this helper (assigning them inside would not update the
        # enclosing scope)
        def do_cnn():
            model1 = Models.create_cnn_model( pool_size=3, layer_size=128, output_size=len(y_train[0]) )
            model1.fit(x_train, y_train, epochs=15, verbose=2, batch_size=32, class_weight=cweights)
            return model1.predict( x_train, verbose=0 ), model1.predict( x_test, verbose=0 )

        y_pred_train, y_pred_test = do_cnn()

        # level-0 model 2: ANN on the embedded sequences
        model2 = Models.create_ann_model( dropout=3, denseSize=512, output_length=len(y_train[0]) )
        model2.fit(x_train, y_train, batch_size=64, verbose=2, epochs=30, class_weight=cweights)
        y_pred_train2 = model2.predict( x_train, verbose=0 )
        y_pred_test2 = model2.predict( x_test, verbose=0 )
        model2 = None

        # level-0 model 3: ANN on count vectors; batch_size belongs to fit, and the
        # length arguments are passed as ints as in the model 2 call above
        x_train, x_test = Features.getCVvectors( x_train_raw, x_test_raw, data )
        model3 = Models.create_ann_model( denseSize=1024, dropout=3,
                                          input_length=len(x_train[0]), output_length=len(y_train[0]) )
        model3.fit(x_train, y_train, epochs=100, batch_size=64, class_weight=cweights)
        y_pred_train3 = model3.predict( x_train, verbose=0 )
        y_pred_test3 = model3.predict( x_test, verbose=0 )

        # stack the level-0 predictions and train the level-1 model
        new_x_train = np.stack( (y_pred_train, y_pred_train2, y_pred_train3), axis=-1 )
        new_x_test = np.stack( (y_pred_test, y_pred_test2, y_pred_test3), axis=-1 )
        model = Models.create_stack_model( input_size=len(new_x_train[0]), output_size=len(y_train[0]) )
        history = model.fit(new_x_train, y_train, epochs=100, verbose=2, batch_size=128,
                            class_weight=cweights)
        model, scores = kn.evaluateModel( new_x_test, y_test, model, data, k )
        k = k + 1
    kn.saveResults( exp )
def display_data(self):
    logging.info('DISPLAYING TEXELS')
    Features.show_texel_list(self.texel_features)
    self.mytimer.tick()
    logging.info('DISPLAYING DONE')
def feat7(train, test):
    normal_train, train_pos = map(list, zip(*train))
    normal_test, test_pos = map(list, zip(*test))
    train_f5, test_f5 = feat5(normal_train, normal_test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"], tf_idf=True,
                                            ngram_range=(1, 2), stop_words='english')
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"], vectorizer=cter, tf_idf=True,
                                        ngram_range=(1, 2), stop_words='english')
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
def feat4(train, test):
    # feature set 3
    train_f3, test_f3 = feat3(train, test)
    # punctuation
    puncter, train_punct = Features.punctuation(train)
    _, test_punct = Features.punctuation(test, vectorizer=puncter)
    train_matrix = Features.append_features([train_f3, train_punct])
    test_matrix = Features.append_features([test_f3, test_punct])
    return train_matrix, test_matrix
def feat7(train, test):
    # feature set 5
    train_f5, test_f5 = feat5(train, test)
    # bag of words
    bower, train_bow = Features.bagOfWordsSkLearn(train)
    _, test_bow = Features.bagOfWordsSkLearn(test, vectorizer=bower)
    train_matrix = Features.append_features([train_f5, train_bow])
    test_matrix = Features.append_features([test_f5, test_bow])
    return train_matrix, test_matrix
def feat3(train, test):
    # valence info
    train_valence, test_valence = feat1(train, test)
    # tf idf info
    train_cts, test_cts = feat2(train, test)
    # combined info
    train_matrix = Features.append_features([train_valence, train_cts])
    test_matrix = Features.append_features([test_valence, test_cts])
    return train_matrix, test_matrix
def feat6(train, test):
    normal_train, train_pos = map(list, zip(*train))
    normal_test, test_pos = map(list, zip(*test))
    train_f5, test_f5 = feat5(normal_train, normal_test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"], tf_idf=True)
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"], vectorizer=cter, tf_idf=True)
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
def extra_features(train, test):
    # uni and bigrams
    state_info, train_ngrams = Features.wordCountsSkLearn(train, ngram_range=(1, 2), stop_words='english')
    _, test_ngrams = Features.wordCountsSkLearn(test, vectorizer=state_info,
                                                ngram_range=(1, 2), stop_words='english')
    # valence and punctuation
    train_valence_punct, test_valence_punct = feat5(train, test)
    # train matrix
    train_matrix = Features.append_features([train_ngrams, train_valence_punct])
    test_matrix = Features.append_features([test_ngrams, test_valence_punct])
    return train_matrix, test_matrix
def __init__(self):
    self.feature_vec = [features.CrossTermX1X3(), features.SinX2(),
                        features.SquareX4(), features.Identity()]
    self.feature_weights = [0.1, -2, -0.3, 3]
    self.noise_model = noise.NoiseModel()
    self.max_x1 = 10
    self.max_x2 = 10
    self.max_x3 = 10
    self.max_x4 = 10
    self.saver = saver.DataSaver('data', 'submission_data.pkl')
def count_labels(outpath):
    tw_cts = Counter(Features.getY(tw))
    blog_cts = Counter(Features.getY(blog))
    cts = zip(["twitter+wiki", "blog"], [tw_cts, blog_cts])
    # Write out to csv
    with open(outpath, 'w') as labels_histo_file:
        for src, counter in cts:
            for k, v in counter.iteritems():
                labels_histo_file.write("%s,%s,%d\n" % (src, k, v))
    return 0
def feat1(train, test):
    vectorizer, train_matrix = Features.valenceByFrequency(train, vectorizer=None,
                                                           cache_valence=cache_valence,
                                                           stop_words='english')
    _, test_matrix = Features.valenceByFrequency(test, vectorizer=vectorizer,
                                                 cache_valence=cache_valence,
                                                 stop_words='english')
    return train_matrix, test_matrix
def getSingleFeatureLineFromFile(file, decisions, shot, leave_out_class=None):
    """
    This is a less troublesome but slow method to get a featureLine.
    """
    beatList, context = getContextAndBeatListFromFile(file)
    blockList = coalesceBeats(beatList)
    Features.initializeContextVars(context)
    lastShotId, context, blockList = applyDecisionsToBeatscript(context, blockList, decisions)
    featureLine = getFeatureLine(context, blockList[len(decisions)], shot, lastShotId, leave_out_class)
    return featureLine
def createDataLine(context, block, leaveout=-1):
    dataLine = [str(block[0].shotId) + "_" + str(block[0].beatId), str(block[0].shot)]
    featureClassList = Features.getAllFeatureClasses()
    context = Features.createBeatList(context, block)
    for featureClass in featureClassList:
        feature = featureClass(context, block)
        dataLine += feature.getNumbers()
        # activate to generate a human readable featureLine
        #dataLine.append(feature.getText())
    if leaveout >= 0:
        dataLine.pop(leaveout)
    return dataLine
def _F_nunique_ratio(n=3):
    if os.path.exists(data_path + "data/_F_nunique_ratio.feather"):
        df = feather.read_dataframe(data_path + "data/_F_nunique_ratio.feather")
        return df
    temp = pd.DataFrame()
    temp[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))
    # add day and month slices
    df = pd.concat((op_info[[pri_id, 'day']], trans_info[[pri_id, 'day']]))
    month = _F.Day2Month(df, 'day')
    temp = pd.merge(temp,
                    _F.TopN_col_distinct_ratio(month[[pri_id, 'month', 'day']], pri_id, 'month', 'day', n),
                    on=pri_id, how='left')
    df = pd.concat((op_info[[pri_id, 'time', 'day']], trans_info[[pri_id, 'time', 'day']]))
    df['day_period'] = df['time'].apply(_F.TimeInterval)
    temp = pd.merge(temp,
                    _F.TopN_col_distinct_ratio(df[[pri_id, 'day_period', 'day']], pri_id,
                                               'day_period', 'day', n))
    # topN ratio
    for col in ['geo_code', 'ip1', 'ip1_sub', 'mac1', 'merchant', 'ip2', 'ip2_sub',
                'mode', 'mac2', 'os', 'channel', 'trans_type1', 'trans_type2',
                'code1', 'code2', 'market_code', 'market_type',
                'device_code1', 'device_code2', 'device_code3', 'wifi']:
        if col in ['merchant', 'channel', 'trans_type1', 'trans_type2',
                   'code1', 'code2', 'market_code', 'market_type']:
            df = trans_info[[pri_id, col, 'day']]
        elif col in ['ip2', 'ip2_sub', 'mode', 'mac2', 'os', 'wifi']:
            df = op_info[[pri_id, col, 'day']]
        else:
            df = pd.concat((op_info[[pri_id, col, 'day']], trans_info[[pri_id, col, 'day']]))
        temp = pd.merge(temp, _F.TopN_col_distinct_ratio(df, pri_id, col, 'day', n),
                        on=pri_id, how='left')
    feather.write_dataframe(temp, data_path + "data/_F_nunique_ratio.feather")
    return temp
def flag_tweet():
    """
    Controller which handles tweet delete.
    :return: same page i.e refreshes the page
    """
    cnx = Connection.connectToDatabase(config.db_config)
    if request.method == 'POST':
        Features.remove_tweet(cnx, list(request.form.keys())[0])
    cnx2 = Connection.connectToDatabase(config.db_config)
    toptweets = Features.topTen(cnx2, session.get("domain", None))
    return render_template('tweetTest.html', your_list=toptweets[0], userlist=toptweets[1])
def classify(data, weights, featureSet, algorithm):
    length = Features.getLength(featureSet)
    results = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        if algorithm == 1:
            vector = Features.getVector(data[i, 0], featureSet)
            vector.append(length)
            results[i] = predict_one(weights, vector, 0)
        else:
            vector = Features.getVector(data[i, 0], featureSet)
            results[i] = predict_one(weights, vector, length)
    return results
def train(model, training, keys, pca_num=None):
    if model == "1nn":
        model = OneNN()
    elif model == "rf":
        model = makeRF()
    training = SymbolData.normalize(training, 99)
    f = Features.features(training)
    pca = None
    if pca_num is not None:
        pca = sklearn.decomposition.PCA(n_components=pca_num)
        pca.fit(f)
        f = pca.transform(f)
    # fit on the (possibly PCA-transformed) feature matrix
    model.fit(f, SymbolData.classNumbers(training, keys))
    return (model, pca)
def createFeatureLines(context, beatList, shot, leave_out_class=None):
    """
    Returns the list of featureLines converted from the Beats in beatList
    """
    featureLines = []
    blockList = coalesceBeats(beatList)
    Features.initializeContextVars(context)
    lastShotId = -1
    for block in blockList:
        featureLines.append(getFeatureLine(context, block, shot, lastShotId, leave_out_class))
        context["BygoneBlocks"].append(block)
        lastShotId = block[-1].shotId
    return featureLines
def getFeatureNames(leave_out_class=None):
    """
    Returns an array of feature names corresponding to the featureLine.
    """
    names = []
    featureClassList = Features.getAllFeatureClasses()
    context = createContext()
    dummy_beat = Beat("0_1\tfull_shot\tfalse\tintroduce\tperson§Nobody", context)
    context = Features.createBeatList(context, [dummy_beat])
    Features.initializeContextVars(context)
    for featureClass in [x for x in featureClassList if x != leave_out_class]:
        feature = featureClass(context, [dummy_beat])
        names += feature.getNames()
    return names
def getSingleFeatureLine(context, blockList, decisions, shot, leave_out_class=None):
    """
    Returns a featureLine based on the context and the decisions.
    """
    Features.initializeContextVars(context)
    lastShotId, context, blockList = applyDecisionsToBeatscript(context, blockList, decisions)
    # This is for preventing the classifier to cheat by using the correct class.
    # Activate if you're suspicious.
    #for beat in blockList[len(decisions)]:
    #    beat.shot = 0
    featureLine = getFeatureLine(context, blockList[len(decisions)], shot, lastShotId, leave_out_class)
    return featureLine
def home_page():
    """
    Controller for accessing different domains and creating tweet and user objects
    :return: homepage template
    """
    cnx = Connection.connectToDatabase(config.db_config)
    if request.method == 'POST':
        # form key -> domain id; the original branches differed only in these two values
        domains = [('ML', 1), ('DB', 2), ('SE', 3), ('PR', 4), ('CC', 5)]
        for key, domain in domains:
            if key in request.form:
                Tweets.search_tweets(cnx, domain, Connection.defineDomain(domain)[1], 5)
                session["domain"] = domain
                cnx2 = Connection.connectToDatabase(config.db_config)
                toptweets = Features.topTen(cnx2, domain)
                return render_template('tweetTest.html', your_list=toptweets[0], userlist=toptweets[1])
    elif request.method == 'GET':
        return render_template('home.html')
def apply_moving_entropy(self, input_column, dest_column=None, row_range=(0, None),
                         window=10, no_of_bins=5):
    '''
    Apply moving entropy as another column
    :param input_column: Required column to add feature engineering
    :param dest_column: Destination column name
    :param row_range: Range of rows that need to modify
    :param window: Window size of the calculation takes part
    :param no_of_bins: Number of discrete levels
    :return: None
    '''
    if dest_column is None:
        dest_column = input_column + '_mentr_' + str(window) + '_' + str(no_of_bins)
    full_series = list(self._pd_frame[input_column])
    filtered_series = full_series[row_range[0]:row_range[1]]
    result = Features.moving_entropy(series=filtered_series, window=window,
                                     no_of_bins=no_of_bins, default=True)
    full_series[row_range[0]:row_range[1]] = result
    self.add_column(column_name=dest_column, series=full_series)
def apply_moving_median(self, input_column, dest_column=None, row_range=(0, None), window=5):
    '''
    Add moving median as another column
    :param input_column: Required column to add feature engineering
    :param row_range: Range of rows that need to modify
    :param window: Window size of the calculation takes part
    :param dest_column: Destination column name
    :return: None
    '''
    if dest_column is None:
        dest_column = input_column + '_mm_' + str(window)
    full_series = list(self._pd_frame[input_column])
    filtered_series = full_series[row_range[0]:row_range[1]]
    result = Features.moving_median(series=filtered_series, window=window, default=True)
    full_series[row_range[0]:row_range[1]] = result
    self.add_column(column_name=dest_column, series=full_series)
def apply_moving_weighted_average(self, input_column, dest_column=None, row_range=(0, None),
                                  window=5, weights=[1, 2, 3, 4, 5]):
    '''
    Apply moving weighted average as another column
    :param input_column: Required column to add feature engineering
    :param dest_column: Destination column name
    :param row_range: Range of rows that need to modify
    :param window: Window size of the calculation takes part
    :param weights: list of integers
    :return: None
    '''
    if dest_column is None:
        dest_column = input_column + '_mwa_' + str(window)
    full_series = list(self._pd_frame[input_column])
    filtered_series = full_series[row_range[0]:row_range[1]]
    result = Features.moving_weighted_average(series=filtered_series, window=window,
                                              weights=weights, default=True)
    full_series[row_range[0]:row_range[1]] = result
    self.add_column(column_name=dest_column, series=full_series)
def apply_moving_median_centered_average(self, input_column, dest_column=None, row_range=(0, None),
                                         window=5, boundary=1):
    '''
    Apply moving median centered average as another column
    :param input_column: Required column to add feature engineering
    :param dest_column: Destination column name
    :param row_range: Range of rows that need to modify
    :param window: Window size of the calculation takes part
    :param boundary: number of values that need to be removed from both ends of the sorted window
    :return: None
    '''
    if dest_column is None:
        dest_column = input_column + '_mmca_' + str(window)
    full_series = list(self._pd_frame[input_column])
    filtered_series = full_series[row_range[0]:row_range[1]]
    result = Features.moving_median_centered_average(series=filtered_series, window=window,
                                                     boundary=boundary, default=True)
    full_series[row_range[0]:row_range[1]] = result
    self.add_column(column_name=dest_column, series=full_series)
def apply_moving_k_closest_average(self, input_column, dest_column=None, row_range=(0, None),
                                   window=5, kclosest=3):
    '''
    Apply moving k closest average as another column
    :param input_column: Required column to add feature engineering
    :param dest_column: Destination column name
    :param row_range: Range of rows that need to modify
    :param window: Window size of the calculation takes part
    :param kclosest: k number of closest values to the recent occurrence including itself
    :return: None
    '''
    if dest_column is None:
        dest_column = input_column + '_kca_' + str(window)
    full_series = list(self._pd_frame[input_column])
    filtered_series = full_series[row_range[0]:row_range[1]]
    result = Features.moving_k_closest_average(series=filtered_series, window=window,
                                               kclosest=kclosest, default=True)
    full_series[row_range[0]:row_range[1]] = result
    self.add_column(column_name=dest_column, series=full_series)
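# A minimal usage sketch for the apply_moving_* helpers above (not part of the original
# code): 'frame' is assumed to be an instance of the class that defines these methods and
# wraps a pandas DataFrame in self._pd_frame; the column name 'load' is a hypothetical
# placeholder. Each call appends a derived column whose name encodes the window settings.
def _example_moving_features(frame):
    frame.apply_moving_median('load', window=3)                         # adds 'load_mm_3'
    frame.apply_moving_k_closest_average('load', window=5, kclosest=3)  # adds 'load_kca_5'
    frame.apply_moving_entropy('load', window=5, no_of_bins=4)          # adds 'load_mentr_5_4'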
def TagSentence(self, words, pos):
    if self.nTagged % 500 == 0:
        self.tagger.stdin.close()
        self.tagger.stdout.close()
        #self.tagger.kill()
        os.kill(self.tagger.pid, SIGTERM)  #Need to do this for python 2.4
        self.tagger.wait()
        self.GetTagger()
    features = []
    seq_features = []
    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = self.fe.Extract(words, pos, None, i, False) + [u'DOMAIN=Twitter']
        if quotes[i]:
            features.append(u"QUOTED")
        seq_features.append(" ".join(features))
    #print ("\t".join(seq_features) + "\n").encode('utf8')
    self.tagger.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))
    event_tags = []
    for i in range(len(words)):
        event_tags.append(self.tagger.stdout.readline().rstrip('\n').strip(' '))
    self.nTagged += 1
    return event_tags
def getMatchingExpression(testExpr, renormalize=True):
    matchExprns = []
    if renormalize:
        testExpr = SymbolData.normalizeExprs([testExpr], 299)
        testExpr = testExpr[0]
    for trainData in trainDatas:
        if trainData[2] == len(testExpr.symbols):
            matchExprns.append(trainData)
    #for trainData in trainDatas:
    scoreSCC = NP.zeros((len(matchExprns)))
    # scoreMI = NP.zeros((len(matchExprns)))
    k = 0
    testImg = Features.getImgExpr(testExpr)
    for exprList in matchExprns:
        scoreSCC[k] = scc(testImg, exprList[1])
        # scoreMI[k] = MI(testData[1], exprList[1])
        k += 1
    indSCC = NP.argsort(scoreSCC).astype(int)
    scoreSCC = scoreSCC[indSCC]
    scoreSCC = scoreSCC / scoreSCC[-1]
    # indMI = NP.argsort(scoreMI).astype(int)
    # scoreMI = scoreMI[indMI]
    # scoreMI = scoreMI/scoreMI[-1]
    # matchExprSortSCC = []
    # for i in indSCC:
    #     matchExprSortSCC.append(matchExprns[i])
    #
    # return(matchExprSortSCC[-1])
    return matchExprns[indSCC[-1]]
def perceptron(data, maxIterations, featureSet):
    length = Features.getLength(featureSet)
    rate = 0.1
    weights = np.zeros((1, length + 1))
    i = 0
    while i < maxIterations:
        for j in range(data.shape[0]):
            #print weights
            vector = Features.getVector(data[j, 0], featureSet)
            vector.append(length)
            sign = predict_one(weights, vector, 0)
            if (data[j, 1] == '+' and sign == -1) or (data[j, 1] == '-' and sign == 1):
                for index in vector:
                    weights[0, index] = weights[0, index] - rate * sign
        i += 1
    return weights
def suggest_moves(self, board):
    board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures(
        board, board.color_to_play).astype(np.float32)
    Normalization.apply_featurewise_normalization_C(board_feature_planes)
    feed_dict = {self.feature_planes:
                 board_feature_planes.reshape(1, self.model.N, self.model.N, self.model.Nfeat)}
    move_logits = self.sess.run(self.logits, feed_dict).ravel()  # ravel flattens to 1D
    # zero out illegal moves
    for x in xrange(self.model.N):
        for y in xrange(self.model.N):
            ind = self.model.N * x + y
            if not board.play_is_legal(x, y, board.color_to_play):
                move_logits[ind] = -1e99
    move_probs = softmax(move_logits, self.softmax_temp)
    sum_probs = np.sum(move_probs)
    if sum_probs == 0:
        return []  # no legal moves
    move_probs /= sum_probs  # re-normalize probabilities
    good_moves = []
    cum_prob = 0.0
    while cum_prob < self.threshold_prob:
        ind = np.argmax(move_probs)
        x, y = ind / self.model.N, ind % self.model.N
        good_moves.append((x, y))
        prob = move_probs[ind]
        cum_prob += prob
        move_probs[ind] = 0
    return good_moves
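# suggest_moves and pick_model_move rely on a softmax(logits, temperature) helper that is
# not shown in this section; the sketch below is an assumption about what it computes (a
# numerically stable temperature-scaled softmax over the flattened move logits), not the
# original implementation.
import numpy as np

def softmax(logits, temperature):
    # shift by the max for numerical stability; the constant shift cancels in the ratio
    scaled = (logits - np.max(logits)) / temperature
    exps = np.exp(scaled)
    return exps / np.sum(exps)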
def error_analyze(make_model, train_data, test_data, featurizer):
    matrices = Features.make_experiment_matrices(train_data, test_data, featurizer)
    model = make_model()
    model.fit(matrices['train_X'], matrices['train_Y'])
    bins = [v / 100.0 for v in range(50, 110, 5)]
    ext_preds = Models.extended_predict(model, matrices['test_X'], matrices['test_Y'])
    return Models.error_analysis(ext_preds, bins=bins)
def main():
    beatscriptFile = open(sys.argv[1], "r")
    lines = beatscriptFile.readlines()
    context = readContext(lines)
    beatList = readBeatscript(lines, context)
    blockList = coalesceBeats(beatList)
    Features.initializeContextVars(context)
    dataLines = []
    for block in blockList:
        dataLines.append(createDataLine(context, block))
        context["BygoneBlocks"].append(block)
    outputFile = open(sys.argv[2], "w")
    for dataLine in dataLines:
        outputFile.write(DELIMITER.join([str(x) for x in dataLine]) + "\n")
        #outputFile.write(DELIMITER.join(dataLine) + "\n")
    outputFile.close()
def __init__(self, universe=None, temperature=300):
    if universe is None:
        return
    Features.checkFeatures(self, universe)
    self.universe = universe
    self.temperature = temperature
    self.sqrt_mass = Numeric.sqrt(self.universe.masses().array)
    self.sqrt_mass = self.sqrt_mass[:, Numeric.NewAxis]
    self._forceConstantMatrix()
    ev = self._diagonalize()
    self.imaginary = Numeric.less(ev, 0.)
    self.frequencies = Numeric.sqrt(Numeric.fabs(ev)) / (2.*Units.pi)
    self.sort_index = Numeric.argsort(self.frequencies)
    self._scale(temperature)
    del self.sqrt_mass
def winnow(data, maxIterations, featureSet):
    length = Features.getLength(featureSet)
    threshold = length
    weights = np.ones((1, length))
    i = 0
    while i < maxIterations:
        for j in range(data.shape[0]):
            #print weights
            vector = Features.getVector(data[j, 0], featureSet)
            sign = predict_one(weights, vector, threshold)
            if data[j, 1] == '+' and sign == -1:
                for index in vector:
                    weights[0, index] = weights[0, index] * 2
            elif data[j, 1] == '-' and sign == 1:
                for index in vector:
                    weights[0, index] = weights[0, index] / 2
        i += 1
    return weights
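# classify, perceptron, and winnow above all call a predict_one(weights, vector, threshold)
# helper that is not included in this section. Based on how it is called (vector is a list
# of active feature indices, weights is a 1 x length array), the sketch below is an assumed
# reconstruction for illustration, not the original code.
def predict_one(weights, vector, threshold):
    # sum the weights of the active feature indices and threshold the result to +/-1
    total = sum(weights[0, index] for index in vector)
    return 1 if total > threshold else -1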
def onlineFeatureLineCreator(filename, use_classified_shot=False, use_history=True):
    # load context and complete beatlist from file
    context, beatList = getContextAndBeatListFromFile(filename)
    Features.initializeContextVars(context)
    blockList = coalesceBeats(beatList)
    context["BygoneBlocks"] = []
    for block in blockList:
        shot_true = block[-1].shot
        # get current feature line and true shot class
        featureLine = getFeatureLine(context, block, True, -1)
        features = np.array(featureLine[:-1], dtype=np.float64)
        shot_classified = yield features, shot_true
        # update block and lastShotId
        if use_classified_shot:
            for beat in block:
                beat.shot = shot_classified
        if use_history:
            context["BygoneBlocks"].append(block)
def timer_event(self):
    # get next window
    try:
        ## XXX MARIO
        # data = self.data.next()
        data = self.data.next_in_selection()
    # --> video finished
    except StopIteration:
        return
    y = data.points
    x = range(len(y))
    ## remove old line
    try:
        old_line = self.lines.popleft()
        ## XXX MARIO
        for l in old_line:
            self.ax.lines.remove(l)
    # --> no old lines, so we don't have to remove any
    except IndexError:
        pass
    ## * draw new line *
    line = self.ax.plot(x, y, self.line_type, c=data.color)
    self.lines.append(line)
    ## draw label
    # self.label = self.ax.text(0.02, 0.98, data.label,
    #                           horizontalalignment='left',
    #                           verticalalignment='top',
    #                           color=data.color,
    #                           transform=self.ax.transAxes)
    self.label.set_text(data.label)
    self.label.set_color(data.color)
    ## progress
    self.status_label.set_text(str(data.progress[0]) + " / " + str(data.progress[1]) +
                               ", " + data.label)
    print data.label, Features.calc_mean(y)
    plt.draw()
    ## XXX
    if data.label in ("up", "down"):
        time.sleep(2)
    ## set new timer
    self._set_timer(self.speed)
def __call__(self, **options):
    self.setCallOptions(options)
    Features.checkFeatures(self, self.universe)
    configuration = self.universe.configuration()
    fixed = self.universe.getAtomBooleanArray('fixed')
    nt = self.getOption('threads')
    evaluator = self.universe.energyEvaluator(threads=nt).CEvaluator()
    args = (self.universe, configuration.array, fixed.array, evaluator,
            self.getOption('steps'), self.getOption('step_size'),
            self.getOption('convergence'), self.getActions(),
            'Conjugate gradient minimization with ' +
            self.optionString(['convergence', 'step_size', 'steps']))
    if self.getOption('background'):
        if not threading:
            raise OSError("background processing not available")
        return MinimizerThread(self.universe, conjugateGradient, args)
    else:
        apply(conjugateGradient, args)
def count_unigrams(outpath):
    tw_cter, twitter_cts = Features.wordCountsSkLearn(Features.getX(tw), stop_words='english')
    blog_cter, blog_cts = Features.wordCountsSkLearn(Features.getX(blog), stop_words='english')
    # Total number of non-stop-word unigrams
    unigrams = set(tw_cter.vocabulary_.keys() + blog_cter.vocabulary_.keys())
    print "Data has %d distinct unigrams" % len(unigrams)
    # Distribution of unigram cts
    twitter_unigram_histo = histogram_cts(twitter_cts)
    blog_unigram_histo = histogram_cts(blog_cts)
    unigram_histo = histo_to_tuples(twitter_unigram_histo, 'twitter+wiki') + \
                    histo_to_tuples(blog_unigram_histo, 'blog')
    # Write out to csv
    with open(outpath, 'w') as unigram_histo_file:
        for elem in unigram_histo:
            unigram_histo_file.write("%s,%d,%f\n" % elem)
    return 0
def main():
    arguments = validateInput(sys.argv)
    maxIterations, regularization, stepSize, lmbd, featureSet = arguments
    print maxIterations, regularization, stepSize, lmbd, featureSet

    trainData = readFile('train.csv')
    validationData = readFile('validation.csv')
    testData = readFile('test.csv')
    trainSize = trainData.shape[0]
    validationSize = validationData.shape[0]
    testSize = testData.shape[0]
    print "Number of training examples: " + str(trainSize)

    # Extract Features
    Features.extractFeatures(trainData[:, 0], featureSet)
    print "Extracted Features:"
    if featureSet == 1 or featureSet == 3:
        print "Unigram: " + str(Features.getLength(1))
    if featureSet == 2 or featureSet == 3:
        print "Bigram: " + str(Features.getLength(2))

    # Construct Input Matrices X
    xTrain = Features.getMatrix(trainData[:, 0], featureSet)
    print "Train Matrix built"
    xValidation = Features.getMatrix(validationData[:, 0], featureSet)
    print "Validation Matrix built"
    xTest = Features.getMatrix(testData[:, 0], featureSet)
    print "Test Matrix built"
    yTrain = extractLabel(trainData[:, 1])
    yValidation = extractLabel(validationData[:, 1])
    yTest = extractLabel(testData[:, 1])

    # Train the model
    theta = GD(xTrain, yTrain, trainSize, maxIterations, regularization, stepSize, lmbd, featureSet)
    print "Final Theta: " + str(theta)

    # Classify
    trainResult = predict(xTrain, trainSize, theta, featureSet)
    print "Train Result: " + str(trainResult)
    validationResult = predict(xValidation, validationSize, theta, featureSet)
    print "Validation Result: " + str(validationResult)
    testResult = predict(xTest, testSize, theta, featureSet)
    print "Test Result: " + str(testResult)

    # Performance
    print "\nPerformance on training data:"
    performance(trainResult, trainData[:, 1])
    print "\nPerformance on validation data:"
    performance(validationResult, validationData[:, 1])
    print "\nPerformance on test data:"
    performance(testResult, testData[:, 1])
def pick_model_move(self, color):
    if self.model.Nfeat == 15:
        board_feature_planes = Features.make_feature_planes_stones_3liberties_4history_ko(
            self.board, color)
        Normalization.apply_featurewise_normalization_B(board_feature_planes)
    elif self.model.Nfeat == 21:
        board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures(
            self.board, color).astype(np.float32)
        Normalization.apply_featurewise_normalization_C(board_feature_planes)
    else:
        assert False
    feature_batch = Symmetry.make_symmetry_batch(board_feature_planes)
    feed_dict = {self.feature_planes: feature_batch}
    logit_batch = self.sess.run(self.logits, feed_dict)
    move_logits = Symmetry.average_plane_over_symmetries(logit_batch, self.model.N)
    softmax_temp = 1.0
    move_probs = softmax(move_logits, softmax_temp)
    # zero out illegal moves
    for x in xrange(self.model.N):
        for y in xrange(self.model.N):
            ind = self.model.N * x + y
            if not self.board.play_is_legal(x, y, color):
                move_probs[ind] = 0
    sum_probs = np.sum(move_probs)
    if sum_probs == 0:
        return Move.Pass()  # no legal moves, pass
    move_probs /= sum_probs  # re-normalize probabilities
    pick_best = True
    if pick_best:
        move_ind = np.argmax(move_probs)
    else:
        move_ind = sample_from(move_probs)
    move_x = move_ind / self.model.N
    move_y = move_ind % self.model.N
    self.last_move_probs = move_probs.reshape((self.board.N, self.board.N))
    return Move(move_x, move_y)
def getFeatureLine(context, block, shot, lastShotId, leave_out_class=None):
    """
    This function creates a featureLine. This is done by calculating getNumbers() for all
    Feature-Classes in Features.py and appending the desired class. A featureLine consists
    of several Numbers and a String at the end for the class.
    """
    line = []
    featureClassList = Features.getAllFeatureClasses()
    context = Features.createBeatList(context, block)
    # use all features except leave_out_class if given
    #for featureClass in [x for x in featureClassList if x != leave_out_class]:
    # use only features which contribute significant information
    for featureClass in [featureClassList[x] for x in range(len(featureClassList))
                         if x in [9, 10, 11, 12, 13, 15, 19, 25, 27, 28, 30, 31, 32, 33, 37, 39, 41, 42]]:
        feature = featureClass(context, block)
        line += feature.getNumbers()
    if shot:
        line.append(SHOT_NAMES[block[0].shot])
    else:
        # is there a cut?
        line.append(str(lastShotId != block[0].shotId))
    return line
def get_position_eval(self):
    #assert self.model.Nfeat == 21
    #board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures(self.board, self.board.color_to_play).astype(np.float32)
    #Normalization.apply_featurewise_normalization_C(board_feature_planes)
    assert self.model.Nfeat == 22
    board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures_komi(
        self.board, self.board.color_to_play, self.komi).astype(np.float32)
    Normalization.apply_featurewise_normalization_D(board_feature_planes)
    feature_batch = Symmetry.make_symmetry_batch(board_feature_planes)
    feed_dict = {self.feature_planes: feature_batch}
    probs_batch = self.sess.run(self.probs_op, feed_dict)
    prob = average_probs_over_symmetries(probs_batch)
    if self.board.color_to_play == Color.White:
        prob *= -1
    return prob