def __CVD( data, exp ):
    kn = Evaluation.Evaluator()    
    X,Y,y_raw = Features.getSamples( kn, data )
    data.maxWords = 10000
    kf = StratifiedKFold( n_splits=10, shuffle=True )
    k = 0 

    for train, test in kf.split( X, y_raw ):
        print( "K-Fold: " + str( k + 1 ) );
        x_train_raw, x_test_raw = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        x_train, x_test = Features.getCVvectors( x_train_raw, x_test_raw, data )
        denseSizes = [512,1024]
        batches = [64,128]
        dropouts = [2,3]
        param_grid = dict( batch_size=batches, denseSize=denseSizes, 
            dropout=dropouts, input_length=[len(x_train[0])],
            output_length=[len(y_train[0])] )
        model = KerasClassifier(build_fn=Models.create_ann_model, epochs=30, verbose=2)

        y_ints = [y.argmax() for y in y_train]
        cweights = class_weight.compute_class_weight( 'balanced', np.unique( y_ints ), y_ints )

        grid = GridSearchCV(estimator=model, param_grid=param_grid)
        grid_result = grid.fit(x_train, y_train, class_weight=cweights)
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        model, scores = kn.evaluateModel( x_test, y_test, grid.best_estimator_.model, data, k )
        k = k + 1 

    kn.saveResults( exp )
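
The Models.create_ann_model builder referenced above is not part of this listing. A minimal sketch of a Keras builder that accepts the same parameter names as the grid (denseSize, dropout, input_length, output_length) might look like the following; treating dropout as a count of Dense+Dropout blocks and the 0.5 rate are assumptions, not the project's actual settings. The batch_size entry in the grid is consumed by KerasClassifier.fit rather than by the build function.

# Hypothetical sketch only -- not the project's Models.create_ann_model.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def create_ann_model(denseSize=512, dropout=2, input_length=1000, output_length=2):
    model = Sequential()
    model.add(Dense(denseSize, activation='relu', input_shape=(input_length,)))
    for _ in range(dropout):          # assumed: `dropout` counts Dense+Dropout blocks
        model.add(Dropout(0.5))
        model.add(Dense(denseSize, activation='relu'))
    model.add(Dense(output_length, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model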
def feat5(train, test):
    train_valence, test_valence = feat1(train, test)
    puncter, train_punct = Features.punctuation(train)
    _, test_punct = Features.punctuation(test, vectorizer = puncter)
    train_matrix = Features.append_features([train_valence, train_punct])
    test_matrix = Features.append_features([test_valence, test_punct])
    return train_matrix, test_matrix
def __EmbeddedRNN( data, exp, filepath, network ):
    kn = Evaluation.Evaluator()    
    X,Y,y_raw = Features.getSamples( kn, data )
    data.maxWords = 10000
    kf = StratifiedKFold( n_splits=10, shuffle=True )
    k = 0 

    for train, test in kf.split( X, y_raw ):
        print( "K-Fold: " + str( k + 1 ) );
        x_train_raw, x_test_raw = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        Models.embedding, x_train, x_test = Features.getEmbedded( x_train_raw, x_test_raw, 
            y_train, y_test, y_raw, filepath, kn )
        batches = [64,128]
        neurons = [100,200]
        dropouts = [2,3]
        param_grid = dict( batch_size=batches, neuron=neurons, dropout=dropouts, output_size=[len(y_train[0])] )
        model = None
        if network == 'lstm':
            model = KerasClassifier(build_fn=Models.create_lstm_model, epochs=30, verbose=2)
        else:
            model = KerasClassifier(build_fn=Models.create_gru_model, epochs=30, verbose=2)

        y_ints = [y.argmax() for y in y_train]
        cweights = class_weight.compute_class_weight( 'balanced', np.unique( y_ints ), y_ints )

        grid = GridSearchCV(estimator=model, param_grid=param_grid)
        grid_result = grid.fit(x_train, y_train, class_weight=cweights)
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        model, scores = kn.evaluateModel( x_test, y_test, grid.best_estimator_.model, data, k )
        k = k + 1 

    kn.saveResults( exp )
def feat2(train, test):
    state_info, train_matrix = Features.tfIdfSkLearn(train,
                                                     stop_words="english")
    _, test_matrix = Features.tfIdfSkLearn(test,
                                           vectorizer=state_info,
                                           stop_words='english')
    return train_matrix, test_matrix
Example #5
    def fromContracts(cls, names, lookback, interval):
        adjustedContracts = AdjustedContracts.initialize(names)

        DATA = []
        TARGETS = []

        for adjustedContract in adjustedContracts:
            dataPoints, targets = Features.concatenateDataPoints(
                Features.computeFeatures(adjustedContract), lookback)
            DATA.append(dataPoints)
            TARGETS.append(targets)
            print(adjustedContract.name)

        Export.exportTXT(
            DATA,
            "C:/Users/dream/Desktop/4th Year/Final Project/DATA BACKUP/MLDataSet/nonsplitDataPoints_"
            + str(lookback) + ".txt")

        Export.exportTXT(
            TARGETS,
            "C:/Users/dream/Desktop/4th Year/Final Project/DATA BACKUP/MLDataSet/nonsplitTargets_"
            + str(lookback) + ".txt")

        return cls(DATA, TARGETS, interval)
Example #6
def feat5(train, test):
    train_valence, test_valence = feat1(train, test)
    puncter, train_punct = Features.punctuation(train)
    _, test_punct = Features.punctuation(test, vectorizer=puncter)
    train_matrix = Features.append_features([train_valence, train_punct])
    test_matrix = Features.append_features([test_valence, test_punct])
    return train_matrix, test_matrix
Example #7
def Process1(df):
    pri_id = "企业名称"
    res = pd.DataFrame()
    res[pri_id] = df[pri_id].unique()
    # Convert currencies to a common unit
    df = prep.Convert_money(df)
    # Extract registered-capital features (max, min, mean, variance)
    res = pd.merge(res,fea.GetValAvg(df,pri_id,"注册资金(元)"),on=pri_id)
    res = pd.merge(res,fea.GetValMaxMin(df,pri_id,"注册资金(元)"),on=pri_id)
    res = pd.merge(res,fea.GetValVar(df,pri_id,"注册资金(元)"),on=pri_id)

    # Extract categorical features
    num_fea = ['注册资金(元)',"出资比例"]
    cat_fea = [col for col in df.columns if col != pri_id and col not in num_fea]
    for col in cat_fea:
        res = pd.merge(res,fea.GetCategroicalCount(df,pri_id,col),on=pri_id)

    # Count rows where the legal-representative and chief-representative flags are empty
    res = pd.merge(res,fea.GetValNaCount(df,pri_id,"法定代表人标志","姓名"),on=pri_id)
    res = pd.merge(res,fea.GetValNaCount(df,pri_id,"首席代表标志","姓名"),on=pri_id)

    # Count the number of people holding each job title
    res = pd.merge(res,fea.CatRowsToCols(df,pri_id,"职务","姓名"))

    # Extract capital-contribution-ratio features (max, min, mean, variance)
    res = pd.merge(res,fea.GetValAvg(df,pri_id,"出资比例"),on=pri_id)
    res = pd.merge(res,fea.GetValMaxMin(df,pri_id,"出资比例"),on=pri_id)
    res = pd.merge(res,fea.GetValVar(df,pri_id,"出资比例"),on=pri_id)

    return res
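
The fea.* helpers used in Process1 are not shown in this listing. Assuming GetValAvg simply aggregates a numeric column per enterprise, a hypothetical sketch of such a helper could be:

# Hypothetical sketch in the style of fea.GetValAvg (not the project's implementation):
# group by the primary id and return the per-group mean, suitable for merging back on pri_id.
import pandas as pd

def GetValAvg(df, pri_id, col):
    out = df.groupby(pri_id)[col].mean().reset_index()
    out.columns = [pri_id, col + "_avg"]
    return out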
Example #8
def _F_LabelCount():
    if os.path.exists(data_path + "data/_F_d_label_count.feather"):
        df = feather.read_dataframe(data_path +
                                    "data/_F_d_label_count.feather")
        return df

    temp = pd.DataFrame()
    temp[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))

    # trans_type1
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'trans_type1',
                                        'ced62357ad496957', 0.8),
                      on=pri_id,
                      how='left')

    # trans_type2
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'trans_type2', 104,
                                        0.8),
                      on=pri_id,
                      how='left')

    # amt_src1
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'amt_src1',
                                        'c4ec9622cf5c6e55', 0.8),
                      on=pri_id,
                      how='left')

    # amt_src2
    temp = temp.merge(_F.DangerousLabel(trans_info, pri_id, 'amt_src2',
                                        'c4ec9622cf5c6e55', 0.8),
                      on=pri_id,
                      how='left')
    feather.write_dataframe(temp, data_path + "data/_F_d_label_count.feather")

    return temp
def user_features():
    """
    Controller which handles features like similar users, sentiment analysis, associated tags
    :return: corresponding page according to user's selection of feature
    """
    cnx = Connection.connectToDatabase(config.db_config)
    if request.method == 'POST':
        if 'SU' in request.form:
            users_list = Features.similarUsers(cnx, session.get("user", None),
                                               session.get("domain", None))
            return render_template('similarusers.html', users_list=users_list)
        elif 'AT' in request.form:
            tags_list = Features.tagsAssoPerson(cnx, session.get("user", None),
                                                session.get("domain", None))
            return render_template('associatedtags.html', tags_list=tags_list)
        elif 'PS' in request.form:
            mentions, sentiment = Features.peopleSaying(
                cnx, session.get("user", None))
            influence = Features.whatsMyInfluence(cnx,
                                                  session.get("user", None))
            viral = Features.viralUserTweets(cnx, session.get("user", None))
            return render_template("sentiment.html",
                                   mentions=mentions,
                                   sentiment=sentiment,
                                   influence=influence,
                                   viral=viral)
Example #10
def feat6_generic(train, test, train_pos, test_pos):
    train_f5, test_f5 = feat5(train, test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"], tf_idf = True)
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"], vectorizer = cter, tf_idf= True)
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
Example #11
def WStack():
    import Wesleyan
    data = Wesleyan.Wesleyan()
    kn = Evaluation.Evaluator()    
    exp = "WStack"
    filepath = 'enwiki_20180420_300d.txt'

    X,Y,y_raw = Features.getSamples( kn, data )
    data.maxWords = 10000
    kf = StratifiedKFold( n_splits=10, shuffle=True )
    k = 0 
    for train, test in kf.split( X, y_raw ):
        print( "K-Fold: " + str( k + 1 ) );
        x_train_raw, x_test_raw = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        y_ints = [y.argmax() for y in y_train]
        cweights = class_weight.compute_class_weight( 'balanced', np.unique( y_ints ), y_ints )
        
        Models.embedding, x_train, x_test = Features.getEmbedded( x_train_raw, x_test_raw, 
            y_train, y_test, y_raw, filepath, kn )

        def do_cnn():
            # Train the CNN and return its train/test predictions; assigning to
            # outer-scope variables from inside the nested function would not propagate.
            model1 = Models.create_cnn_model( pool_size=3, layer_size=128, output_size=len(y_train[0]) )
            model1.fit(x_train, y_train, epochs=15, verbose=2, batch_size=32, class_weight=cweights)
            return model1.predict( x_train, verbose=0 ), model1.predict( x_test, verbose=0 )

        y_pred_train, y_pred_test = do_cnn()
 
        model2 = Models.create_ann_model( dropout=3, denseSize=512, output_length=len(y_train[0]) )
        model2.fit(x_train, y_train, batch_size=64, verbose=2, epochs=30, class_weight=cweights)
        
        y_pred_train2 = model2.predict( x_train, verbose=0 )
        y_pred_test2 = model2.predict( x_test, verbose=0 )
        model2 = None
        
        x_train, x_test = Features.getCVvectors( x_train_raw, x_test_raw, data )
        model3 = Models.create_ann_model( denseSize=1024, dropout=3,
            input_length=len(x_train[0]), output_length=len(y_train[0]) )
        model3.fit(x_train, y_train, epochs=100, batch_size=64, class_weight=cweights)

        y_pred_train3 = model3.predict( x_train, verbose=0 )
        y_pred_test3 = model3.predict( x_test, verbose=0 )

        new_x_train = np.stack( (y_pred_train, y_pred_train2, y_pred_train3), axis=-1)
        new_x_test = np.stack( (y_pred_test, y_pred_test2, y_pred_test3), axis=-1)
        
        model = Models.create_stack_model( input_size=len(new_x_train[0]), output_size=len(y_train[0]) )
        history = model.fit(new_x_train, y_train, epochs=100, verbose=2, batch_size=128, class_weight=cweights )

        model, scores = kn.evaluateModel( new_x_test, y_test, model, data, k )
        k = k + 1 
    
    kn.saveResults( exp )
Example #12
    def display_data(self):

        logging.info('DISPLAYING TEXELS')

        Features.show_texel_list(self.texel_features)
        self.mytimer.tick()

        logging.info('DISPLAYING DONE')
def feat7(train, test):
    normal_train, train_pos = map(list, zip(*train))
    normal_test, test_pos = map(list, zip(*test))
    train_f5, test_f5 = feat5(normal_train, normal_test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"], tf_idf = True, ngram_range = (1, 2), stop_words = 'english')
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"], vectorizer = cter, tf_idf= True, ngram_range = (1, 2), stop_words = 'english')
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
def feat4(train, test):
    # feature set 3
    train_f3, test_f3 = feat3(train, test)
    # punctuation
    puncter, train_punct = Features.punctuation(train)
    _, test_punct = Features.punctuation(test, vectorizer = puncter)
    train_matrix = Features.append_features([train_f3, train_punct])
    test_matrix = Features.append_features([test_f3, test_punct])
    return train_matrix, test_matrix
def feat7(train, test):
    # feature set 5
    train_f5, test_f5 = feat5(train, test)
    # bag of words
    puncter, train_punct = Features.bagOfWordsSkLearn(train)
    _, test_punct = Features.bagOfWordsSkLearn(test, vectorizer = puncter)
    train_matrix = Features.append_features([train_f5, train_punct])
    test_matrix = Features.append_features([test_f5, test_punct])
    return train_matrix, test_matrix
def feat3(train, test):
    # valence info
    train_valence, test_valence = feat1(train, test)
    # tf idf info
    train_cts, test_cts = feat2(train, test)
    # combined info
    train_matrix = Features.append_features([train_valence, train_cts])
    test_matrix = Features.append_features([test_valence, test_cts])
    return train_matrix, test_matrix
def feat6(train, test):
    normal_train, train_pos = map(list, zip(*train))
    normal_test, test_pos = map(list, zip(*test))
    train_f5, test_f5 = feat5(normal_train, normal_test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"], tf_idf = True)
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"], vectorizer = cter, tf_idf= True)
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
Example #19
def feat3(train, test):
    # valence info
    train_valence, test_valence = feat1(train, test)
    # tf idf info
    train_cts, test_cts = feat2(train, test)
    # combined info
    train_matrix = Features.append_features([train_valence, train_cts])
    test_matrix = Features.append_features([test_valence, test_cts])
    return train_matrix, test_matrix
Example #20
def feat4(train, test):
    # feature set 3
    train_f3, test_f3 = feat3(train, test)
    # punctuation
    puncter, train_punct = Features.punctuation(train)
    _, test_punct = Features.punctuation(test, vectorizer=puncter)
    train_matrix = Features.append_features([train_f3, train_punct])
    test_matrix = Features.append_features([test_f3, test_punct])
    return train_matrix, test_matrix
def extra_features(train, test):
    # uni and bigrams
    state_info, train_ngrams = Features.wordCountsSkLearn(train, ngram_range = (1, 2), stop_words = 'english')
    _, test_ngrams = Features.wordCountsSkLearn(test, vectorizer = state_info, ngram_range = (1, 2), stop_words = 'english')
    # valence and punctuation
    train_valence_punct, test_valence_punct = feat5(train, test)
    # train matrix
    train_matrix = Features.append_features([train_ngrams, train_valence_punct])
    test_matrix = Features.append_features([test_ngrams, test_valence_punct])
    return train_matrix, test_matrix
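
None of the feat* builders above is tied to a particular model. As a hedged illustration of how they might be consumed, the sketch below fits a scikit-learn classifier on the matrices returned by feat3; train_docs, test_docs, train_y and test_y are assumed inputs prepared elsewhere.

# Illustrative usage only; assumes raw texts and labels are already available.
from sklearn.linear_model import LogisticRegression

def run_feat3(train_docs, test_docs, train_y, test_y):
    train_X, test_X = feat3(train_docs, test_docs)   # valence + tf-idf features
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_X, train_y)
    return clf.score(test_X, test_y)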
Example #22
    def __init__(self):
        self.feature_vec = [features.CrossTermX1X3(), features.SinX2(),
                            features.SquareX4(), features.Identity()]
        self.feature_weights = [0.1, -2, -0.3, 3]
        self.noise_model = noise.NoiseModel()
        self.max_x1 = 10
        self.max_x2 = 10
        self.max_x3 = 10
        self.max_x4 = 10
        self.saver = saver.DataSaver('data', 'submission_data.pkl')
Example #23
def count_labels(outpath):
    tw_cts = Counter(Features.getY(tw))
    blog_cts = Counter(Features.getY(blog))
    cts = zip(["twitter+wiki", "blog"], [tw_cts, blog_cts])
    # Write out to csv
    with open(outpath, 'w') as labels_histo_file:
        for src, counter in cts:
            for k, v in counter.iteritems():
                labels_histo_file.write("%s,%s,%d\n" % (src, k, v))
    return 0
Example #24
def feat6_generic(train, test, train_pos, test_pos):
    train_f5, test_f5 = feat5(train, test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"],
                                            tf_idf=True)
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"],
                                        vectorizer=cter,
                                        tf_idf=True)
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
Example #25
def feat1(train, test):
    vectorizer, train_matrix = Features.valenceByFrequency(
        train,
        vectorizer=None,
        cache_valence=cache_valence,
        stop_words='english')
    _, test_matrix = Features.valenceByFrequency(test,
                                                 vectorizer=vectorizer,
                                                 cache_valence=cache_valence,
                                                 stop_words='english')
    return train_matrix, test_matrix
Example #26
def getSingleFeatureLineFromFile(file, decisions, shot, leave_out_class=None):
    """
    This is a less troublesome but slow method to get a featureLine.
    """
    beatList, context = getContextAndBeatListFromFile(file)
    blockList = coalesceBeats(beatList)
    Features.initializeContextVars(context)
    lastShotId, context, blockList = applyDecisionsToBeatscript(context, blockList,
        decisions)
    featureLine = getFeatureLine(context, blockList[len(decisions)], shot, lastShotId,
        leave_out_class)
    return featureLine
Example #27
def feat6(train, test):
    normal_train, train_pos = map(list, zip(*train))
    normal_test, test_pos = map(list, zip(*test))
    train_f5, test_f5 = feat5(normal_train, normal_test)
    cter, train_cts = Features.keyPOSNGrams(train_pos, ["jj.*", "vb.*"],
                                            tf_idf=True)
    _, test_cts = Features.keyPOSNGrams(test_pos, ["jj.*", "vb.*"],
                                        vectorizer=cter,
                                        tf_idf=True)
    train_matrix = Features.append_features([train_f5, train_cts])
    test_matrix = Features.append_features([test_f5, test_cts])
    return train_matrix, test_matrix
Example #28
def createDataLine(context, block, leaveout=-1):
    dataLine = [str(block[0].shotId) + "_" + str(block[0].beatId), str(block[0].shot)]
    featureClassList = Features.getAllFeatureClasses()
    context = Features.createBeatList(context, block)
    for featureClass in featureClassList:
        feature = featureClass(context, block)
        dataLine += feature.getNumbers()
        # activate to generate a human readable featureLine
        #dataLine.append(feature.getText())
    if leaveout >= 0:
        dataLine.pop(leaveout)
    return dataLine
Example #29
def _F_nunique_ratio(n=3):

    if os.path.exists(data_path + "data/_F_nunique_ratio.feather"):
        df = feather.read_dataframe(data_path +
                                    "data/_F_nunique_ratio.feather")
        return df

    temp = pd.DataFrame()
    temp[pri_id] = pd.concat((_train[pri_id], _test[pri_id]))

    # Add day and month slices
    df = pd.concat((op_info[[pri_id, 'day']], trans_info[[pri_id, 'day']]))
    month = _F.Day2Month(df, 'day')
    temp = pd.merge(temp,
                    _F.TopN_col_distinct_ratio(month[[pri_id, 'month', 'day']],
                                               pri_id, 'month', 'day', n),
                    on=pri_id,
                    how='left')

    df = pd.concat((op_info[[pri_id, 'time',
                             'day']], trans_info[[pri_id, 'time', 'day']]))
    df['day_period'] = df['time'].apply(_F.TimeInterval)
    temp = pd.merge(
        temp,
        _F.TopN_col_distinct_ratio(df[[pri_id, 'day_period', 'day']], pri_id,
                                   'day_period', 'day', n))

    # topN ratio
    for col in [
            'geo_code', 'ip1', 'ip1_sub', 'mac1', 'merchant', 'ip2', 'ip2_sub',
            'mode', 'mac2', 'os', 'channel', 'trans_type1', 'trans_type2',
            'code1', 'code2', 'market_code', 'market_type', 'device_code1',
            'device_code2', 'device_code3', 'wifi'
    ]:
        if col in [
                'merchant', 'channel', 'trans_type1', 'trans_type2', 'code1',
                'code2', 'market_code', 'market_type'
        ]:
            df = trans_info[[pri_id, col, 'day']]
        elif col in ['ip2', 'ip2_sub', 'mode', 'mac2', 'os', 'wifi']:
            df = op_info[[pri_id, col, 'day']]
        else:
            df = pd.concat((op_info[[pri_id, col,
                                     'day']], trans_info[[pri_id, col,
                                                          'day']]))
        temp = pd.merge(temp,
                        _F.TopN_col_distinct_ratio(df, pri_id, col, 'day', n),
                        on=pri_id,
                        how='left')

    feather.write_dataframe(temp, data_path + "data/_F_nunique_ratio.feather")
    return temp
def flag_tweet():
    """
    Controller which handles tweet delete.
    :return: same page i.e refreshes the page
    """
    cnx = Connection.connectToDatabase(config.db_config)
    if request.method == 'POST':
        Features.remove_tweet(cnx, list(request.form.keys())[0])
        cnx2 = Connection.connectToDatabase(config.db_config)
        toptweets = Features.topTen(cnx2, session.get("domain", None))
        return render_template('tweetTest.html',
                               your_list=toptweets[0],
                               userlist=toptweets[1])
Example #31
def classify(data, weights, featureSet, algorithm):
    length = Features.getLength(featureSet)
    results = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        if algorithm == 1:
            vector = Features.getVector(data[i, 0], featureSet)
            vector.append(length)
            results[i] = predict_one(weights, vector, 0)
        else:
            vector = Features.getVector(data[i, 0], featureSet)
            results[i] = predict_one(weights, vector, length)

    return results
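
predict_one is called here and in the perceptron/winnow trainers below but is not included in this listing. Judging from those call sites (vector holds active feature indices, weights is a 1 x length array), a minimal sketch consistent with that usage might be:

# Hypothetical predict_one, reconstructed from its call sites; not the original helper.
def predict_one(weights, vector, threshold):
    total = sum(weights[0, index] for index in vector)   # sum the weights of active features
    return 1 if total > threshold else -1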
Example #32
def classify(data, weights, featureSet, algorithm):
    length = Features.getLength(featureSet)
    results = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        if algorithm == 1:
            vector = Features.getVector(data[i,0], featureSet)
            vector.append(length)
            results[i] = predict_one(weights, vector, 0)
        else:
            vector = Features.getVector(data[i,0], featureSet)
            results[i] = predict_one(weights, vector, length)
        
    return results    
def train(model, training, keys, pca_num=None):
    if model == "1nn":
        model = OneNN()
    elif model == "rf":
        model = makeRF()
    training = SymbolData.normalize(training, 99)
    f = Features.features(training)
    pca = None
    if (pca_num != None):
        pca = sklearn.decomposition.PCA(n_components=pca_num)
        pca.fit(f)
        f = pca.transform(f)
    model.fit(f, SymbolData.classNumbers(training, keys))
    return (model, pca)
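
train() returns the fitted model together with the optional PCA so the same projection can be reused at prediction time. A sketch of the matching evaluation step, under the assumption that test data goes through the same normalize/feature pipeline, might be:

# Assumed companion to train(); mirrors its normalization and optional PCA transform.
def evaluate(model, pca, testing, keys):
    testing = SymbolData.normalize(testing, 99)
    f = Features.features(testing)
    if pca is not None:
        f = pca.transform(f)
    return model.predict(f)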
Example #34
def createFeatureLines(context, beatList, shot, leave_out_class=None):
    """
    Returns the list of featureLines converted from the Beats in beatList
    """
    featureLines = []
    blockList = coalesceBeats(beatList)
    Features.initializeContextVars(context)
    lastShotId = -1
    for block in blockList:
        featureLines.append(
            getFeatureLine(context, block, shot, lastShotId, leave_out_class))
        context["BygoneBlocks"].append(block)
        lastShotId = block[-1].shotId
    return featureLines
Example #35
def getFeatureNames(leave_out_class=None):
    """
    Returns an array of feature names corresponding to the featureLine.
    """
    names = []
    featureClassList = Features.getAllFeatureClasses()
    context = createContext()
    dummy_beat = Beat("0_1\tfull_shot\tfalse\tintroduce\tperson§Nobody", context)
    context = Features.createBeatList(context, [dummy_beat])
    Features.initializeContextVars(context)
    for featureClass in [x for x in featureClassList if x != leave_out_class]:
        feature = featureClass(context, [dummy_beat])
        names += feature.getNames()
    return names
Example #36
def getSingleFeatureLine(context, blockList, decisions, shot, leave_out_class=None):
    """
    Returns a featureLine based on the context and the decisions.
    """
    Features.initializeContextVars(context)
    lastShotId, context, blockList = applyDecisionsToBeatscript(context, blockList,
        decisions)
    # This prevents the classifier from cheating by using the correct class.
    # Activate if you're suspicious.
    #for beat in blockList[len(decisions)]:
    #    beat.shot = 0
    featureLine = getFeatureLine(context, blockList[len(decisions)], shot, lastShotId,
        leave_out_class)
    return featureLine
def home_page():
    """
    Controller for accessing different domains and creating tweet and user objects
    :return: homepage template
    """
    cnx = Connection.connectToDatabase(config.db_config)
    if request.method == 'POST':
        domains = {'ML': 1, 'DB': 2, 'SE': 3, 'PR': 4, 'CC': 5}
        for key, domain in domains.items():
            if key in request.form:
                Tweets.search_tweets(cnx, domain, Connection.defineDomain(domain)[1], 5)
                session["domain"] = domain
                cnx2 = Connection.connectToDatabase(config.db_config)
                toptweets = Features.topTen(cnx2, domain)
                return render_template('tweetTest.html',
                                       your_list=toptweets[0],
                                       userlist=toptweets[1])
    elif request.method == 'GET':
        return render_template('home.html')
def extra_features(train, test):
    # uni and bigrams
    state_info, train_ngrams = Features.wordCountsSkLearn(train,
                                                          ngram_range=(1, 2),
                                                          stop_words='english')
    _, test_ngrams = Features.wordCountsSkLearn(test,
                                                vectorizer=state_info,
                                                ngram_range=(1, 2),
                                                stop_words='english')
    # valence and punctuation
    train_valence_punct, test_valence_punct = feat5(train, test)
    # train matrix
    train_matrix = Features.append_features(
        [train_ngrams, train_valence_punct])
    test_matrix = Features.append_features([test_ngrams, test_valence_punct])
    return train_matrix, test_matrix
    def apply_moving_entropy(self,
                             input_column,
                             dest_column=None,
                             row_range=(0, None),
                             window=10,
                             no_of_bins=5):
        '''
        Apply moving entropy as another column

        :param input_column: Required column to add feature engineering
        :param dest_column: Destination column name
        :param row_range: Range of rows that need to modify
        :param window: Window size over which the calculation is performed
        :param no_of_bins: Number of discrete levels
        :return: None
        '''

        if dest_column == None:
            dest_column = input_column + '_mentr_' + str(window) + '_' + str(
                no_of_bins)

        full_series = list(self._pd_frame[input_column])
        filtered_series = full_series[row_range[0]:row_range[1]]
        result = Features.moving_entropy(series=filtered_series,
                                         window=window,
                                         no_of_bins=no_of_bins,
                                         default=True)
        full_series[row_range[0]:row_range[1]] = result
        self.add_column(column_name=dest_column, series=full_series)
    def apply_moving_median(self,
                            input_column,
                            dest_column=None,
                            row_range=(0, None),
                            window=5):
        '''
        Add moving median as another column

        :param input_column: Required column to add feature engineering
        :param row_range: Range of rows that need to modify
        :param window: Window size over which the calculation is performed
        :param dest_column: Destination column name
        :return: None
        '''

        if dest_column == None:
            dest_column = input_column + '_mm_' + str(window)

        full_series = list(self._pd_frame[input_column])
        filtered_series = full_series[row_range[0]:row_range[1]]
        result = Features.moving_median(series=filtered_series,
                                        window=window,
                                        default=True)
        full_series[row_range[0]:row_range[1]] = result
        self.add_column(column_name=dest_column, series=full_series)
    def apply_moving_weighted_average(self,
                                      input_column,
                                      dest_column=None,
                                      row_range=(0, None),
                                      window=5,
                                      weights=[1, 2, 3, 4, 5]):
        '''
        Apply moving weighted average as another column

        :param input_column: Required column to add feature engineering
        :param dest_column: Destination column name
        :param row_range: Range of rows that need to modify
        :param window: Window size over which the calculation is performed
        :param weights: list of integers
        :return: None
        '''
        if dest_column == None:
            dest_column = input_column + '_mwa_' + str(window)

        full_series = list(self._pd_frame[input_column])
        filtered_series = full_series[row_range[0]:row_range[1]]
        result = Features.moving_weighted_average(series=filtered_series,
                                                  window=window,
                                                  weights=weights,
                                                  default=True)
        full_series[row_range[0]:row_range[1]] = result
        self.add_column(column_name=dest_column, series=full_series)
    def apply_moving_median_centered_average(self,
                                             input_column,
                                             dest_column=None,
                                             row_range=(0, None),
                                             window=5,
                                             boundary=1):
        '''
        Apply moving median centered average as another column

        :param input_column: Required column to add feature engineering
        :param dest_column: Destination column name
        :param row_range: Range of rows that need to modify
        :param window: Window size over which the calculation is performed
        :param boundary: number of values that need to be removed from both ends of the sorted window
        :return: None
        '''
        if dest_column == None:
            dest_column = input_column + '_mmca_' + str(window)

        full_series = list(self._pd_frame[input_column])
        filtered_series = full_series[row_range[0]:row_range[1]]
        result = Features.moving_median_centered_average(
            series=filtered_series,
            window=window,
            boundary=boundary,
            default=True)
        full_series[row_range[0]:row_range[1]] = result
        self.add_column(column_name=dest_column, series=full_series)
    def apply_moving_k_closest_average(self,
                                       input_column,
                                       dest_column=None,
                                       row_range=(0, None),
                                       window=5,
                                       kclosest=3):
        '''
        Apply moving k closest average as another column

        :param input_column: Required column to add feature engineering
        :param dest_column: Destination column name
        :param row_range: Range of rows that need to modify
        :param window: Window size over which the calculation is performed
        :param kclosest: Number of values in the window closest to the most recent value, including itself
        :return: None
        '''
        if dest_column == None:
            dest_column = input_column + '_kca_' + str(window)

        full_series = list(self._pd_frame[input_column])
        filtered_series = full_series[row_range[0]:row_range[1]]
        result = Features.moving_k_closest_average(series=filtered_series,
                                                   window=window,
                                                   kclosest=kclosest,
                                                   default=True)
        full_series[row_range[0]:row_range[1]] = result
        self.add_column(column_name=dest_column, series=full_series)
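
The Features.moving_* functions these wrappers call are not shown in this listing. As a hedged sketch of the expected shape of one of them, the following moving median keeps the same (series, window, default) signature; interpreting default=True as "leave the first window-1 positions at their original values" is an assumption.

# Hypothetical sketch of a moving median in the spirit of Features.moving_median.
import statistics

def moving_median(series, window=5, default=True):
    result = list(series) if default else [None] * len(series)
    for i in range(window - 1, len(series)):
        result[i] = statistics.median(series[i - window + 1:i + 1])
    return result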
Example #44
    def TagSentence(self, words, pos):
        if self.nTagged % 500 == 0:
            self.tagger.stdin.close()
            self.tagger.stdout.close()
            #self.tagger.kill()
            os.kill(self.tagger.pid, SIGTERM)       #Need to do this for python 2.4
            self.tagger.wait()
            self.GetTagger()

        features = []
        seq_features = []
        quotes = Features.GetQuotes(words)
        for i in range(len(words)):
            features = self.fe.Extract(words, pos, None, i, False) + [u'DOMAIN=Twitter']
            if quotes[i]:
                features.append(u"QUOTED")
            seq_features.append(" ".join(features))

        #print ("\t".join(seq_features) + "\n").encode('utf8')
        self.tagger.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

        event_tags = []
        for i in range(len(words)):
            event_tags.append(self.tagger.stdout.readline().rstrip('\n').strip(' '))
        self.nTagged += 1
        return event_tags
Example #45
def getMatchingExpression(testExpr, renormalize=True):
    matchExprns = []
    if renormalize:
        testExpr = SymbolData.normalizeExprs([testExpr], 299)
    testExpr = testExpr[0]
    for trainData in trainDatas:
        if (trainData[2] == len(testExpr.symbols)):
            matchExprns.append(trainData)

    #for trainData in trainDatas:
    scoreSCC = NP.zeros((len(matchExprns)))
    #    scoreMI = NP.zeros((len(matchExprns)))
    k = 0
    testImg = Features.getImgExpr(testExpr)
    for exprList in matchExprns:
        scoreSCC[k] = scc(testImg, exprList[1])
        #    scoreMI[k] = MI(testData[1],exprList[1])
        k += 1
    indSCC = NP.argsort(scoreSCC).astype(int)
    scoreSCC = scoreSCC[indSCC]
    scoreSCC = scoreSCC / scoreSCC[-1]

    #    indMI = NP.argsort(scoreMI).astype(int)
    #    scoreMI = scoreMI[indMI]
    #    scoreMI = scoreMI/scoreMI[-1]

    #    matchExprSortSCC = []
    #    for i in indSCC:
    #        matchExprSortSCC.append(matchExprns[i])
    #
    #    return(matchExprSortSCC[-1])
    return (matchExprns[indSCC[-1]])
Example #46
def perceptron(data, maxIterations, featureSet):
    length = Features.getLength(featureSet)
    rate = 0.1
    weights = np.zeros((1,length+1))
    i = 0
    while i < maxIterations:
        for j in range(data.shape[0]):
            #print weights
            vector = Features.getVector(data[j,0], featureSet)
            vector.append(length)
            sign = predict_one(weights, vector, 0)
            if (data[j,1] == '+' and sign == -1) or (data[j,1] == '-' and sign == 1):
                for index in vector:
                    weights[0,index] = weights[0,index] - rate*sign
        i += 1
    return weights 
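
A hedged sketch of how perceptron() and the classify() helper above might be wired together, following the pattern of the main() driver later in this listing; readFile and the data layout (text in column 0, '+'/'-' labels in column 1) are assumptions borrowed from that driver.

# Illustrative wiring only; readFile and the CSV layout are assumed.
def run_perceptron(featureSet=1, maxIterations=10):
    trainData = readFile('train.csv')
    testData = readFile('test.csv')
    Features.extractFeatures(trainData[:, 0], featureSet)
    weights = perceptron(trainData, maxIterations, featureSet)
    return classify(testData, weights, featureSet, algorithm=1)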
Example #47
    def suggest_moves(self, board):
        board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures(
            board, board.color_to_play).astype(np.float32)
        Normalization.apply_featurewise_normalization_C(board_feature_planes)
        feed_dict = {
            self.feature_planes:
            board_feature_planes.reshape(1, self.model.N, self.model.N,
                                         self.model.Nfeat)
        }
        move_logits = self.sess.run(self.logits,
                                    feed_dict).ravel()  # ravel flattens to 1D
        # zero out illegal moves
        for x in xrange(self.model.N):
            for y in xrange(self.model.N):
                ind = self.model.N * x + y
                if not board.play_is_legal(x, y, board.color_to_play):
                    move_logits[ind] = -1e99
        move_probs = softmax(move_logits, self.softmax_temp)
        sum_probs = np.sum(move_probs)
        if sum_probs == 0: return []  # no legal moves
        move_probs /= sum_probs  # re-normalize probabilities

        good_moves = []
        cum_prob = 0.0
        while cum_prob < self.threshold_prob:
            ind = np.argmax(move_probs)
            x, y = ind / self.model.N, ind % self.model.N
            good_moves.append((x, y))
            prob = move_probs[ind]
            cum_prob += prob
            move_probs[ind] = 0

        return good_moves
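
The softmax helper called above with a temperature argument is not part of this listing. A standard temperature softmax consistent with that call is sketched below; the max-subtraction is only for numerical stability.

# Standard temperature softmax; assumed to match the softmax(move_logits, temp) call above.
import numpy as np

def softmax(logits, temperature=1.0):
    z = np.asarray(logits, dtype=np.float64) / temperature
    z -= z.max()                # stabilise before exponentiating
    exp_z = np.exp(z)
    return exp_z / exp_z.sum()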
def error_analyze(make_model, train_data, test_data, featurizer):
    matrices = Features.make_experiment_matrices(train_data, test_data, featurizer)
    model = make_model()
    model.fit(matrices['train_X'], matrices['train_Y'])
    bins = [v / 100.0 for v in range(50, 110, 5)]
    ext_preds = Models.extended_predict(model, matrices['test_X'], matrices['test_Y'])
    return Models.error_analysis(ext_preds, bins = bins)
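
A possible invocation, assuming Features.make_experiment_matrices applies the featurizer to the raw train/test data the way the feat* builders above do; the choice of MultinomialNB is purely illustrative.

# Illustrative call; the featurizer contract is assumed, not documented here.
from sklearn.naive_bayes import MultinomialNB

def run_error_analysis(train_data, test_data):
    return error_analyze(lambda: MultinomialNB(), train_data, test_data, feat2)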
Example #49
def main():
    beatscriptFile = open(sys.argv[1], "r")
    lines = beatscriptFile.readlines()
    context = readContext(lines)
    beatList = readBeatscript(lines, context)
    blockList = coalesceBeats(beatList)
    Features.initializeContextVars(context)
    dataLines = []
    for block in blockList:
        dataLines.append(createDataLine(context, block))
        context["BygoneBlocks"].append(block)

    outputFile = open(sys.argv[2], "w")
    for dataLine in dataLines:
        outputFile.write(DELIMITER.join([str(x) for x in dataLine]) + "\n")
        #outputFile.write(DELIMITER.join(dataLine) + "\n")
    outputFile.close()
Example #50
    def __init__(self, universe=None, temperature=300):
        if universe is None:
            return
        Features.checkFeatures(self, universe)
        self.universe = universe
        self.temperature = temperature
        self.sqrt_mass = Numeric.sqrt(self.universe.masses().array)
        self.sqrt_mass = self.sqrt_mass[:, Numeric.NewAxis]

        self._forceConstantMatrix()
        ev = self._diagonalize()

        self.imaginary = Numeric.less(ev, 0.)
        self.frequencies = Numeric.sqrt(Numeric.fabs(ev)) / (2.*Units.pi)
        self.sort_index = Numeric.argsort(self.frequencies)

        self._scale(temperature)
        del self.sqrt_mass
Example #51
def winnow(data, maxIterations, featureSet):
    length = Features.getLength(featureSet)
    threshold = length
    weights = np.ones((1,length))
    i = 0
    while i < maxIterations:
        for j in range(data.shape[0]):
            #print weights
            vector = Features.getVector(data[j,0], featureSet)
            sign = predict_one(weights, vector, threshold)
            if data[j,1] == '+' and sign == -1:
                for index in vector:
                    weights[0,index] = weights[0,index]*2
            elif data[j,1] == '-' and sign == 1:
                for index in vector:
                    weights[0,index] = weights[0,index]/2
        i += 1
    return weights
Example #52
def onlineFeatureLineCreator(filename, use_classified_shot = False, use_history = True):
    # load context and complete beatlist from file
    context, beatList = getContextAndBeatListFromFile(filename)
    Features.initializeContextVars(context)
    blockList = coalesceBeats(beatList)
    context["BygoneBlocks"] = []
    for block in blockList:
        shot_true = block[-1].shot
        # get current feature line and true shot class
        featureLine = getFeatureLine(context, block, True, -1)
        features = np.array(featureLine[:-1], dtype=np.float64)
        shot_classified = yield features, shot_true
        # update block and lastShotId
        if use_classified_shot:
            for beat in block:
                beat.shot = shot_classified
        if use_history:
            context["BygoneBlocks"].append(block)
    def timer_event(self):
        # get next window
        try:
            ## XXX MARIO
#            data = self.data.next()
            data = self.data.next_in_selection()
        # --> video finished
        except StopIteration:
            return

        y = data.points
        x = range(len(y))


        ## remove old line
        try:
            old_line = self.lines.popleft()

            ## XXX MARIO                        
            for l in old_line:
                self.ax.lines.remove(l)
        
        # --> no old lines, so we don't have to remove any
        except IndexError:
            pass
            
            
        ## * draw new line *
        line = self.ax.plot( x, y, self.line_type, c=data.color )
        self.lines.append(line)
        
        ## draw label
#        self.label = self.ax.text(0.02, 0.98, data.label,
#             horizontalalignment='left',
#             verticalalignment='top',
#             color=data.color,
#             transform = self.ax.transAxes)

        self.label.set_text(data.label)
        self.label.set_color(data.color)
        
        ## progress
        self.status_label.set_text(str(data.progress[0]) + " / " + str(data.progress[1])
             + ", " + data.label )

        
        print data.label, Features.calc_mean(y)
        plt.draw()
        
        
        ## XXX
        if ( data.label in ("up", "down")):
            time.sleep(2)
        
        ## set new timer
        self._set_timer(self.speed)
    def __call__(self, **options):
        self.setCallOptions(options)
        Features.checkFeatures(self, self.universe)
        configuration = self.universe.configuration()
        fixed = self.universe.getAtomBooleanArray('fixed')
        nt = self.getOption('threads')
        evaluator = self.universe.energyEvaluator(threads=nt).CEvaluator()
        args = (self.universe,
                configuration.array, fixed.array, evaluator,
                self.getOption('steps'), self.getOption('step_size'),
                self.getOption('convergence'), self.getActions(),
                'Conjugate gradient minimization with ' +
                self.optionString(['convergence', 'step_size', 'steps']))
        if self.getOption('background'):
            if not threading:
                raise OSError("background processing not available")
            return MinimizerThread(self.universe, conjugateGradient, args)
        else:
            apply(conjugateGradient, args)
Example #55
def count_unigrams(outpath):
    tw_cter, twitter_cts = Features.wordCountsSkLearn(Features.getX(tw), stop_words = 'english')
    blog_cter, blog_cts = Features.wordCountsSkLearn(Features.getX(blog), stop_words = 'english')

    # Total number of non-stop-word unigrams
    unigrams = set(tw_cter.vocabulary_.keys() + blog_cter.vocabulary_.keys())
    print "Data has %d distinct unigrams" % len(unigrams)

    # Distribution of unigram cts
    twitter_unigram_histo = histogram_cts(twitter_cts)
    blog_unigram_histo = histogram_cts(blog_cts)

    unigram_histo = histo_to_tuples(twitter_unigram_histo, 'twitter+wiki') + \
                    histo_to_tuples(blog_unigram_histo, 'blog')

    # Write out to csv
    with open(outpath, 'w') as unigram_histo_file:
        for elem in unigram_histo:
            unigram_histo_file.write("%s,%d,%f\n" % elem)
    return 0
Example #56
def main():
    arguments = validateInput(sys.argv)
    maxIterations, regularization, stepSize, lmbd, featureSet = arguments
    print maxIterations, regularization, stepSize, lmbd, featureSet

    trainData = readFile('train.csv')
    validationData = readFile('validation.csv')
    testData = readFile('test.csv')

    trainSize = trainData.shape[0]
    validationSize = validationData.shape[0]
    testSize = testData.shape[0]

    print "Number of training examples: " + str(trainSize)

    # Extract Features
    Features.extractFeatures(trainData[:,0], featureSet)
    print "Extracted Features:"
    if featureSet == 1 or featureSet == 3:
        print "Unigram: " + str(Features.getLength(1))
    if featureSet == 2 or featureSet == 3:
        print "Bigram: " + str(Features.getLength(2))

    # Construct Input Matrices X
    xTrain = Features.getMatrix(trainData[:,0], featureSet)
    print "Train Matrix built"

    xValidation = Features.getMatrix(validationData[:,0], featureSet)
    print "Validation Matrix built"

    xTest = Features.getMatrix(testData[:,0], featureSet)
    print "Test Matrix built"
    
    yTrain = extractLabel(trainData[:,1])
    yValidation = extractLabel(validationData[:,1])
    yTest = extractLabel(testData[:,1])

    # Train the model
    theta = GD(xTrain, yTrain, trainSize, maxIterations, regularization, stepSize, lmbd, featureSet)
    print "Final Theta: " + str(theta)

    # Classify
    trainResult = predict(xTrain, trainSize, theta, featureSet)
    print "Train Result: " + str(trainResult)
    validationResult = predict(xValidation, validationSize, theta, featureSet)
    print "Validation Result: " + str(validationResult)
    testResult = predict(xTest, testSize, theta, featureSet)
    print "Test Result: " + str(testResult)

    # Performance
    print "\nPerformance on training data:"
    performance(trainResult, trainData[:,1])
    print "\nPerformance on validation data:"
    performance(validationResult, validationData[:,1])
    print "\nPerformance on test data:"
    performance(testResult, testData[:,1])
Example #57
    def pick_model_move(self, color):
        if self.model.Nfeat == 15:
            board_feature_planes = Features.make_feature_planes_stones_3liberties_4history_ko(self.board, color)
            Normalization.apply_featurewise_normalization_B(board_feature_planes)
        elif self.model.Nfeat == 21:
            board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures(self.board, color).astype(np.float32)
            Normalization.apply_featurewise_normalization_C(board_feature_planes)
        else:
            assert False
        feature_batch = Symmetry.make_symmetry_batch(board_feature_planes)

        feed_dict = {self.feature_planes: feature_batch}

        logit_batch = self.sess.run(self.logits, feed_dict)
        move_logits = Symmetry.average_plane_over_symmetries(logit_batch, self.model.N)
        softmax_temp = 1.0
        move_probs = softmax(move_logits, softmax_temp)

        # zero out illegal moves
        for x in xrange(self.model.N):
            for y in xrange(self.model.N):
                ind = self.model.N * x + y 
                if not self.board.play_is_legal(x, y, color):
                    move_probs[ind] = 0
        sum_probs = np.sum(move_probs)
        if sum_probs == 0: return Move.Pass() # no legal moves, pass
        move_probs /= sum_probs # re-normalize probabilities

        pick_best = True
        if pick_best:
            move_ind = np.argmax(move_probs)
        else:
            move_ind = sample_from(move_probs)
        move_x = move_ind / self.model.N
        move_y = move_ind % self.model.N

        self.last_move_probs = move_probs.reshape((self.board.N, self.board.N))

        return Move(move_x, move_y)
Example #58
def getFeatureLine(context, block, shot, lastShotId, leave_out_class=None):
    """
    This function creates a featureLine. This is done by calculating getNumbers() for
    all Feature-Classes in Features.py and appending the desired class. A featureLine
    consists of several Numbers and a String at the end for the class.
    """
    line = []
    featureClassList = Features.getAllFeatureClasses()
    context = Features.createBeatList(context, block)
    # use all features except leave_out_class if given
    #for featureClass in [x for x in featureClassList if x != leave_out_class]:
    # use only features which contribute significant information
    for featureClass in [featureClassList[x] for x in range(len(featureClassList)) if
                         x in [9, 10, 11, 12, 13, 15, 19, 25, 27, 28, 30, 31, 32, 33, 37,
                               39, 41, 42]]:
        feature = featureClass(context, block)
        line += feature.getNumbers()
    if shot:
        line.append(SHOT_NAMES[block[0].shot])
    else:#is there a cut?
        line.append(str(lastShotId != block[0].shotId))
    return line
Example #59
    def get_position_eval(self):
        #assert self.model.Nfeat == 21
        #board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures(self.board, self.board.color_to_play).astype(np.float32)
        #Normalization.apply_featurewise_normalization_C(board_feature_planes)
        assert self.model.Nfeat == 22
        board_feature_planes = Features.make_feature_planes_stones_4liberties_4history_ko_4captures_komi(self.board, self.board.color_to_play, self.komi).astype(np.float32)
        Normalization.apply_featurewise_normalization_D(board_feature_planes)
        feature_batch = Symmetry.make_symmetry_batch(board_feature_planes)
        feed_dict = {self.feature_planes: feature_batch}
        probs_batch = self.sess.run(self.probs_op, feed_dict)
        prob = average_probs_over_symmetries(probs_batch)
        if self.board.color_to_play == Color.White:
            prob *= -1
        return prob