Ejemplo n.º 1
0
def main(train_set, test_set, iter=2):
    """Train a ``perceptronAverage`` model and print the result of testing it.

    Args:
        train_set: Training-set argument handed unchanged to ``load``.
        test_set: Test-set argument handed unchanged to ``load``.
        iter: Converted with ``int`` and passed as the first argument of
            ``perceptronAverage`` (presumably the number of training
            iterations -- confirm against that class).  May be a string,
            e.g. when taken straight from the command line.
    """
    # Runs before the data is loaded; presumably (re)generates the train/test
    # files from the raw CSV -- confirm against ``split``.
    split("yelp_cat.csv")
    # Keep the converted value in its own name instead of rebinding the
    # parameter, which also shadows the ``iter`` builtin.
    n_iter = int(iter)
    X_train, X_test, Y_train, Y_test = load(train_set, test_set)
    p = perceptronAverage(n_iter, X_train, Y_train)
    p.train()
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("ZERO-ONE LOSS=" + str(p.test(X_test, Y_test)))
Ejemplo n.º 2
0
    def quadtree_score(self, table, level=0):
        """Recursively partition ``table`` and write score estimates to rows.

        Every candidate returned by ``self.possible_splits(table)`` is
        evaluated: each non-empty partition goes through ``self.evaluate``
        and the resulting stats are combined by ``self.evaluate_split``.
        The split with the lowest combined estimate that produced more than
        one non-empty partition is selected.  For each non-empty partition of
        the selected split, either ``self.should_stop`` holds and every row
        whose score is still ``-inf`` receives ``stats.est``, or the method
        recurses on that partition with ``level + 1``.

        Args:
            table: Collection of rows; must support ``len()``.
            level: Current recursion depth (0 for the initial call).

        Raises:
            RuntimeError: If ``level`` exceeds ``self.max_levels`` or if no
                split yields more than one non-empty partition.
        """
        # TODO (from the original author): at this step, need to compute the
        # sample size so that with 95% confidence the margin of error is
        # within 10% of the true value.
        if level > self.max_levels:
            # A bare ``raise`` has no active exception to re-raise here.
            raise RuntimeError(
                "quadtree_score: level %s exceeds max_levels %s"
                % (level, self.max_levels))

        best_split, best_est, best_parts = None, None, []
        for split in self.possible_splits(table):
            _logger.debug("Checking Split\t%s\t%s", split.attr.name,
                          ','.join(map(str, map(len, split()))))
            partinfo = []
            for partition in split():
                if len(partition) == 0:
                    continue

                stats = self.evaluate(partition)
                partinfo.append(stats)

            split_est = self.evaluate_split(partinfo)

            # Only splits that actually separate the data are candidates.
            if len(partinfo) > 1 and (best_split is None or split_est < best_est):
                best_split = split
                best_est = split_est
                best_parts = partinfo

        if best_split is None:
            raise RuntimeError(
                "quadtree_score: no split produced more than one "
                "non-empty partition")

        _logger.info("quadtree\t%d\t%d\t%s\t%f",
                     len(table),
                     level,
                     best_split.attr.name,
                     best_est)

        # ``best_parts`` holds stats for non-empty partitions only, so the
        # empty ones must be dropped here as well; otherwise zip() would pair
        # partitions with the stats of a different partition, and an empty
        # partition would divide by zero below.
        nonempty = [part for part in best_split(table) if len(part) != 0]
        for partition, stats in zip(nonempty, best_parts):
            if self.should_stop(partition, stats):
                _logger.info("stopped on partition\t%d\test(%f)\tstd(%f)",
                             len(partition),
                             stats.est,
                             stats.std)

                # Only rows that have not been scored yet are overwritten.
                for row in partition:
                    if row[self.SCORE_ID].value == -inf:
                        row[self.SCORE_ID] = stats.est
            else:
                self.errprob.append(
                    min(1.0, self.errprob[-1] * len(table) / len(partition)))
                try:
                    self.quadtree_score(partition, level + 1)
                finally:
                    # Keep the errprob stack balanced even if recursion fails.
                    self.errprob.pop()
Ejemplo n.º 3
0
 def begin(self, box1, box2):
     """Build the split-signature table for two boxes, in input order.

     Signature triples are obtained from a fresh ``signatures`` helper,
     sorted, split by a fresh ``split`` helper, and finally mapped back to
     the original ordering via ``ret_original_order``.

     Args:
         box1: First box, forwarded to the ``signatures`` helper.
         box2: Second box, forwarded to the ``signatures`` helper.

     Returns:
         Whatever ``signatures.ret_original_order`` returns for the split
         signatures.
     """
     signer = signatures()
     splitter = split()
     triples = signer.get_signatures_triple(box1, box2)
     # my_sort yields the sorted data plus both index mappings.
     ordered, to_sorted, to_input = signer.my_sort(triples)
     tri_sign, tri_sign_i = signer.sort_signatures(box1, box2, to_sorted)
     pieces = splitter.split(tuple(ordered), tri_sign, tri_sign_i)
     return signer.ret_original_order(pieces, to_input)
Ejemplo n.º 4
0
def recognize(path=constents.imgPath, description=""):
    """Split an image into pieces and ask a remote service to label each one.

    The image at ``path`` is read with ``cv.imread`` and cut up by ``split``.
    Each piece is written to ``temp/<index>.png`` and posted to
    ``constents.url``; the ``"predicted_label"`` field of the JSON reply is
    collected.  A piece whose request or reply handling fails contributes
    ``'-'`` instead.

    Args:
        path: Path of the image to recognise.
        description: Text printed before processing starts.

    Returns:
        The collected labels joined into one string, in piece order.
    """
    print('验证码描述:' + description)
    mode = 5
    img = cv.imread(path)
    imgs = split(img, mode, line=1)

    if not os.path.exists("temp"):
        os.mkdir("temp")
    labels = []
    for idx, piece in enumerate(imgs):
        piece_path = "temp/{}.png".format(idx)
        cv.imwrite(piece_path, piece)
        # The context manager closes the upload handle after every request;
        # previously one descriptor per piece was left open.
        with open(piece_path, 'rb') as upload:
            try:
                r = requests.post(constents.url,
                                  files={'images': upload},
                                  headers=constents.headers)
                rr = json.loads(r.text)
                print(r.text)
                labels.append(rr["predicted_label"])
            except Exception:
                # Best effort: mark the piece as unrecognised, but let
                # KeyboardInterrupt / SystemExit propagate.
                labels.append('-')
    return "".join(labels)
Ejemplo n.º 5
0
def setup_world():
    """Load triangles from ``data/world.txt`` into the global ``tris`` list.

    Lines that do not split into exactly five tokens, or whose first token
    is ``//``, are ignored.  Every remaining line becomes a list of five
    floats (presumably position plus texture coordinates -- confirm against
    the renderer), and each run of three such lists is appended to ``tris``
    as one triangle.  A trailing group of fewer than three vertices is never
    appended.
    """
    global tris
    pending = []

    handle = open(os.path.join("data", "world.txt"))
    lines = handle.readlines()
    handle.close()
    for line in lines:
        vals = split(line)
        # Skip anything that is not a five-field vertex line.
        if len(vals) != 5 or vals[0] == '//':
            continue

        pending.append([float(val) for val in vals])
        if len(pending) == 3:
            tris.append(pending)
            pending = []
Ejemplo n.º 6
0
#import numpy as np
#import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn import svm
#import os
from split import *
from clean import *
from cleaning import *
from sklearn.metrics import classification_report

# Load the cleaned data; ``test`` is not used in the visible part of the script.
train, test = cleanTrain()

# Separate the training data into features and targets.
trainFeatures, trainTarget = split(train)
# Return value is discarded -- presumably modifies trainFeatures in place; confirm.
digitizeMatrix(trainFeatures)

# Hold out the last 6,500,000 rows for cross-validation and keep the rest
# for training.
cvFeatures = trainFeatures[-6500000:]
trainFeatures = trainFeatures[:-6500000]

# Empty accumulator; not filled in the visible part of the script.
conf = []

# Number of target columns.
N = len(trainTarget.columns)

# Handle each target column separately, using the same hold-out boundary as
# for the features.
for i in range(0,N):
    target1 = trainTarget.columns[i]

    # Last 6,500,000 target values: cross-validation targets.
    cvTarget = trainTarget[target1][-6500000:]

    # Remaining target values: training targets.
    Target = trainTarget[target1][:-6500000]
Ejemplo n.º 7
0
def main():
    """Tune a regressor step by step on the admission data and report its score.

    Reads the two admission CSV files, splits them into train and test sets,
    scores the model returned by ``model.base_model()``, then runs a series
    of parameter searches.  After each search the regressor is updated with a
    hard-coded value via ``set_params``.  Finally the regressor is fitted on
    the training set, its score on the test set is printed and feature
    importances are plotted.
    """
    data1 = pd.read_csv('Admission_Predict.csv')
    data2 = pd.read_csv('Admission_Predict_Ver1.1.csv')

    # Stack both files and drop the row-id column.
    data = pd.concat([data1, data2]).drop('Serial No.', axis=1)
    # The column name really ends with a space.
    target = data['Chance of Admit ']
    features = data.drop('Chance of Admit ', axis=1)

    X_train, X_test, y_train, y_test = split(features, target)

    # predict() is called without a fit() here, so base_model() is presumably
    # expected to return an already-fitted regressor -- confirm.
    gb = model.base_model()
    pred = gb.predict(X_test)

    print("Our baseline model without tuning gave an R2 of {}".format(
        model.performance_metric(y_test, pred)))

    # Tune 1: number of estimators.
    model.optimize(X_train,
                   y_train,
                   regressor=gb,
                   parameter={'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100]})
    visuals.plot_optimization(
        regressor=gb,
        parameter={'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100]})
    # NOTE(review): 50 is not one of the searched grid values.
    gb = gb.set_params(n_estimators=50)

    # Tune 2: tree depth together with min_samples_split.
    model.optimize(X_train,
                   y_train,
                   regressor=gb,
                   parameter={
                       'max_depth': range(2, 12, 2),
                       'min_samples_split': range(6, 18, 2)
                   })
    # NOTE(review): the plot covers depths 2..18 while the search above only
    # covered 2..10 -- confirm this is intended.
    visuals.plot_optimization(regressor=gb,
                              parameter={'max_depth': range(2, 20, 2)})
    gb = gb.set_params(max_depth=10)

    # Tune 3: min_samples_split together with min_samples_leaf.
    model.optimize(X_train,
                   y_train,
                   regressor=gb,
                   parameter={
                       'min_samples_split': range(6, 18, 2),
                       'min_samples_leaf': [3, 5, 7, 9, 12, 15]
                   })
    visuals.plot_optimization(regressor=gb,
                              parameter={'min_samples_split': range(6, 18, 2)})
    gb = gb.set_params(min_samples_split=6)
    visuals.plot_optimization(
        regressor=gb, parameter={'min_samples_leaf': [3, 5, 7, 9, 12, 15]})
    gb = gb.set_params(min_samples_leaf=3)

    # Tune 4: number of features considered per split.
    model.optimize(X_train,
                   y_train,
                   regressor=gb,
                   parameter={'max_features': range(1, 8)})
    visuals.plot_optimization(regressor=gb,
                              parameter={'max_features': range(1, 8)})
    gb = gb.set_params(max_features=3)

    # Tune 5: subsample fraction.
    model.optimize(X_train,
                   y_train,
                   regressor=gb,
                   parameter={'subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]})
    visuals.plot_optimization(
        regressor=gb,
        parameter={'subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]})
    gb = gb.set_params(subsample=0.95)

    # Tune 6: learning rate against number of trees.
    # NOTE(review): 0.005 appears twice in ``rates`` -- possibly a typo.
    model.robust_model(gb,
                       rates=[0.05, 0.01, 0.005, 0.005],
                       trees=[100, 500, 1000, 1500])
    gb = gb.set_params(learning_rate=0.005, n_estimators=1500)

    # Fit the final configuration and score it on the held-out test set.
    gb = gb.fit(X_train, y_train)

    pred = gb.predict(X_test)
    print('Rsquare score of {}'.format(
        np.round(model.performance_metric(y_test, pred), decimals=5)))

    visuals.feature_importance(features.columns, gb.feature_importances_)
Ejemplo n.º 8
0
def invokeSDD(testFile, preference='RANDOM', ddmin=False):
    """Reduce ``testFile`` in two phases; the result ends up in ``currentMinimalFileName``.

    Phase 1 repeatedly asks the query engine for a removable symbol, removes
    it transitively from a tentative copy, runs the test and keeps the
    removal only when the test result is ``'FAIL'``.

    Phase 2 takes the nodes returned by ``allNotPermanentlyDeleted(L)`` and
    runs a ddmin-style loop: the list is cut into ``n`` subsets, each subset
    is removed in turn, and a removal that still yields ``'FAIL'`` is kept.

    Finally the minimal file is rewritten without lines that are blank or
    consist only of ``;``.

    Args:
        testFile: Path of the file to reduce; it is copied, not modified.
        preference: Selects the phase-1 query: 'TOP', 'BOTTOM', 'RANDOM' or
            'AVERAGE'.  Any other value leaves the query as ``None``.
        ddmin: When true, runs ``markAllUntrackedDependencies(L)`` before
            phase 1 and suppresses the "PHASE 1" statistics.

    Side effects: resets and reports the global test counters, prints
    statistics and overwrites ``currentMinimalFileName`` and
    ``tentativeMinimalFileName``.
    """
    # import ipdb; ipdb.set_trace()

    # The counters are reset here and only read in this function; presumably
    # runTest updates them -- confirm.
    global numberOfUnresolvedTests
    global numberOfTotalTests
    numberOfUnresolvedTests = 0
    numberOfTotalTests = 0

    QR = getQueryResult("clearAllLabels(L)")
    print "TOTAL NODES: %s" % str(len(QR['L']))
    # Start from a copy of the input file.
    copy(testFile, currentMinimalFileName)

    # Map the preference to the query that picks the next symbol to remove.
    searchHeuristic = None
    if preference=='TOP':
        searchHeuristic = "topScoringRemovableWUD(X)"
    elif preference=='BOTTOM':
        searchHeuristic = "topScoringRemovableWUD2(X)"
    elif preference == 'RANDOM':
        searchHeuristic = "topScoringRemovableWUDR(X)"
    elif preference == 'AVERAGE':
        searchHeuristic = "topScoringRemovableWUDA(X)"

    if ddmin:
        getQueryResult("markAllUntrackedDependencies(L)")

    # ---- Phase 1: heuristic-driven removal of single symbols ----
    t0 = time.time()
    while (getQueryResult("allRemovableWUD(L)")):
        # Work on a scratch copy so a failed attempt can be discarded.
        copy(currentMinimalFileName, tentativeMinimalFileName)

        QR = getQueryResult(searchHeuristic)
        if QR is None or isVariableNone(QR['X']):
            break
        symbolToRemove = getValueFromAtom(QR['X'])
        # if symbolToRemove == 'sym0':
        #     import ipdb; ipdb.set_trace()

        # NOTE: currentDeletionSet is only referenced by the commented-out
        # call further down.
        currentDeletionSet = removeNodeTransitively(tentativeMinimalFileName,
                                                    symbolToRemove)
        result = runTest(commandName, tentativeMinimalFileName)
        markNodes(result, symbolToRemove)

        # import ipdb; ipdb.set_trace()
        # The test still fails without the symbol: accept the removal.
        if result == 'FAIL':
            copy(tentativeMinimalFileName, currentMinimalFileName)
        # else:
            # recursivelyDescend2(symbolToRemove, currentDeletionSet, result)

    # Elapsed wall-clock time of phase 1.
    t1 = time.time() - t0
    # Now run ddmin on nodes with untracked dependencies
    # QR = getQueryResult("allUntrackedDependencies(L)")
    QR = getQueryResult("allNotPermanentlyDeleted(L)")
    # n is the number of subsets the candidate list is cut into.
    n = 2
    L = []
    # import ipdb; ipdb.set_trace()
    if not(QR is None or isVariableNone(QR['L'])):
        L = map(getValueFromAtom, QR['L'])

    # print L
    if not ddmin:
        print "PHASE 1: %s" % str(len(L))
        print "PHASE 1 TIME: %s" % str(t1)
        print "PHASE 1 UNRESOLVED: %d" % numberOfUnresolvedTests
        print "PHASE 1 TOTAL: %d\n" % numberOfTotalTests

    # if not ddmin:
    #     n = len(L)
    # ---- Phase 2: ddmin-style reduction over the remaining nodes ----
    copy(currentMinimalFileName, tentativeMinimalFileName)
    while len(L) >= 2:
        # print L
        subsets = split(L, n)
        
        some_complement_is_failing = False
        for subset in subsets:
            complement = listminus(L, subset)
            removeNodeList(tentativeMinimalFileName, subset)
            result = runTest(commandName, tentativeMinimalFileName)
            if result == 'FAIL':
                # Removing this subset keeps the failure: commit the removal,
                # continue with the complement and coarsen the granularity.
                copy(tentativeMinimalFileName, currentMinimalFileName)
                L = complement
                n = max(n-1, 2)
                some_complement_is_failing = True
                break
            else:
                # Roll the scratch file back to the last accepted state.
                copy(currentMinimalFileName, tentativeMinimalFileName)

        if not some_complement_is_failing:
            # Already at single-element granularity: nothing more to try.
            if n == len(L):
                break
            # Otherwise refine the granularity and retry.
            n = min(n * 2, len(L))
        
    # FIXME:HACK
    # import ipdb; ipdb.set_trace()
    print "MINIMAL CASE: %s" % str(len(L))
    print "NUMBEROFUNRESOLVEDTESTS: %d" % numberOfUnresolvedTests
    print "TOTALTESTS: %d\n===============================\n" % numberOfTotalTests
    # print L
    # Rewrite the minimal file, dropping lines that are blank or contain
    # only ';' once stripped.
    move(currentMinimalFileName, tentativeMinimalFileName)
    with open(tentativeMinimalFileName) as ifile:
        with open(currentMinimalFileName, 'w') as ofile:
            for line in ifile:
                lineStrip = line.strip()
                if lineStrip == '' or lineStrip == ';':
                    continue
                ofile.write(line)
Ejemplo n.º 9
0
    up = seqrange[1]
    if down<=rs and rs+sl-1<=up:
      return False
  return True

def hasRepeatseq(seqlist,rs,sl):
  """Return True when the span starting at ``rs`` with length ``sl`` hits a known range.

  ``seqlist`` holds ranges whose first two items are the inclusive lower and
  upper bound.  A range matches when it contains ``rs``, or when it starts
  after ``rs`` but no later than the last position of the span
  (``rs + sl - 1``).
  """
  bounds = ((seqrange[0], seqrange[1]) for seqrange in seqlist)
  return any(
    down <= rs <= up or rs < down <= rs + sl - 1
    for down, up in bounds)
# Script entry point: split the capture named on the command line into flows
# and process each of them, with a log file named after the capture.
try:
  flownamelist = split(sys.argv[1])
  # The log shares the input's base name (text before the first '.').
  # It is bound at module level, presumably so that process() can write to
  # it -- confirm.
  outlog = open(sys.argv[1].split('.',1)[0]+".log","w")
  try:
    for flowname in flownamelist:
      process(readflow(flowname),flowname)
  finally:
    # Close the log even when processing a flow fails.
    outlog.close()
except Exception:
  # Any ordinary failure (including a missing argument) ends with the usage
  # hint; KeyboardInterrupt and SystemExit are no longer swallowed.
  print('Error! please check the input')
  print('Usage: python tcpla.py filename.pcap')
Ejemplo n.º 10
0
from split import *
from planck import *
import numpy as np
import matplotlib.pyplot as plt
from astropy import constants as const
from astropy import units as u


# Load the data
T = np.loadtxt("sun_AM0.dat")

# The split function arranges the data into a matrix.
# It is a bit silly, but it is done for convenience.
D = split(T)

# The size of this matrix
n = D.shape

# Numerical integration of the data (trapezoid rule): row 0 of D supplies
# the abscissas and row 1 the ordinates.

I = 0
for j in range(0, n[1]-1):
    # Width of the interval between two consecutive abscissas
    gap = D[0, j+1] - D[0, j]
    # Area of the trapezoid over that interval
    trap = ((D[1, j] + D[1, j+1]) * gap/2)
    I += trap

# Numerical integration of the Planck function.
# The integral cannot start at zero,
# because the function is undefined at that value.
ini = 0.000001
fin = np.pi/2
Ejemplo n.º 11
0
    def quadtree_score(self, table, prev_splits=None):
        """Recursively partition ``table`` and write score estimates to rows.

        A sample of ``table`` (``self.get_samples``) is evaluated first.  If
        ``self.should_stop`` accepts it, every row of ``table`` whose score is
        still ``-inf`` receives the sample estimate and the call returns.
        Otherwise every candidate from ``self.possible_splits(samples)`` is
        scored on the sample, the one with the lowest ``evaluate_split``
        value that produced more than one non-empty partition is applied to
        the full ``table``, and each non-empty partition is processed
        recursively.

        Args:
            table: Collection of rows; must support ``len()`` and iteration.
            prev_splits: Splits applied on the path to this call.  Its length
                is the recursion depth; ``None`` or an empty list marks the
                top level.

        Raises:
            RuntimeError: If the depth exceeds ``self.max_levels`` or if no
                split yields more than one non-empty partition.
            AssertionError: If the partition sizes of the chosen split do not
                add up to ``len(table)``.
        """
        prev_splits = prev_splits or []
        if len(prev_splits) > self.max_levels:
            # A bare ``raise`` has no active exception to re-raise here.
            raise RuntimeError(
                "quadtree_score: depth %s exceeds max_levels %s"
                % (len(prev_splits), self.max_levels))

        samples = self.get_samples(table)

        # evaluate current partition using sample
        cur_stats = self.evaluate(samples)
        should_stop = self.should_stop(samples, cur_stats)
        _logger.info("Stats: %s\tpop(%d)\tsamp(%d)\t%f-%f\t%f-%f",
                     should_stop,
                     len(table),
                     len(samples),
                     cur_stats.est-2.58*cur_stats.std,
                     cur_stats.est+2.58*cur_stats.std,
                     min(cur_stats.vals),
                     max(cur_stats.vals))
        if should_stop:
            # Only rows that have not been scored yet are overwritten.
            for row in table:
                if row[self.SCORE_ID].value == -inf:
                    row[self.SCORE_ID] = cur_stats.est
            return

        # ok find the best split
        best_split, best_est = None, None
        for split in self.possible_splits(samples):
            _logger.debug("Checking Split\t%s", str(split))

            stats_list = [self.evaluate(partition)
                          for partition in split(samples)
                          if len(partition) != 0]

            split_est = self.evaluate_split(stats_list)

            # Only splits that actually separate the sample are candidates.
            if len(stats_list) > 1 and (best_split is None or split_est < best_est):
                best_split = split
                best_est = split_est

        if best_split is None:
            raise RuntimeError(
                "quadtree_score: no split produced more than one "
                "non-empty partition")

        _logger.info("Splitting on %s\t%s", best_split.attr.name,
                     ','.join(map(str, map(len, best_split(table)))))

        for partition in best_split(table):
            # The split was chosen on the sample, so a partition of the full
            # table may be empty; there is nothing to score, and the errprob
            # update below would divide by zero.
            if len(partition) == 0:
                continue
            prev_splits.append(best_split)
            self.errprob.append(
                min(1.0, self.errprob[-1] * len(table) / len(partition)))
            try:
                self.quadtree_score(partition, prev_splits=prev_splits)
            finally:
                # Keep both stacks balanced even if the recursion fails.
                prev_splits.pop()
                self.errprob.pop()

        # sanity check, sum(partitions) == table
        # A real list, so it can be both summed and printed on Python 3 too.
        part_sizes = [len(partition) for partition in best_split(table)]
        total_part_size = sum(part_sizes)
        msg = "Partition sizes wrong: %d!=%d\t%s\t%s"
        msg = msg % (total_part_size,
                     len(table),
                     str(part_sizes),
                     str(best_split))
        assert total_part_size == len(table), msg