class TrainTestSets(object):

    def __init__(self, *args):
        self.train = DataSet()
        self.test = DataSet()
        self.setOptions(args)

    def getTrainingSet(self):
        return self.train

    def getTestingSet(self):
        return self.test

    def setTrainingSet(self, inTrain):
        self.train = inTrain

    def setTestingSet(self, inTest):
        self.test = inTest

    def setOptions(self, arguments):
        newDataSet = DataSet()
        for num in range(0, len(arguments)):
            if arguments[num] == "-t":
                newDataSet.load(arguments[num+1])
                self.setTestingSet(newDataSet)
                break
            elif arguments[num] == "-T":
                newDataSet.load(arguments[num+1])
                self.setTrainingSet(newDataSet)
                break

    def __print__(self):
        self.train.__print__()
        self.test.__print__()
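
# Hypothetical usage sketch for the class above. Note that setOptions stops at
# the first "-t"/"-T" flag it finds, so one call loads only one of the two sets;
# the file name below is a made-up placeholder for whatever DataSet.load() expects.
tts = TrainTestSets("-T", "train_examples.data")
tts.__print__()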
Example #2
 def __init__(self, sess):
     self.sess = sess
     self.CATEGORY_NUM = 151
     self.IMAGE_SIZE = [224, 224]
     self.IMAGE_CHANNEL = 3
     data_dir = '/home/give/Documents/dataset/ADEChallengeData2016'
     self.dataset = DataSet(data_dir)
     self.learning_rate = 1e-5
     self.itertator_number = int(1e+5)
     self.BATCH_SIZE = 80
     self.imgs = tf.placeholder(
         tf.float32,
         shape=[
             self.BATCH_SIZE,
             self.IMAGE_SIZE[0],
             self.IMAGE_SIZE[1],
             self.IMAGE_CHANNEL
         ]
     )
     self.y_ = tf.placeholder(
         tf.float32,
         shape=[
             self.BATCH_SIZE,
             self.IMAGE_SIZE[0],
             self.IMAGE_SIZE[1],
             self.CATEGORY_NUM
         ]
     )
     self.vgg = vgg16(self.imgs, self.sess, skip_layers=['fc6', 'fc7', 'fc8'])
     self.inference()
Example #3
File: Test.py  Project: ab2xyz/HSP
    def ReadCSV_OneChannel(self,iChannel):
        setTest = DataSet(
            homeCSV=self.homeCSV, listCSV=self.listCSV4Test, labels=self.labels,
            numClasses=self.numClasses, branch4Train=self.branch4Train,
            resize=self.resize, numProcess=0)

        data=None
        label=None
        uid=None

        counterCSV=0
        for iCSV in tqdm(self.listCSV4Test[iChannel]):
            counterCSV+=1
            if (self.numFilesCut>0) and (counterCSV>self.numFilesCut):
                continue

            iReadCSV=self.homeCSV+iChannel+'/'+iCSV
            iReadClass=self.labels[iChannel]
            iData,iLabel,iUid, iClass=setTest.ReadCSV_OneFile(iClass=iReadClass, iCSV=iReadCSV)

            if data is None:
                data=iData
                label=iLabel
                uid=iUid
            else:
                data=np.r_[data,iData]
                label=np.r_[label,iLabel]
                uid=np.r_[uid,iUid]


        setTest.SetDataLabel(data=data,label=label,uid=uid)


        return (setTest, iChannel)
Example #4
class Data:
    training = None
    testing = None

    def __init__(self, path, featureCols, labelCol, trainSize, preProc=False, addbias=False):
        '''
        path is the absolute path to a CSV file
        '''
        self.data = pd.read_csv(filepath_or_buffer=path, usecols=featureCols + labelCol)
        self.data = self.data.sample(frac=1).reset_index(drop=True)  # frac=1 resamples the whole dataset, i.e. shuffles it
        if preProc:
            self.features = self.data[featureCols].to_numpy(dtype='float64')
            self.labels = self.data[labelCol].to_numpy(dtype='float64')
        else:
            self.features = self.data[featureCols].to_numpy()
            self.labels = self.data[labelCol].to_numpy()
        self.labels = self.labels.flatten()
        self.n = self.features.shape[1] + (1 if addbias else 0)  # number of features, including the bias term if added
        self.m = self.features.shape[0]
        self.training = DataSet(self.features[:int(trainSize * self.m)], self.labels[:int(trainSize * self.m)], preProc, addbias)
        self.testing = DataSet(self.features[int(trainSize * self.m):], self.labels[int(trainSize * self.m):], False, addbias)
        
        if preProc:
            self.testing.applyNormalization(self.training.normFeatures, self.training.normLabels)
Example #5
def read_img_sets(image_dir, image_size, validation_size=0):
    class DataSets:
        pass

    data_sets = DataSets()

    images, labels, ids, cls, cls_map = load_data(image_dir, image_size)

    images, labels, ids, cls = shuffle(images, labels, ids, cls)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    test_images = images[:validation_size]
    test_labels = labels[:validation_size]
    test_ids = ids[:validation_size]
    test_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_ids = ids[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_ids, train_cls)
    data_sets.test = DataSet(test_images, test_labels, test_ids, test_cls)

    return data_sets, cls_map
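
# Hypothetical call of the helper above; the directory, image size, and
# validation fraction are placeholders, and load_data/shuffle/DataSet are assumed
# to be the same helpers the original module imports.
data_sets, cls_map = read_img_sets("data/train_images", image_size=64, validation_size=0.2)
print(cls_map)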
Example #6
def init_isbi2012_train():
    isbi2012_train = DataSet(meta_folder, "isbi2012_train")

    raw_path = os.path.join(data_path, "raw/train-volume.h5")
    raw_key = "data"
    # nasims baseline prob map
    inp_path = os.path.join(
        data_path, "probabilities/old_probs/nasims_oldbaseline_train.h5")
    inp_key = "exported_data"

    isbi2012_train.add_raw(raw_path, raw_key)
    isbi2012_train.add_input(inp_path, inp_key)

    # 2d wsdt on namsis pmap
    seg_path0 = os.path.join(
        data_path, "watersheds/old_watersheds/ws_dt_nasims_baseline_train.h5")
    seg_key = "superpixel"

    isbi2012_train.add_seg(seg_path0, seg_key)

    # layerwise gt
    gt_path = os.path.join(data_path, "groundtruth/gt_cleaned.h5")
    isbi2012_train.add_gt(gt_path, "data")

    meta.add_dataset("isbi2012_train", isbi2012_train)
Example #7
    def __init__(self, args):
        self.dataName = args.dataName
        self.dataSet = DataSet(self.dataName)
        self.shape = self.dataSet.shape
        self.maxRate = self.dataSet.maxRate

        self.train = self.dataSet.train
        self.test = self.dataSet.test

        self.negNum = args.negNum
        self.testNeg = self.dataSet.getTestNeg(self.test, 99)
        self.add_embedding_matrix()

        self.add_placeholders()

        self.userLayer = args.userLayer
        self.itemLayer = args.itemLayer
        self.add_model()

        self.add_loss()

        self.lr = args.lr
        self.add_train_step()

        self.checkPoint = args.checkPoint
        self.init_sess()

        self.maxEpochs = args.maxEpochs
        self.batchSize = args.batchSize

        self.topK = args.topK
        self.earlyStop = args.earlyStop
Example #8
 def setUp(self):
     self.DataSet = DataSet("./data/199801.txt.tmp")
     self.DataSet.load_dict()
     self.hashindexer = HashIndexer(self.DataSet)
     self.hashindexer.build_indexer()
     self.trieindexer = TrieIndexer(self.DataSet)
     self.trieindexer.build_indexer()
Example #9
    def __init__(self, args, density):

        self.dataset = DataSet(args.dataType, density)
        self.dataType = self.dataset.dataType
        self.density = self.dataset.density
        self.shape = self.dataset.shape

        self.train = self.dataset.train
        self.test = self.dataset.test

        self.epochNum = args.epochNum
        self.batchSize = args.batchSize
        self.layers = args.layers
        self.regLayers = args.regLayers
        self.lr = args.lr
        self.decay = args.decay
        self.optimizer = args.optimizer
        self.verbose = args.verbose

        self.store = args.store
        self.modelPath = args.modelPath
        self.resultPath = args.resultPath

        self.model = self.compile_model()

        self.run()
Example #10
def read_data_sets(data_dir):
    filename = "cifar-100-python.tar.gz"
    print("getting data")
    SOURCE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'

    local_file = base.maybe_download(filename, data_dir, SOURCE_URL)

    print('Extracting', filename)
    train_images, train_labels = [], []
    test_images, test_labels = [], []
    with gfile.Open(data_dir + "/" + filename,
                    'rb') as f, tarfile.open(fileobj=f) as tar:
        for x in tar.getnames():
            if "data_batch" in x:
                i, l = _get_data(tar.extractfile(x))
                train_images.extend(i.reshape((i.shape[0], 32, 32, 3)))
                train_labels.extend(l)
            if "test_batch" in x:
                i, l = _get_data(tar.extractfile(x))
                test_images.extend(i.reshape((i.shape[0], 32, 32, 3)))
                test_labels.extend(l)

    train_images = np.array(train_images)
    test_images = np.array(test_images)
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)

    train = DataSet(train_images, train_labels, dtype=dtypes.uint8, depth=100)
    test = DataSet(test_images, test_labels, dtype=dtypes.uint8, depth=100)

    return base.Datasets(train=train, validation=None, test=test)
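
# Hypothetical call of the loader above; the cache directory is a placeholder.
# The returned namedtuple carries the train and test DataSet objects built at the
# end of read_data_sets, with validation left as None.
cifar100 = read_data_sets("/tmp/cifar100_data")
train_set, test_set = cifar100.train, cifar100.test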
Example #11
 def _create_trees(self, X_train, y_train):
     dataset = DataSet(X_train, y_train)
     self._trees = [
         DecisionTree(dataset.subset_from_ratio(self.ratio_samples),
                      self.coefficient, self.values, self._max_depth)
         for _ in range(self._num_trees)
     ]
Example #12
	def parse(self, dsId, maxResults=0):
		job = self.gate.getDs(dsId, maxResults)

		if not job:
			raise TypeError('Parser received illegal dataset ID')

		job = job[0]
	
		#get data as a list of rows instead of columns	
		colData = map(None, *job["data"] )
		
		dataset = DataSet( job["title"].encode('utf-8'), job["id"].encode('utf-8'), 
					job["columns"][0]["time_granularity"].encode('utf-8'))
		

		for col, item in enumerate(colData):
			#if type(colData[0]) != tuple:
			#	raise TypeError('Data is not of type: Tuple')
			tl = TimeLine( job["columns"][col]["title"].encode('utf-8'),
					job["columns"][col]["cid"].encode('utf-8'),
					colData[col] )
			dataset.append(tl)

		return dataset
Example #13
    def readFile(self, file):
        ds = DataSet([])
        names = file.readline()
        names = names[1:]
        curName = ""

        # for each character: on ";" or a newline, create a node named with the text accumulated so far; otherwise keep accumulating
        for char in names:
            if char == ";" or char == "\n":
                node = Node(curName, [])
                ds.addNote(node)
                curName = ""
            else:
                curName = curName + char

        numNode = 0
        nodes = ds.getNodes()

        # the header line was already consumed by readline() above, so iteration starts at the first data row
        for line in file:
            # remove everything up to and including the first ";"
            line = line[len(nodes[numNode].getName()) + 1:]
            count = 0
            i = 0
            while i < len(line) - 1:
                if int(line[i]) != 0:
                    nodes[numNode].addLink([nodes[count], int(line[i])])
                count += 1
                i += 2
            numNode += 1
        return ds
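
# A sketch of the input format readFile above appears to expect (hypothetical
# sample): the header starts with a separator and lists node names split by ";",
# and every following row repeats the node's name followed by single-digit link
# weights, one ";" between each value.
sample_text = (
    ";A;B;C\n"
    "A;0;1;0\n"
    "B;1;0;1\n"
    "C;0;1;0\n"
)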
Example #14
    def get_data_loader(self, train_path_list, valid_path_list,
                        test_path_list):

        train_data_set = DataSet(train_path_list)
        valid_data_set = DataSet(valid_path_list)
        test_data_set = DataSet(test_path_list)

        train_data_loader = DataLoader(
            train_data_set,
            pin_memory=True,
            batch_size=self.h_params.train.batch_size,
            shuffle=True,
            num_workers=self.h_params.resource.num_workers,
            drop_last=True)
        valid_data_loader = DataLoader(
            valid_data_set,
            pin_memory=True,
            batch_size=self.h_params.train.batch_size,
            shuffle=False,
            num_workers=self.h_params.resource.num_workers,
            drop_last=True)
        test_data_loader = DataLoader(
            test_data_set,
            batch_size=self.h_params.train.batch_size,
            shuffle=False,
            num_workers=self.h_params.resource.num_workers,
            drop_last=False)

        return train_data_loader, valid_data_loader, test_data_loader
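
# Hypothetical call, assuming this method lives on a module whose h_params holds
# train.batch_size and resource.num_workers as used above; the trainer object and
# the path lists are made-up placeholders.
train_loader, valid_loader, test_loader = trainer.get_data_loader(
    ["clips/train_000.wav"], ["clips/valid_000.wav"], ["clips/test_000.wav"])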
Example #15
    def __init__(self, args, density):

        self.dataSet = DataSet(args, density)
        self.dataType = self.dataSet.dataType
        self.density = self.dataSet.density
        self.shape = self.dataSet.shape

        self.train = self.dataSet.train
        self.test = self.dataSet.test

        self.epochNum = args.epochNum
        self.batchSize = args.batchSize
        self.gruLayers = args.gruLayers
        self.gtfLayers = args.gtfLayers
        self.regLayers = args.regLayers
        self.dropLayers = args.dropLayers
        self.lr = args.lr
        self.decay = args.decay
        self.optimizer = args.optimizer
        self.verbose = args.verbose

        self.preTraining = args.preTraining
        self.store = args.store
        self.modelPath = args.modelPath
        self.imagePath = args.imagePath
        self.resultPath = args.resultPath

        self.model = self.load_model()

        self.run()
Example #16
class OType:
    def __init__(self, topics_n, id):
        self.id = id
        self.dSet = DataSet(O_TYPES_STR[id])
        self.N = topics_n
        self.topics = []

        for i in range(self.N):
            self.topics.append(Topic(i))

    def add_item(self, probabilities, cand):
        #officer = db.officer[line]
        #if officer == 1:
        #    y = 2

        self.dSet.add_item(cand)
        for i in range(self.N):
            self.topics[i].add_item(probabilities[i], cand)

    def print_topics(self):
        output = "Officer Type {}\n".format(O_TYPES_STR[self.id])

        output = "{}\n{}\n".format(output, self.dSet.print_data())
        for i in range(self.N):
            output = "{}{}\n".format(output, self.topics[i].print_datasets())

        return output
Example #17
File: fracPlot.py  Project: yetisir/Up-Frac
    def __init__(self, plotName, fileName=None, dataClass=None, showPlots=True):
        DataSet.__init__(self, fileName=fileName, dataClass=dataClass)

        print('-'*70)
        print('Establishing {} Plot'.format(plotName))
        print('-'*70)
        
        self.plotName = plotName
        
        #TODO: fix this to accommodate non-colorbar plots
        self.figure = plt.figure(figsize=(6,5))
        self.axes = self.figure.add_axes([0.1, 0.1, 0.825*5/6, 0.825])
        self.colorBarAxes = self.figure.add_axes([0.825, 0.1, 0.05, 0.825])
        
        self.animationImages = [[] for _ in range(len(self.blockData.keys()))]
        
        if showPlots != True:
            matplotlib.use('Agg')

        time = min(self.blockData.keys())
        self.blocks = self.blockData[time].keys()
        self.zones = self.zoneData[time].keys()
        self.corners = self.cornerData[time].keys()
        self.contacts = self.contactData[time].keys()
        self.gridPoints = self.gridPointData[time].keys()
        self.domains = self.domainData[time].keys()
Example #18
    def __init__(self, topics_n, id):
        self.id = id
        self.dSet = DataSet(O_TYPES_STR[id])
        self.N = topics_n
        self.topics = []

        for i in range(self.N):
            self.topics.append(Topic(i))
Example #19
 def setOptions(self, arguments):
     for num in range(0, len(arguments[0])):
         if arguments[0][num] == "-k":
             self.k = int(arguments[0][num+1])
         elif arguments[0][num] == "-t":
             newDataSet = DataSet()
             newDataSet.load(arguments[0][num+1])
             self.instances = newDataSet
Example #20
def deriveGeneStructure(all_gene_file):
    """
    copy of structures/main.py
    find out how to load object from another script
    """

    # might need to switch directory to structures

    from DataSet import DataSet
    from GeneFamily import GeneFamily
    """Create all Gene objects"""
    data = DataSet()

    for l in all_gene_file[1:]:
        data.addGene(l.split())
    """Create a list of Species"""
    def generateSpeciesDict():
        for g in data.genesDict:
            currentSpecies = data.genesDict[g].species
            if currentSpecies not in data.speciesDict:
                data.addSpecies(currentSpecies)

    generateSpeciesDict()
    """Make Family Dictionary"""

    def generateFamilyDict():
        for g in data.genesDict:
            currentGene = data.genesDict[g]

            if currentGene.family not in data.familiesDict:
                currentFamily = GeneFamily(currentGene.family,
                                           data.speciesDict)
                data.addFamily(currentFamily.familyName, currentFamily)
            else:
                currentFamily = data.familiesDict[currentGene.family]
            currentFamily.addToFamily(currentGene)

    generateFamilyDict()
    """Make a dictionary which gives information about gene order """

    geneOrder = {}
    for s in data.speciesDict:
        geneOrder[s] = {}

    for g in data.genesDict:
        currentGene = data.genesDict[g]
        try:
            geneOrder[currentGene.species][currentGene.ctg].append(
                currentGene)  #pass gene object
        except KeyError:
            geneOrder[currentGene.species][currentGene.ctg] = [currentGene]

    #sort genes in their contig
    for s in geneOrder:
        for c in geneOrder[s]:
            geneOrder[s][c].sort(key=lambda gene: int(gene.start))

    return data, geneOrder
Example #21
 def __init__(self, point_len, dense=True):
     self.weights = [1.] * point_len
     self.data = DataSet(point_len , dense)
     for i in range(point_len):
         self.data.append()
     self._keys = xrange(point_len)
     self._len = point_len
     self.outs_counter = Counter()
     self.point_ins = defaultdict(set)
Example #22
def discretization_test(raw, filename):
    with open(filename, "w") as f:
        f.write("n;precision;recall;accuracy;f1\n")
        dataset = DataSet(raw)
        for i in [5, 7, 10, 15, 20, 40]:
            dataset.discretize_values(i)
            res = cross_validation(dataset)
            res = [str(i)] + [str(a) for a in res]
            f.write(";".join(res) + "\n")
Example #23
 def setUp(self):
     self.data_set = DataSet("../input")
     self.data_set.load_train(
         os.path.join(
             self.data_set.cache_dir,
             "segment_numseg23_target@AB@CD@E@_ratio0.2_rand0_TRAIN.csv"))
     self.data_set.load_test(
         os.path.join(
             self.data_set.cache_dir,
             "segment_numseg23_target@AB@CD@E@_ratio0.2_rand0_TEST.csv"))
Example #24
 def __init__(self, centre, radius, dataClass=None, fileName=None):
     DataSet.__init__(self, dataClass=dataClass, fileName=fileName)
     
     self.centre = centre
     self.radius = radius
     
     self.singleBlock = False
     if len(self.contactData) == 0:
         self.singleBlock = True
         
     self.calculateHomogenizationParameters()
Example #25
def composer(dict_data, sub_index, offset=0):
    x = dict_data['train'].data[sub_index['train'], :] + offset
    y = dict_data['train'].labels[sub_index['train'], :]
    train = DataSet(x, y, onehot=False)
    x = dict_data['validation'].data[sub_index['validation'], :] + offset
    y = dict_data['validation'].labels[sub_index['validation'], :]
    validation = DataSet(x, y, onehot=False)
    x = dict_data['test'].data[sub_index['test'], :] + offset
    y = dict_data['test'].labels[sub_index['test'], :]
    test = DataSet(x, y, onehot=False)
    return {'train': train, 'validation': validation, 'test': test}
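
# Hypothetical usage sketch for composer above. It assumes the DataSet objects in
# dict_data expose 2-D .data and .labels arrays, as the indexing implies; the
# arrays and index sets below are made up for illustration.
import numpy as np

x = np.random.rand(10, 4)
y = np.eye(3)[np.random.randint(0, 3, size=10)]
dict_data = {split: DataSet(x, y, onehot=False) for split in ('train', 'validation', 'test')}
sub_index = {'train': np.arange(6), 'validation': np.arange(6, 8), 'test': np.arange(8, 10)}
subsets = composer(dict_data, sub_index, offset=0.0)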
Example #26
File: PointSet.py  Project: pxm321/raven
 def __init__(self):
     """
   Constructor.
   @ In, None
   @ Out, None
 """
     DataSet.__init__(self)
     self.name = 'PointSet'
     self.type = 'PointSet'
     self.printTag = self.name
     self._neededForReload = []  # PointSet doesn't need anything to reload
Example #27
 def _readMoreXML(self, xmlNode):
     """
   Initializes data object based on XML input.
   @ In, xmlNode, xml.etree.ElementTree.Element or InputData.ParameterInput, input specification
   @ Out, None
 """
     DataSet._readMoreXML(self, xmlNode)
     # default to taking last point if no other spec was used
     # TODO throw a warning here, once we figure out how to give message handler in all cases
     if self._selectInput is None:
         self._selectInput = ('inputRow', -1)
Example #28
 def setOptions(self, arguments):
     newDataSet = DataSet()
     for num in range(0, len(arguments)):
         if arguments[num] == "-t":
             newDataSet.load(arguments[num+1])
             self.setTestingSet(newDataSet)
             break
         elif arguments[num] == "-T":
             newDataSet.load(arguments[num+1])
             self.setTrainingSet(newDataSet)
             break
Example #29
class DataSetTester(unittest.TestCase):

	def setUp(self):
		self.dataSet = DataSet("Atvinnuleysi", "v28", "Year")
		self.timeline = TimeLine("Title1", "id1")

	def testInsertItem(self):
		self.dataSet.append(self.timeline)
		self.dataSet.append(self.timeline)

		self.assertEqual(len(self.dataSet), 2)
Example #30
def test_data_to_records():
    from DataSet import DataSet

    d = DataSet(FILE='demo/sample.csv')
    r = None
    for row in d.iterate():
        r = row
        break

    for row in d.to_records():
        assert r[1] == row[1].split(';')
        break
Example #31
 def __init__(self):
     """
   Constructor.
   @ In, None
   @ Out, None
 """
     DataSet.__init__(self)
     self.name = 'HistorySet'
     self.type = 'HistorySet'
     self.printTag = self.name
     self._tempPivotParam = None
     self._neededForReload = [
     ]  # HistorySet doesn't need anything special to load, since it's written in cluster-by-sample CSV format
Example #32
 def __init__(self):
     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     self._set_args()
     self.args.dim = 6144
     self.args.disLayer = [6144, 6144]
     self.args.geneLayer = [6144, 6144]
     self.args.disLayer_s = [4096, 4096]
     self.args.geneLayer_s = [4096, 4096]
     self.args.maxEpochs = 100
     self.args.negNum = 50
     self.args.l2_weight = 1e-5
     self.data_set = DataSet()
     self.train()
Example #33
def rules_generated_test(raw, filename):
    with open(filename, "w") as f:
        f.write("n;rules\n")
        dataset = DataSet(raw)
        for i in [3, 4, 5, 6, 7, 8, 9, 10]:
            dataset.discretize_values(i)
            results = []
            for j in range(3):  # number of trials
                values = dataset.cross_validation(5)
                for v in values:
                    bayes = ILA(v["train"])
                    results.append(len(bayes.rules))
            f.write(str(i) + ";" + str(mean(results)) + "\n")
Example #34
File: rnn.py  Project: harry771/MLDS2017
def run_test():
    dictionary = DP.read_dict(dict_file)
    raw_test, choices = DP.read_test(test_file, choices_file)
    test = DataSet(raw_test, len(dictionary), cut=False)

    # RNN Parameters
    N_input = test.datalen
    N_class = len(dictionary)
    N_iter = N_epoch * N_input

    # Input
    x = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
    y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])

    embeddings = tf.Variable(tf.random_uniform([N_class, N_hidden], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, x)

    y_reshape = tf.reshape(y, [-1])

    # Weights
    w = tf.Variable(tf.random_normal([N_hidden, N_class]))
    b = tf.Variable(tf.random_normal([N_class]))

    # RNN
    pred = RNN(embed, w, b)

    # accuracy
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.cast(y_reshape, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    init = tf.global_variables_initializer()

    ans = []

    #    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        sess.run(init)
        saver = tf.train.Saver()
        saver.restore(sess, model_file)

        for i in range(N_input):
            batch_x, _ = test.next_batch(batch_size=1)

            spaceID = np.argwhere(batch_x[0] == SPACE)[0, 0]

            prob = sess.run(pred, feed_dict={x: batch_x})

            best_choice = np.argmax(prob[spaceID - 1, choices[i]])
            ans.append(best_choice)

    return np.array(ans)
Example #35
def init_testds():
    ds = DataSet("/home/consti/Work/data_neuro/cache/testds", "ds_test")

    raw_path = "/home/consti/Work/data_neuro/test_block/test-raw.h5"
    seg_path = "/home/consti/Work/data_neuro/test_block/test-seg.h5"
    prob_path = "/home/consti/Work/data_neuro/test_block/test-probs.h5"
    gt_path = "/home/consti/Work/data_neuro/test_block/test-gt.h5"

    ds.add_raw(raw_path, "data")
    ds.add_input(prob_path, "data")
    ds.add_seg(seg_path, "data")
    ds.add_gt(gt_path, "data")

    meta.add_dataset("ds_test", ds_test)
Example #36
File: main.py  Project: agravier/Redbit
def main():
  m = MtGox()
  d = DataSet()
  while True:
    m.updateDataSet(d)
    if m.getSell() > SELL_THRESHOLD:
      m.sellBTC(m.getBTC()*PERCENT_SELL,m.getSell())
    if m.getBuy() < BUY_THRESHOLD:
      m.buyBTC((m.getUSD()*PERCENT_BUY)/m.getBuy(), m.getBuy())
    m.updateDataSet(d)
    os.system("clear")
    m.printTitle()
    d.printData()
    m.printFunds()
    m.printOrders()
    m.printStatus()
    time.sleep(UPDATE_TIME)
Example #37
def per_sentence_bionlp_fscores(test_filename, test_dir, gold_dir):
    import tempfile
    from DataSet import DataSet
    from cStringIO import StringIO
    test_dir = path(test_dir)
    gold_dir = path(gold_dir)

    sentences = DataSet.from_filenames(test_filename)
    for docid, sentences_in_doc in sentences.group_by_metadata('DOC'):
        # if docid != '9015187':
        # if docid != '9081693':
        # if docid != '9257843':
        # if docid != '8108127':
        # if docid != '9115366':
        # if docid != '9361029':
            # continue

        print 'DOC:', docid
        our_total_proposed = 0
        our_total_matched = 0
        for sentence in sentences_in_doc:
            parse = sentence.gold_parse
            sentence.parses = [parse]
            our_score_components = parse.bionlp_fscore_components(sentence)
            matched, gold, proposed = our_score_components
            our_total_proposed += proposed
            our_total_matched += matched

            conll_version = StringIO()
            parse.write_conll(conll_version, include_metadata=False, sentence=sentence)
            conll_version.seek(0)
            conll_version = conll_version.read()

            import BioNLPConversionDB
            converter = BioNLPConversionDB.get_converter()
            bionlp_events_string = converter.convert(conll_version)

            if 1:
                print 'Events ---'
                print bionlp_events_string
                print 'Events ---'
            
            print "Ours:", our_score_components

            temp_test_dir = path(tempfile.mkdtemp(prefix=docid + '-'))
            temp_test_filename = path(temp_test_dir/docid + '.a2.t1')
            temp_test_file = file(temp_test_filename, 'w')
            temp_test_file.write(bionlp_events_string)
            temp_test_file.close()
            real_score_components = real_evaluation_bionlp_components(temp_test_dir, gold_dir)
            print 'Real:', real_score_components 
            if our_score_components != real_score_components:
                real_evaluation_bionlp_components(temp_test_dir, gold_dir, show_output=True)
            temp_test_dir.rmtree()
            if our_score_components != real_score_components:
                raise Exception('mismatch')
Example #38
def per_sentence_bionlp_fscores_nbest(test_filenames, gold_dir):
    import tempfile
    from DataSet import DataSet
    from cStringIO import StringIO

    gold_dir = path(gold_dir)

    sentences = DataSet.from_filenames(*test_filenames)
    for docid, sentences_in_doc in sentences.group_by_metadata('DOC'):
        # if docid != '9361029':
            # continue

        print 'DOC:', docid
        our_total_proposed = 0
        our_total_matched = 0
        for i, sentence in enumerate(sentences_in_doc):
            for j, parse in enumerate(sentence):
                print "DOC:", docid, 'Sentence:', i, 'Parse:', j
                our_score_components = parse.bionlp_fscore_components(sentence)
                matched, gold, proposed = our_score_components
                our_total_proposed += proposed
                our_total_matched += matched

                conll_version = StringIO()
                parse.write_conll(conll_version, include_metadata=False, sentence=sentence)
                conll_version.seek(0)
                conll_version = conll_version.read()

                import BioNLPConversionDB
                converter = BioNLPConversionDB.get_converter()
                bionlp_events_string = converter.convert(conll_version)

                if 0:
                    print 'Events ---'
                    print bionlp_events_string
                    print 'Events ---'

                temp_test_dir = path(tempfile.mkdtemp(prefix=docid + '-'))
                temp_test_filename = path(temp_test_dir/docid + '.a2.t1')
                temp_test_file = file(temp_test_filename, 'w')
                temp_test_file.write(bionlp_events_string)
                temp_test_file.close()
                real_score_components = real_evaluation_bionlp_components(temp_test_dir, gold_dir)
                if our_score_components != real_score_components:
                    real_evaluation_bionlp_components(temp_test_dir, gold_dir, show_output=True)
                temp_test_dir.rmtree()
                if our_score_components != real_score_components:
                    print "Ours:", our_score_components
                    print 'Real:', real_score_components 

                    print 'Events ---'
                    print bionlp_events_string
                    print 'Events ---'
                    raise Exception('mismatch')
Example #39
 def __init__(self, *args):
     self.train = DataSet()
     self.test = DataSet()
     self.setOptions(args)
Example #40
 def setOptions(self, arguments):
     for num in range(0, len(arguments[0])):
         if arguments[0][num] == "-t":
             newDataSet = DataSet()
             newDataSet.load(arguments[0][num+1])
             self.train(newDataSet)
Example #41
class knn(Classifier):

    def __init__(self, *args):
        super(knn, self).__init__(*args)
        self.k = 3
        self.instances = DataSet()
        self.setOptions(args)

    def train(self, inDataSet):
        self.instances = inDataSet

    def classify(self, input):
        neighbors = []
        if type(input) == Example:

            for index, item in enumerate(self.instances.getExamples().getExamplesList()):
                if len(neighbors) < self.k:
                    tempNeighbor = neighbor()
                    tempNeighbor.setNeighbor(self.instances.getAttributes().getAttributesList()[self.instances.getAttributes().getClassIndex()].domain[item.values[self.instances.getAttributes().getClassIndex()]], self.distance(input, item))
                    neighbors.append(tempNeighbor)
                else:
                    highestDist = -1
                    highestIndex = -1
                    for num in range(0, len(neighbors)):
                        if num < len(neighbors)-1:
                            if neighbors[num].distance >= neighbors[num + 1].distance:
                                highestDist = neighbors[num].distance
                                highestIndex = num
                            else:
                                highestDist = neighbors[num + 1].distance
                                highestIndex = num + 1
                        elif neighbors[num].distance > highestDist:
                            highestDist = neighbors[num].distance
                            highestIndex = num
                    if self.distance(input, self.instances.getExamples().getExamplesList()[index]) < highestDist:
                        newNeighbor = neighbor()
                        newNeighbor.setNeighbor(self.instances.getAttributes().getAttributesList()[self.instances.getAttributes().getClassIndex()].domain[item.values[self.instances.getAttributes().getClassIndex()]], self.distance(input,item))
                        neighbors[highestIndex] = newNeighbor

            return self.vote(neighbors)

        elif type(input) == DataSet:
            rightCount = 0
            for index, item in enumerate(self.instances.getExamples().getExamplesList()):
                if self.classify(self.instances.getExamples().getExamplesList()[index]) == self.instances.getAttributes().getClassAttribute().domain[self.instances.getExamples().getExamplesList()[index].values[self.instances.getExamples().attributes.getClassIndex()]]:
                    rightCount += 1
            performance = Performance()
            performance.setPerf(rightCount, len(self.instances.getExamples().getExamplesList()))
            return performance

    def setOptions(self, arguments):
        for num in range(0, len(arguments[0])):
            if arguments[0][num] == "-k":
                self.k = int(arguments[0][num+1])
            elif arguments[0][num] == "-t":
                newDataSet = DataSet()
                newDataSet.load(arguments[0][num+1])
                self.instances = newDataSet

    def distance(self, observation, example):
        total = 0
        for num in range(0, len(observation.attributes.getAttributesList())-1):
            if observation.values[num] != example.values[num]:
                total += 1
        return total

    def vote(self, neighbors):
        voteDict = {}
        for index, items in enumerate(neighbors):
            if items.classifier in voteDict.keys():
                voteDict[items.classifier] += 1
            else:
                voteDict[items.classifier] = 1
        return max(voteDict, key = voteDict.get)
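
# Hypothetical usage of the knn classifier above. The option list mirrors the
# setOptions parsing ("-k" sets the neighbour count, "-t" names a file for
# DataSet.load); the file name is a placeholder, and Example/DataSet/Performance
# come from the same project.
model = knn(["-k", "5", "-t", "weather_train.data"])
performance = model.classify(model.instances)  # a DataSet input returns a Performance object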
Example #42
 def __init__(self, *args):
     super(knn, self).__init__(*args)
     self.k = 3
     self.instances = DataSet()
     self.setOptions(args)
Example #43
    def evaluate(self, Classifier, *args):
        proportion = 0
        average = 0.0
        accuracies = []
        performance = Performance()
        trainingSet = DataSet()
        for num in range(0, len(args[0])):
            if args[0][num] == "-t":
                trainingSet.load(args[0][num+1])
            if args[0][num] == "-T":
                testSet = DataSet()
                testSet.load(args[0][num+1])
            if args[0][num] == "-p":
                proportion = float(args[0][num+1])
                for items in range(0, int(proportion * len(trainingSet.getExamples().getExamplesList()))):
                    trainingSet.getExamples().add(trainingSet.getExamples().getExamplesList()[items])
                trainingSet.setAttributes(trainingSet.getAttributes())
                if type(Classifier) == ID3:
                    Classifier.train(trainingSet)
                    performance = Classifier.classify(testSet)
                    return str(performance)
                else:
                    print "Error in Evaluator:evaluate"
                performance = Classifier.classify(testSet)
                return str(performance)

        for num in range(0, self.folds):
            testSet = DataSet()
            trainSet = DataSet()
            for items in trainingSet.getExamples().getExamplesList():
                randomNum = random.randint(0,self.folds-1)
                if randomNum != num:
                    testSet.getExamples().add(items)
                else:
                    trainingSet.getExamples().add(items)
            testSet.setAttributes(trainingSet.getAttributes())
            trainSet.setAttributes(trainingSet.getAttributes())
            if (len(trainingSet.attributes.attributes) > 0):
                trainSet = trainingSet
            Classifier.train(trainSet)
            tempPerformance = Classifier.classify(testSet)
            accuracies.append(tempPerformance.accuracy)
            average += tempPerformance.accuracy
            performance += tempPerformance
        return str(performance) + " +- " + str(self.stdDev(accuracies, average))
Example #44
			@return integer indicating the class of the unknown data.
		"""
		types = {
					"levenshtein": 	Distance().levenshtein, 	"l": Distance().levenshtein,	0: Distance().levenshtein , 			
					"hamming": 		Distance().hamming, 		"h": Distance().hamming,		1: Distance().hamming , 		
					"euclidean": 	Distance().euclidean, 		"e": Distance().euclidean,		2: Distance().euclidean , 			
					"manhattan": 	Distance().manhattan, 		"m": Distance().manhattan,		3: Distance().manhattan , 			
					"chebyshev": 	Distance().chebyshev,		"c": Distance().chebyshev,		4: Distance().chebyshev
				}

		results = [types[distanceType](x.getValue(), data) for x in self.trainset]
		results = [(i,x) for i,x in enumerate(results)]
		kernels = sorted(results, key = lambda x:x[1])[:3]
		kernels = [self.trainset[i].getLabel() for i,x in kernels]
		kernels = [(n, kernels.count(n)) for n in set(kernels)]
		return sorted(kernels, key = lambda x:x[1], reverse = True)[0][0]


if __name__ == "__main__":
	from DataSet import DataSet

	ds = DataSet("C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_lang.gla")
	bk = IBk()

	bk.train(ds.getExamples())

	kn = ds.convert("y n n")
	cl = bk.classify(kn, 3)

	print cl
	print ds.getAttributes(1)[-1].getLabel(cl)
Example #45
		tp = [(a,b) for a,b in accuracyResults if a == 1 and b ==1]
		tn = [(a,b) for a,b in accuracyResults if a == 0 and b ==0]

		precision = float(len(tp))/(len(tp) + len(fp))
		recall 	  = float(len(tp))/(len(tp) + len(fn))

		return 1./((a*(1/precision))+((1-a)*1/recall))

if __name__ == "__main__":
	from DataSet import DataSet
	from NaiveBayes import NaiveBayes
	from IBk import IBk

	fileIn = "C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_life.gla"

	ds = DataSet(fileIn)
	nb = NaiveBayes()
	es = Estimator()
	ib = IBk()

	for i in xrange(30):#
		train, test = ds.getTrainTestSet()
		crossValida = ds.getCrossValidationSet(2)

		#nb.train(ds)
		#results = nb.test(test)

		#print es.accuracy(results)

		#ib.train(train)
		#results = ib.test(test)
Example #46
	
	f = dataPath + "IBk\\sample_set_cars.gla"
	f = dataPath + "IBk\\sample_set_tennis.gla"
	#f = dataPath + "IBk\\sample_set_numbers.gla"
	#f = dataPath + "IBk\\sample_set_fish.gla"
	#f = dataPath + "IBk\\sample_set_life.gla"
	#f = dataPath + "IBk\\sample_set_word.gla"
	#f = dataPath + "DataSet_Client Document Preparation for Engine Tuning.gla"
	f = dataPath + "HospitalDocuments.gla"
	f = dataPath + "DataSets\\20160126_1501_ClientSiteData.gla"
	f = dataPath + "DataSets\\20160129_1322_ClientSiteData.gla"
	f = dataPath + "DataSets\\20160129_1358_ClientSiteData.gla"
	f = dataPath + "DataSets\\20160201_1530_ClientSiteData.gla"
	

	ds = DataSet(f)
	dt = DecisionTree()
	es = Estimator()
	pr = Prune()

	a = ds.getAttributes()
	b, c, d = ds.getTrainValidateTestSet(.7) 
	#b, d, c = ds.getTrainValidateTestSet(.7) 
	#b, d = ds.getTrainTestSet() 
	#print len(b), len(c), len(d)
	dt.train(b,a, 4, 3)

	output = dt.test(d)
	print "Single DT on c: {0}%".format(round(es.accuracy(output)*100, 2))

	print "train\t\t", len(b), b.getAllLabels()
Example #47
		append = results.append

		for example in examples:
			append((self.classify(example.getData()), example.getLabel()))#, example.getLabel(), example.getValues()

		print results
		return results

if __name__=="__main__":
	import matplotlib.pyplot as plt
	from mpl_toolkits.mplot3d import Axes3D
	import random
	import os

	print os.getcwd()
	ds = DataSet("..//..//data//ml//test_weather.gla")

	p = Perceptron(dataset=ds, epochs=10)

	print "Perceptron test:", p.classify([0,0,1,1])
	p.test(ds.getExamples())

	attribute1 = [n for n in xrange(10)]
	attribute2a = [random.sample(range(50)[:35],1)[0] for n in xrange(5)]
	attribute2b = [random.sample(range(50)[20:],1)[0] for n in xrange(5)]

	class0examples = [[attribute1[n], attribute2a[n], 0] for n in xrange(5)]
	class1examples = [[attribute1[n], attribute2b[n], 1] for n in xrange(5)]

	for exs in class1examples+class0examples:
		#print exs
Example #48
 def generateMapFile(self):
     """
     Creates the MapFile object that encodes a map file publishing the 
     complex outputs and writes it to disk.
     
     :returns: string with the path to the map file generated. None if no
     map file was generated (no complex outputs present).
     """
     
     if(self.outputs is None) or (len(self.outputs) != len(self.execution.processOutputs)):
         self.logger.error(self.ERR_08)
         raise Exception(self.ERR_08)
     
     #self.map = UMN.MapFile(self.processId)
     self.map = MapFile(self.processId)
     
     self.map.shapePath    = self.pathFilesGML
     self.map.epsgCode     = self.epsg
     self.map.mapTemplate  = self.mapTemplate
     self.map.imagePath    = self.imagePath
     self.map.imageURL     = self.imageURL
     self.map.mapServerURL = self.mapServerURL
     self.map.mapFilesPath = self.mapFilesPath
     self.map.otherProjs   = self.otherProjs
     
     self.map.meta_fees = self.meta_fees
     self.map.meta_accessconstraints = self.meta_accessconstraints
     self.map.meta_keywordlist = self.meta_keywordlist
     self.map.meta_addresstype = self.meta_addresstype
     self.map.meta_address = self.meta_address
     self.map.meta_city = self.meta_city
     self.map.meta_stateorprovince = self.meta_stateorprovince
     self.map.meta_postcode = self.meta_postcode
     self.map.meta_country = self.meta_country
     self.map.meta_contactelectronicmailaddress = self.meta_contactelectronicmailaddress
     self.map.meta_contactperson = self.meta_contactperson
     self.map.meta_contactorganization = self.meta_contactorganization
     self.map.meta_contactposition = self.meta_contactposition
     self.map.meta_role = self.meta_role
     self.map.meta_contactvoicetelephone = self.meta_contactvoicetelephone
     self.map.meta_contactfacsimiletelephone = self.meta_contactfacsimiletelephone
     self.map.meta_contactinstructions = self.meta_contactinstructions
     self.map.meta_hoursofservice = self.meta_hoursofservice
     
     for output in self.execution.processOutputs:
         
         output.writeToDisk(self.pathFilesGML);
         
         providedTitle = self.outputs[output.identifier]
         dataSet = DataSet(output.filePath, providedTitle, output.identifier)
         self.dataSets.append(dataSet)
         
         layerEPSG = dataSet.getEPSG()
         if (layerEPSG == None):
            layerEPSG = self.map.epsgCode
                                
         if dataSet.dataType == dataSet.TYPE_VECTOR:
             #* style = UMN.MapStyle()
             style = MapStyle()
             #* layer = UMN.VectorLayer(
             layer = VectorLayer(
                 output.filePath, 
                 dataSet.getBBox(), 
                 layerEPSG, 
                 output.identifier,
                 providedTitle)
             geomType = dataSet.getGeometryType()
             if geomType is not None:
                 layer.layerType = str(geomType)
             else:
                 layer.layerType = "Polygon"
             self.logger.debug("The layer type: " + str(dataSet.getGeometryType()))
             layer.addStyle(style)
             self.map.addLayer(layer)
             self.logger.debug("Generated layer " + layer.name + " of type " + layer.layerType + ".")
               
         elif dataSet.dataType == dataSet.TYPE_RASTER:
             #layer = UMN.RasterLayer(
             layer = RasterLayer(
                 output.filePath, 
                 dataSet.getBBox(), 
                 layerEPSG, 
                 output.identifier,
                 providedTitle)
             layer.setBounds(dataSet.getMaxValue(), dataSet.getMinValue())
             self.map.addLayer(layer)
             self.logger.debug("Generated layer " + layer.name + " of type raster.")
             
         else:
             self.logger.warning(self.WARN_02 + output.identifier + self.WARN_03)
             
         self.logger.debug("Guessed mime type for this layer: " + str(dataSet.getMimeType()))
         
         print "The pixel res: " + str(dataSet.getPixelRes())
             
     if (len(self.map.layers) > 0):
                 
         try :
             self.map.writeToDisk()
         except Exception, e:
             self.logger.error(self.ERR_07 + str(e))
             raise Exception(self.ERR_07 + str(e))
             return
         
         self.logger.info(self.SUCC_02 + self.map.filePath())
         return self.map.filePath()
Example #49
	def __init__(self, mainDataFile, mediaTypeDataFile):
		self.mainDataFile = mainDataFile
		self.mediaTypeDataFile = mediaTypeDataFile
		DataSet.__init__(self)
Example #50
 def translate_email_data(self, email_data: EmailData):
     rv = DataSet()
     for email in email_data.emails:
         rv.add_data(self.translate_email(email))
     return rv
Example #51
	def setUp(self):
		self.dataSet = DataSet("Atvinnuleysi", "v28", "Year")
		self.timeline = TimeLine("Title1", "id1")
Example #52
    arg = sys.argv[i]
    # all args should have an equal sign
    equal_index = arg.find('=')
    if equal_index != -1:
        first = arg[0:equal_index]
        second = arg[equal_index+1:len(arg)]
        if first == '--ratio':
            trainingRatio = float(second)      
        elif first == '--method':
            method = second
        elif first == '--n':
            N = int(second)
        elif first == '--iterations':
            iterations = int(second)
        else:
            dataset = DataSet(first, second)
            datasets.append(dataset)
            print("Finished adding composer: %s, with data: %s" % (dataset.composerName, dataset.dataPath))
    else:
        print "Couldn't parse arg", arg

if method == "ngram":
    print "Using Ngram method with N =", N

    numCorrectMozart = 0
    numPredictedMozart = 0
    numCorrectHaydn = 0
    numPredictedHaydn = 0
    for j in range(0, iterations):
        training, testing, composers = zip(*[dataset.getTrainingAndTestingSets(trainingRatio, 20)
                                             for dataset in datasets])