def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non-negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
Example #2
def encode_features(X, enc_map=None):
    """Converts categorical values in each column of X to integers in the range
    [0, n_unique_values_in_column - 1], if X is not already of integer type.

    If mapping is not provided, it is calculated based on the values in X.

    Unknown values during prediction get a value of -1. np.NaNs are ignored
    during encoding, and get treated as unknowns during prediction.
    """
    if np.issubdtype(X.dtype, np.integer):
        # Already integer type, so we can take a shortcut. Simply reshape
        # the data to mapping dictionaries, and do nothing with X.
        enc_map = [{val: val for val in np.unique(col)} for col in X.T]
        return X, enc_map

    if enc_map is None:
        fit = True
        # We will calculate enc_map, so initialize the list of column mappings.
        enc_map = []
    else:
        fit = False

    Xenc = np.zeros(X.shape).astype('int')
    for ii in range(X.shape[1]):
        if fit:
            col_enc = {val: jj for jj, val in enumerate(np.unique(X[:, ii]))
                       if not (isinstance(val, float) and np.isnan(val))}
            enc_map.append(col_enc)
        # Unknown categories (including np.NaNs) all get a value of -1.
        Xenc[:, ii] = np.array([enc_map[ii].get(x, -1) for x in X[:, ii]])

    return Xenc, enc_map
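
A minimal usage sketch of the helper above (the input array and values are made up for illustration): np.unique supplies the sorted category values per column, and anything missing from enc_map encodes to -1.

import numpy as np

X = np.array([["red", "S"], ["blue", "M"], ["red", "S"]], dtype=object)
Xenc, enc_map = encode_features(X)
# np.unique sorts each column's categories, so "blue" -> 0, "red" -> 1
# and "M" -> 0, "S" -> 1; Xenc is [[1, 1], [0, 0], [1, 1]].
Xnew, _ = encode_features(np.array([["green", "S"]], dtype=object), enc_map)
# "green" was never seen during fitting, so it encodes to -1: [[-1, 1]].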
Example #3
def training_stage3(dftrain,dfvalid,cat1,i):
    fname = ddir + 'joblib/stage3_'+str(cat1)+ext
    df = dftrain[dftrain.Categorie1 == cat1].reset_index(drop=True)
    dfv = dfvalid[dfvalid.Categorie1 == cat1].reset_index(drop=True)
    labels = np.unique(df.Categorie3)
    if len(labels)==1:
        joblib.dump((labels,None,None),fname)
        scv = -1
        sct = -1
        print('training', cat1, '\t\t(', i, ') : N=', len(df), 'K=', len(labels))
        print('training', cat1, '\t\t(', i, ') : training=', sct, 'validation=', scv)
        return (sct,scv)
    vec,X = vectorizer_stage3(df.txt)
    Y = df['Categorie3'].values
    cla = LogisticRegression(C=best_regularisation.get(cat1,100))
    cla.fit(X,Y)
    labels = np.unique(df.Categorie3)
    sct = cla.score(X[:min(10000,len(df))],Y[:min(10000,len(df))])
    if len(dfv)==0:
        scv = -1
    else:
        Xv = vec.transform(dfv.txt)
        Yv = dfv['Categorie3'].values
        scv = cla.score(Xv,Yv)
    print('training', cat1, '\t\t(', i, ') : N=', len(df), 'K=', len(labels))
    print('training', cat1, '\t\t(', i, ') : training=', sct, 'validation=', scv)
    joblib.dump((labels,vec,cla),fname)
    del vec,cla
    return (sct,scv)
Example #4
 def test_06_04_ijv_to_labels_overlapping(self):
     '''Convert an ijv representation with overlap to labels'''
     ijv = np.array([[1, 1, 1],
                     [1, 2, 1],
                     [2, 1, 1],
                     [2, 2, 1],
                     [1, 3, 2],
                     [2, 3, 2],
                     [2, 3, 3],
                     [4, 4, 4],
                     [4, 5, 4],
                     [4, 5, 5],
                     [5, 5, 5]])
     x = cpo.Objects()
     x.ijv = ijv
     labels = x.get_labels()
     self.assertEqual(len(labels), 2)
     unique_a = np.unique(labels[0][0])[1:]
     unique_b = np.unique(labels[1][0])[1:]
     for a in unique_a:
         self.assertTrue(a not in unique_b)
     for b in unique_b:
         self.assertTrue(b not in unique_a)
     for i, j, v in ijv:
         mylabels = labels[0][0] if v in unique_a else labels[1][0]
         self.assertEqual(mylabels[i, j], v)
Example #5
    def by_lblimg(self, lbldata):
        """
        Get specific template regions from ROIs given by the user.
        All regions overlapping a specific label region will be covered.

        Parameters:
        -----------
        lbldata: rois given by user

        Return:
        -------
        out_template: new template containing part of the regions.
                      If lbldata has multiple different ROIs, the new template extracts regions for each ROI given by the user.

        Example:
        --------
        >>> glr_cls = GetLblRegion(template)
        >>> out_template = glr_cls.by_lblimg(lbldata)
        """
        assert lbldata.shape == self._template.shape, "the shape of template should be equal to the shape of lbldata"
        labels = np.sort(np.unique(lbldata)[1:]).astype('int')
        out_template = np.zeros_like(lbldata)
        out_template = out_template[...,np.newaxis]
        out_template = np.tile(out_template, (1, len(labels)))
        for i,lbl in enumerate(labels):
            lbldata_tmp = tools.get_specificroi(lbldata, lbl)
            lbldata_tmp[lbldata_tmp!=0] = 1
            part_template = self._template*lbldata_tmp
            template_lbl = np.sort(np.unique(part_template)[1:])
            out_template[...,i] = tools.get_specificroi(self._template, template_lbl)
        return out_template
Example #6
    def evaluate(self):
        '''
        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
        :return: None
        '''
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params=p

        self._prepare()
        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        computeIoU = self.computeIoU
        self.ious = {(imgId, catId): computeIoU(imgId, catId) \
                        for imgId in p.imgIds
                        for catId in catIds}

        evaluateImg = self.evaluateImg
        maxDet = p.maxDets[-1]
        self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
                 for catId in catIds
                 for areaRng in p.areaRng
                 for imgId in p.imgIds
             ]
        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t=%0.2fs).' % (toc - tic))
Example #7
    def _pick_sources(self, data, include, exclude, eid):
        """Aux method."""
        fast_dot = _get_fast_dot()
        if exclude is None:
            exclude = self.exclude
        else:
            exclude = list(set(list(self.exclude) + list(exclude)))

        logger.info('Transforming to Xdawn space')

        # Apply unmixing
        sources = fast_dot(self.filters_[eid].T, data)

        if include not in (None, []):
            mask = np.ones(len(sources), dtype=np.bool)
            mask[np.unique(include)] = False
            sources[mask] = 0.
            logger.info('Zeroing out %i Xdawn components' % mask.sum())
        elif exclude not in (None, []):
            exclude_ = np.unique(exclude)
            sources[exclude_] = 0.
            logger.info('Zeroing out %i Xdawn components' % len(exclude_))
        logger.info('Inverse transforming to sensor space')
        data = fast_dot(self.patterns_[eid], sources)

        return data
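
The include/exclude bookkeeping above reduces to masking rows of the source matrix with np.unique-deduplicated indices; a standalone sketch of just that step (toy array and hypothetical component indices):

import numpy as np

sources = np.arange(12, dtype=float).reshape(4, 3)  # 4 components x 3 samples
include = [1, 3, 3]                                  # duplicates are harmless
mask = np.ones(len(sources), dtype=bool)
mask[np.unique(include)] = False                     # keep rows 1 and 3
sources[mask] = 0.                                   # zero out rows 0 and 2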
Example #8
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

    # Highlight test samples
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
Example #9
def _run_alg(data, agg_col, cat_cols, model, null_responses):
    """Runs an outlier detection algorithm, taking the model to use as input.
    
    Args:
        data: numpy.recarray or pandas.DataFrame containing the data.
        agg_col: string giving the name of aggregation unit column.
        cat_cols: list of the categorical column names for which outlier values should be computed.
        model: object implementing a compute_outlier_scores() method as described in the comments
            in the models section.
        null_responses: list of strings that should be considered to be null responses, i.e.,
            responses that will not be included in the frequency counts for a column.  This can
            be useful if, for example, there are response values that mean a question has been
            skipped.
    
    Returns:
        A dictionary of dictionaries, mapping (aggregation unit) -> (column name) ->
        (outlier score).
    """
    agg_units = sorted(np.unique(data[agg_col]))
    outlier_scores = collections.defaultdict(dict)
    for col in cat_cols:
        col_vals = sorted(np.unique(data[col]))
        col_vals = [c for c in col_vals if c not in null_responses]
        frequencies = {}
        for agg_unit in agg_units:
            frequencies[agg_unit] = _get_frequencies(data, col, col_vals, agg_col, agg_unit)
        outlier_scores_for_col = model.compute_outlier_scores(frequencies)
        for agg_unit in agg_units:
            outlier_scores[agg_unit][col] = outlier_scores_for_col[agg_unit]
    return outlier_scores
Example #10
def seems_like_discrete_data(arr, dictionary=None):
	if numpy.issubdtype(arr.dtype, numpy.bool_):
		#print('seems_like_discrete_data? YES bool')
		return True
	else:
		pass
		#print('seems_like_discrete_data? not bool but',arr.dtype)
	if dictionary is None:
		if len(numpy.unique(arr[:100]))<6:
			if len(numpy.unique(arr[:1000])) < 6:
				if len(numpy.unique(arr)) < 6:
					#print('seems_like_discrete_data? YES uniques < 6')
					return True
		#print('seems_like_discrete_data? too many and no dictionary')
	else:
		uniq = numpy.unique(arr)
		not_in_dict = 0
		for i in uniq:
			if i not in dictionary:
				not_in_dict += 1
		if not_in_dict > 2:
			#print(f'seems_like_discrete_data? dictionary but {not_in_dict} missing keys')
			return False
		else:
			#print(f'seems_like_discrete_data? dictionary with {not_in_dict} missing keys')
			return True
	return False
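
Quick checks of the heuristic above (assuming the function and its numpy import are in scope): a boolean array is always treated as discrete, a handful of unique values passes the < 6 test, and a long run of distinct values does not.

import numpy

print(seems_like_discrete_data(numpy.array([True, False, True])))   # True (bool dtype)
print(seems_like_discrete_data(numpy.array([1, 2, 3, 1, 2])))        # True (fewer than 6 uniques)
print(seems_like_discrete_data(numpy.arange(100)))                   # False (too many uniques)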
def getWords(imageloc,finalBoundingboxesFiltered):
	img=io.imread(imageloc)
	imgray=color.rgb2gray(img)
	horizontaldistances=[]
	verticaldistances=[]
	imgwidth=Image.open(imageloc).size[0]

	for i in range(0,len(finalBoundingboxesFiltered)):
		for j in range (i+1,len(finalBoundingboxesFiltered)):
			item1=finalBoundingboxesFiltered[i]
			item2=finalBoundingboxesFiltered[j]
			h=getDistHorizontal(item1,item2)
			v=getDistVertical(item1,item2)
			if h!=0:
				horizontaldistances.append(h)
			if v!=0:
				verticaldistances.append(v)
	global HORIZONTALTHRESHOLD
	global VERTICALTHRESHOLD

	#print horizontaldistances,verticaldistances
	HORIZONTALTHRESHOLD=sorted(np.unique(horizontaldistances))[2]+1
	VERTICALTHRESHOLD=sorted(np.unique(verticaldistances))[1]+1
	print("using horizontal and vertical thresholds", HORIZONTALTHRESHOLD, VERTICALTHRESHOLD)
	nomerges=1
	while(nomerges):
		(finalBoundingboxesFiltered,nomerges)=mergeOnce(imgray,finalBoundingboxesFiltered,imgwidth)	

	finalBoundingboxes=finalBoundingboxesFiltered
	return finalBoundingboxes	
Example #12
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)

            if isinstance(s, Index):
                exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
                tm.assert_index_equal(s.unique(), exp)
            else:
                exp = np.unique(np.array(s_values, dtype=np.object_))
                tm.assert_numpy_array_equal(s.unique(), exp)

            assert s.nunique() == 4
            # don't sort, have to sort after the fact as not sorting is
            # platform-dep
            hist = s.value_counts(sort=False).sort_values()
            expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)
Example #13
def mask_rowcols(a, axis=None):
    """
    Mask whole rows and/or columns of a 2D array that contain
    masked values.  The masking behavior is selected with the
    `axis` parameter.

        - If axis is None, rows and columns are masked.
        - If axis is 0, only rows are masked.
        - If axis is 1 or -1, only columns are masked.

    Parameters
    ----------
    axis : int, optional
        Axis along which to perform the operation.
        If None, applies to a flattened version of the array.

    Returns
    -------
     a *pure* ndarray.

    """
    a = asarray(a)
    if a.ndim != 2:
        raise NotImplementedError("mask_rowcols works for 2D arrays only.")
    m = getmask(a)
    # Nothing is masked: return a
    if m is nomask or not m.any():
        return a
    maskedval = m.nonzero()
    a._mask = a._mask.copy()
    if not axis:
        a[np.unique(maskedval[0])] = masked
    if axis in [None, 1, -1]:
        a[:, np.unique(maskedval[1])] = masked
    return a
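
The same behaviour is exposed publicly as numpy.ma.mask_rowcols; a short usage sketch of the axis semantics described in the docstring:

import numpy as np
import numpy.ma as ma

a = ma.array(np.arange(9).reshape(3, 3), mask=[[0, 0, 0], [0, 1, 0], [0, 0, 0]])
print(ma.mask_rowcols(a))
# With axis=None, row 1 and column 1 become fully masked;
# axis=0 would mask only row 1, axis=1 (or -1) only column 1.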
Example #14
    def __init__(self, filename, diets = False, ctrlgrp = 99):
        self.rawdata = pd.read_csv(filename, sep=" ")
        self.rawdata['days'] = self.rawdata['days']/365.0 - 1.0 # scale days

        # select subgroups
        if diets == False:  # select all diet groups
            self.data = self.rawdata
        else:
            self.data = self.rawdata[np.in1d(self.rawdata['diet'], diets)]

        # set parameters
        self.unidays = np.unique(self.data['days'])
        self.unidiets = np.unique(self.data['diet'])
        self.ctrlidx = np.where(self.unidiets == ctrlgrp)[0][0]
        self.uniids = np.unique(self.data['id'])
        self.grp = self.unidiets.size # total number of diets
        self.ntot = self.uniids.size # total number of mice
        self.grp_uniids = {}
        self.grp_ntot = {}
        self.grp_dtot = {}
        for g in self.unidiets:
            temp = self.data['id'][self.data['diet']==g]
            self.grp_uniids.update({g: np.unique(temp)})
            # number of total number of measurements in a group
            self.grp_dtot.update({g: temp.size})
            # number of unique ids in a group
            self.grp_ntot.update({g: self.grp_uniids[g].size})
        self.id_dtot = {}
        for i in self.uniids:
            temp = self.data['days'][self.data['id']==i]
            # number of measurements for each ids
            self.id_dtot.update({i: temp.size})
def promiscuity(S):
    '''
    PROMISCUITY      Promiscuity coefficient
    P = PROMISCUITY(S) calculates the promiscuity coefficient. The
    promiscuity of a temporal or multislice network corresponds to the
    fraction of all the communities in the network in which a node
    participates at least once.
    Inputs:     S,      pxn matrix of community assignments where p is the
                       number of slices/layers and n the number of nodes
    Outputs:    P,      Promiscuity coefficient
    Other m-files required: none
    Subfunctions: none
    MAT-files required: none
    _______________________________________________
    Marcelo G Mattar (08/21/2014)
    '''
    S = np.asarray(S)
    numNodes = np.shape(S)[1]
    numCommunities = len(np.unique(S))
    P = np.zeros((numNodes,1),dtype = np.double)

    for n in range(numNodes):

        # Notice that P=0 if it only participates in one community and P=1 if
        # it participates in every community of the network

        P[n,0] = np.double((len(np.unique(S[:,n]))-1)) / (numCommunities-1)

    return P
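
A small worked example of the coefficient above on a made-up 3-slice, 4-node assignment matrix: node 1 joins every community across slices and scores 1.0, while nodes that never switch score 0.0.

import numpy as np

S = np.array([[1, 1, 2, 2],
              [1, 2, 2, 2],
              [1, 3, 2, 2]])
print(promiscuity(S))  # [[0.], [1.], [0.], [0.]]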
Example #16
def main():
    '''Train the model and evaluate performance'''

    '''Load the MNIST training data. Also flatten the images from 28X28 arrays to a single vector'''
    images, labels = amitgroup.io.mnist.load_mnist('training', path='./', asbytes=True)
    images = [image.ravel() for image in images]

    '''Find unique labels and the first images that correspond to them'''
    indices = unique(labels, return_index=True)[1]

    '''Create the clustering engine. Use the unique images found above as centers'''
    clustering = Kmeans()
    clustering.train(data=images, centers=take(images, indices, axis=0), max_iterations=100)

    '''Load the testing data set and flatten the images'''
    test_images, test_labels = amitgroup.io.load_mnist('testing', path='./', asbytes=True)
    test_images = [image.ravel() for image in test_images]

    '''Assign the test data to clusters and evaluate the performance'''
    predictions = [clustering.cluster(image) for image in test_images]
    success = (predictions == test_labels)
    correct, counts = unique(success, return_counts=True)

    print('{} of the testing data set were put in the wrong cluster'.format(counts[0]))

    plot_images_separately([reshape(center, (28,28)) for center in clustering.centers])
Example #17
def detect_duplicates(file_name, dist_thr=0.1, FOV=(512, 512)):
    """
    Removes duplicate ROIs from file file_name

    Parameters:
    -----------
        file_name:  .zip file with all rois

        dist_thr:   distance threshold for duplicate detection

        FOV:        dimensions of the FOV

    Returns:
    --------
        duplicates  : list of indices with duplicate entries

        ind_keep    : list of kept indices

    """
    rois = nf_read_roi_zip(file_name, FOV)
    cm = [scipy.ndimage.center_of_mass(mm) for mm in rois]
    sp_rois = scipy.sparse.csc_matrix(
        np.reshape(rois, (rois.shape[0], np.prod(FOV))).T)
    D = distance_masks([sp_rois, sp_rois], [cm, cm], 10)[0]
    np.fill_diagonal(D, 1)
    indices = np.where(D < dist_thr)      # pairs of duplicate indices

    ind = list(np.unique(indices[1][indices[1] > indices[0]]))
    ind_keep = list(set(range(D.shape[0])) - set(ind))
    duplicates = list(np.unique(np.concatenate((indices[0], indices[1]))))

    return duplicates, ind_keep
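
The index bookkeeping at the end of the function is plain np.where/np.unique arithmetic; a toy sketch on a hand-written distance matrix (the values are hypothetical):

import numpy as np

D = np.array([[1.0, 0.05, 0.9],
              [0.05, 1.0, 0.8],
              [0.9, 0.8, 1.0]])
np.fill_diagonal(D, 1)
indices = np.where(D < 0.1)                                   # the close pair (0, 1)
ind = list(np.unique(indices[1][indices[1] > indices[0]]))    # drop ROI 1
ind_keep = list(set(range(D.shape[0])) - set(ind))            # keep ROIs 0 and 2
duplicates = list(np.unique(np.concatenate((indices[0], indices[1]))))  # ROIs 0 and 1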
Example #18
def clf_svm(X, Y):
    print(np.unique(Y))

    model = svm.SVC()
    print(model.fit(X[:-1], Y[:-1]))
    print(model.predict(X[-1:]))
    print(Y[-1])
Example #19
 def add_sgd_class(self, word, example):
    self.clfColor = SGDClassifier(loss="log", penalty="l2")
    self.clfShape = SGDClassifier(loss="log", penalty="l2")
    X_Color = [example['Color']]
    y_Color = [word]
    X_Shape = [example['Shape']]
    y_Shape = [word]
    for word in self.knownWords.keys():
       for classifier in self.knownWords[word]:
          if("Synonym" not in str(type(classifier))):
             examples = classifier.positiveExamples
             for ex in examples : 
                if("Color" in classifier._type_):
                   X_Color.append(ex['Color'])
                   y_Color.append(word)
                if("Shape" in classifier._type_):
                   X_Shape.append(ex['Shape'])
                   y_Shape.append(word)
    
    classes = np.unique(y_Color)
    self.clfColor.partial_fit(X_Color, y_Color,classes=classes)
    self.classColors = classes
    classes = np.unique(y_Shape)
    self.clfShape.partial_fit(X_Shape, y_Shape,classes=classes)
    self.classShapes = classes
Example #20
def print_results(labels, predictions):
    total = len(labels)
    num_correct = total - np.count_nonzero(np.subtract(predictions,labels))
    print("\n***** ACCURACY *****")
    print("Overall Accuracy: %.3f percent\n" % ((float(num_correct)/float(total)) * 100.0))

    results = pd.DataFrame()
    results['real'] = labels
    results['predicted'] = predictions

    for label in np.unique(labels):
        data = results[results['real'] == label]
        num_correct = len(data) - np.count_nonzero(data['real'].sub(data['predicted']))
        acc = ((float(num_correct)/float(len(data))) * 100.0)
        print("Total class label '%s' accuracy: %f percent" % (label, acc))
    print("")

    # Distribution graphs
    utils.print_distribution_graph(labels, 'Actual Distribution of Classes')
    utils.print_distribution_graph(predictions, 'Distribution of Predictions')

    # Distribution graphs for each class label
    for label in np.unique(labels):
        data = results[results['predicted'] == label]['real'].tolist()
        title = "When class label '%s' was predicted, the actual class was:" % label
        utils.print_distribution_graph(data, title)
Example #21
def test_volatility_communities():
    # Test volatility
    G = np.zeros([4, 4, 3])
    G[0, 1, [0, 2]] = 1
    G[2, 3, [0, 1]] = 1
    G[1, 2, [1, 2]] = 1
    G = G + G.transpose([1, 0, 2])
    communities = [0, 0, 1, 1]
    # global volatility
    v_bet = teneto.networkmeasures.volatility(
        G, calc='betweencommunities', communities=communities)
    v_within = teneto.networkmeasures.volatility(
        G, calc='withincommunities', communities=communities)
    v_communities = teneto.networkmeasures.volatility(
        G, calc='communities', communities=communities)
    if not len(v_bet) == G.shape[-1] - 1:
        raise AssertionError()
    if not len(v_within) == G.shape[-1] - 1:
        raise AssertionError()
    if not np.all(v_communities.shape == (
        len(np.unique(communities)), len(np.unique(communities)), G.shape[-1] - 1)):
        raise AssertionError()
    # Hardcode answer due to hamming distance and predefined matrix
    if not np.all(v_within == [0.5, 1]):
        raise AssertionError()
    if not np.all(v_bet == [0.25, 0]):
        raise AssertionError()
    if not np.all(v_communities[:, :, 0] == np.array([[1, 0.25], [0.25, 0]])):
        raise AssertionError()
    if not np.all(v_communities[:, :, 1] == np.array([[1, 0], [0, 1]])):
        raise AssertionError()
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    clustering = Ward(n_clusters=10, connectivity=connectivity,
                      memory=mkdtemp())
    clustering.fit(X)
    labels = clustering.labels_
    assert_true(np.size(np.unique(labels)) == 10)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10,
                      connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
def kmeans(xx, centroids, maxIters = 20, minclust=30, maxDiff = 2):

  # Cluster Assignment step
  ca = np.array([np.argmin([np.dot(x_i-y_k, x_i-y_k) for y_k in centroids]) for x_i in xx])
  # all clusters have at least minclust?
  (unique, counts) = np.unique(ca, return_counts=True)
  for cc in counts:
    if cc < minclust:
      return("error: too few", np.array(centroids), ca)
  # Move centroids step
  centroids = np.array([xx[ca == k].mean(axis = 0) for k in range(centroids.shape[0])])

  iter=1
  while (iter<maxIters):
      # Cluster Assignment step
      canew = np.array([np.argmin([np.dot(x_i-y_k, x_i-y_k) for y_k in centroids]) for x_i in xx])
      # all clusters have at least minclust?
      (unique, counts) = np.unique(canew, return_counts=True)
      for cc in counts:
        if cc < minclust:
          return("error: too few", np.array(centroids), canew)
      numdiff = sum(ca != canew)
      if numdiff < maxDiff:
        return("converged", np.array(centroids), canew)
      ca = canew
      # Move centroids step
      centroids = np.array([xx[ca == k].mean(axis = 0) for k in range(centroids.shape[0])])
      iter += 1

  return("error: not converged", np.array(centroids), ca)
Example #24
	def _get_obs_index_groups(self):
		" Computes index groups for given observation scheme. "

		J = np.zeros((self._p, self.num_subpops), dtype=bool) 

		def any_observed(x):
			return x.size > 0

		for i in np.where(list(map(any_observed, self._sub_pops)))[0]:
			J[self._sub_pops[i],i] = 1

		twoexp = np.power(2,np.arange(self.num_subpops))
		hsh = np.sum(J*twoexp,1)                     

		lbls = np.unique(hsh)

		idx_grp = []
		for i in range(lbls.size):
			idx_grp.append(np.where(hsh==lbls[i])[0])

		obs_idx = []
		for i in range(self.num_obstime):
			obs_idx.append([])
			for j in np.unique(hsh[np.where(J[:,self._obs_pops[i]]==1)]):
				obs_idx[i].append(np.where(lbls==j)[0][0])            

		return tuple(obs_idx), tuple(idx_grp)
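
A compact sketch of the subset-hashing idea used above: each variable gets one bit per subpopulation that observes it, and np.unique on the resulting hash groups variables that share exactly the same observation pattern (the J matrix below is made up).

import numpy as np

J = np.array([[1, 0],
              [1, 1],
              [1, 1],
              [0, 1]], dtype=bool)                      # p=4 variables, 2 subpops
hsh = np.sum(J * np.power(2, np.arange(2)), axis=1)     # [1, 3, 3, 2]
lbls = np.unique(hsh)                                   # [1, 2, 3]
idx_grp = [np.where(hsh == lbl)[0] for lbl in lbls]     # [array([0]), array([3]), array([1, 2])]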
def makeThresholdMap(image, findCars, scales=[1.5], percentOfHeapmapToToss=.5):
    print("scales:", scales, ", type:", type(scales), "image.shape:", image.shape, ", dtype:", image.dtype, ", percentOfHeapmapToToss:", percentOfHeapmapToToss)
    boundingBoxList=[]
    boundingBoxWeights=[]
    for scale in scales:
        listOfBoundingBoxes, listOfWeights = findCars(image, scale)
        boundingBoxList+=listOfBoundingBoxes
        boundingBoxWeights+=listOfWeights

    if USEBOUNDINGBOXWEIGHTS:
        unNormalizedHeatMap=addWeightedHeat(image.shape, boundingBoxList, boundingBoxWeights)
    else:
        unNormalizedHeatMap=addHeat(image.shape, boundingBoxList)

    if USESTACKOFHEATMAPS:
        unNormalizedHeatMap,_=totalHeatmapStack(unNormalizedHeatMap)


    unNormalizedHeatMapCounts=np.unique(unNormalizedHeatMap, return_counts=True)
    if TESTING: print("makeThresholdMap-unNormalizedHeatMapCounts:", unNormalizedHeatMapCounts, ", len(unNormalizedHeatMapCounts):", len(unNormalizedHeatMapCounts), ", len(unNormalizedHeatMapCounts[0]):", len(unNormalizedHeatMapCounts[0]))
    unNormalizedHeatMapMidpoint=unNormalizedHeatMapCounts[0][int(round(len(unNormalizedHeatMapCounts[0])*percentOfHeapmapToToss))]
    thresholdMap=applyThreshold(unNormalizedHeatMap, unNormalizedHeatMapMidpoint)
    print("makeThresholdMap-max(thresholdMap):", np.max(thresholdMap), ", min(thresholdMap):", np.min(thresholdMap))
    if TESTING: print("makeThresholdMap-thresholdMap counts:", (np.unique(thresholdMap, return_counts=True)), ", len(thresholdMap):", len(thresholdMap), ", len(thresholdMap[0]):", len(thresholdMap[0]))
    normalizedMap=normalizeMap(thresholdMap)
    if TESTING: print("makeThresholdMap-normalizedMap counts:", (np.unique(normalizedMap, return_counts=True)), ", len(normalizedMap):", len(normalizedMap), ", len(normalizedMap[0]):", len(normalizedMap[0]))
    print("makeThresholdMap-max(normalizedMap):", np.max(normalizedMap), ", min(normalizedMap):", np.min(normalizedMap))
    return normalizedMap, boundingBoxList, unNormalizedHeatMap, boundingBoxWeights
Example #26
def spatio_temporal_src_connectivity(src, n_times):
    """Compute connectivity for a source space activation over time

    Parameters
    ----------
    src : source space
        The source space.

    n_times : int
        Number of time instants

    Returns
    -------
    connectivity : sparse COO matrix
        The connectivity matrix describing the spatio-temporal
        graph structure. If N is the number of vertices in the
        source space, the N first nodes in the graph are the
        vertices are time 1, the nodes from 2 to 2N are the vertices
        during time 2, etc.

    """
    if src[0]['use_tris'] is None:
        raise Exception("The source space does not appear to be an ico "
                        "surface. Connectivity cannot be extracted from "
                        "non-ico source spaces.")
    lh_tris = np.searchsorted(np.unique(src[0]['use_tris']),
                              src[0]['use_tris'])
    rh_tris = np.searchsorted(np.unique(src[1]['use_tris']),
                              src[1]['use_tris'])
    tris = np.concatenate((lh_tris, rh_tris + np.max(lh_tris) + 1))
    return spatio_temporal_tris_connectivity(tris, n_times)
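
The lh_tris/rh_tris lines rely on the np.searchsorted(np.unique(x), x) idiom to relabel arbitrary vertex ids to a dense 0..N-1 range; a standalone illustration with made-up triangle indices:

import numpy as np

use_tris = np.array([[10, 40, 70],
                     [40, 70, 90]])
print(np.searchsorted(np.unique(use_tris), use_tris))
# [[0 1 2]
#  [1 2 3]]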
def evaluateSpeakerDiarization(flags, flagsGT):

	minLength = min( flags.shape[0], flagsGT.shape[0] )
	flags = flags[0:minLength]
	flagsGT = flagsGT[0:minLength]

	uFlags = numpy.unique(flags)
	uFlagsGT = numpy.unique(flagsGT)	

	# compute contingency table:
	cMatrix = numpy.zeros(( uFlags.shape[0], uFlagsGT.shape[0] ))
	for i in range(minLength):
		cMatrix[ int(numpy.nonzero(uFlags==flags[i])[0]), int(numpy.nonzero(uFlagsGT==flagsGT[i])[0]) ] += 1.0

	Nc, Ns = cMatrix.shape;
	N_s = numpy.sum(cMatrix,axis=0);
	N_c = numpy.sum(cMatrix,axis=1);
	N   = numpy.sum(cMatrix);

	purityCluster = numpy.zeros( (Nc,) )
	puritySpeaker = numpy.zeros( (Ns,) )
	# compute cluster purity:
	for i in range(Nc):
		purityCluster[i] = numpy.max( (cMatrix[i,:]) )/ (N_c[i]);

	for j in range(Ns):
		puritySpeaker[j] = numpy.max( (cMatrix[:,j]) )/ (N_s[j]);

	purityClusterMean = numpy.sum(purityCluster*N_c) / N;
	puritySpeakerMean = numpy.sum(puritySpeaker*N_s) / N;
	
	return purityClusterMean, puritySpeakerMean
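
A tiny worked example of the purity computation above (the frame-level labels are made up; assumes the function and its numpy import are in scope):

import numpy

flags   = numpy.array([0, 0, 0, 1, 1, 1])   # hypothesised speaker per frame
flagsGT = numpy.array([0, 0, 1, 1, 1, 1])   # ground-truth speaker per frame
print(evaluateSpeakerDiarization(flags, flagsGT))
# both purity means come out to 5/6 ~ 0.833 for this contingency table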
Example #28
def check_classifiers_classes(name, Classifier, X, y, y_names):
    if name in ["LabelPropagation", "LabelSpreading"]:
        # TODO some complication with -1 label
        y_ = y
    else:
        y_ = y_names

    classes = np.unique(y_)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    # fit
    try:
        classifier.fit(X, y_)
    except Exception as e:
        print(e)

    y_pred = classifier.predict(X)
    # training set performance
    assert_array_equal(np.unique(y_), np.unique(y_pred))
    accuracy = accuracy_score(y_, y_pred)
    assert_greater(accuracy, 0.78,
                   "accuracy %f of %s not greater than 0.78"
                   % (accuracy, name))
    #assert_array_equal(
        #clf.classes_, classes,
        #"Unexpected classes_ attribute for %r" % clf)
    if np.any(classifier.classes_ != classes):
        print("Unexpected classes_ attribute for %r: "
              "expected %s, got %s" %
              (classifier, classes, classifier.classes_))
Example #29
def check_and_set_idx(ids, idx, prefix):
    """ Reconciles passed-in IDs and indices and returns indices, as well as unique IDs
    in the order specified by the indices.  If only IDs supplied, returns the sort-arg
    as the index.  If only indices supplied, returns None for IDs.  If both supplied,
    checks that the correspondence is unique and returns unique IDs in the sort order of
    the associated index.
    :param np.ndarray ids: array of IDs
    :param np.ndarray[int] idx: array of indices
    :param str prefix: variable name (for error logging)
    :return: unique IDs and indices (passed in or derived from the IDs)
    :rtype: np.ndarray, np.ndarray
    """
    if ids is None and idx is None:
        raise ValueError('Both {}_ids and {}_idx cannot be None'.format(prefix, prefix))
    if ids is None:
        return None, np.asarray_chkfinite(idx)
    if idx is None:
        return np.unique(ids, return_inverse=True)
    else:
        ids = np.asarray(ids)
        idx = np.asarray_chkfinite(idx)
        if len(idx) != len(ids):
            raise ValueError('{}_ids ({}) and {}_idx ({}) must have the same length'.format(
                prefix, len(ids), prefix, len(idx)))
        uniq_idx, idx_sort_index = np.unique(idx, return_index=True)
        # make sure each unique index corresponds to a unique id
        if not all(len(set(ids[idx == i])) == 1 for i in uniq_idx):
            raise ValueError("Each index must correspond to a unique {}_id".format(prefix))
        return ids[idx_sort_index], idx
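
A minimal usage sketch of the IDs-only branch above, which is just np.unique with return_inverse=True:

import numpy as np

ids = np.array(["b", "a", "b", "c"])
uniq_ids, idx = check_and_set_idx(ids, None, "sample")
# uniq_ids -> ['a' 'b' 'c']; idx -> [1 0 1 2], i.e. each element's
# position within the sorted unique ids.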
Example #30
    def fit(self,X,y=None):
        """Fit a model: 

        Parameters
        ----------

        X : pandas dataframe or array-like
           training samples. If a pandas dataframe, it can handle a dict of features in one column or convert a set of columns
        y : array like, required for array-like X and not used presently for pandas dataframe
           class labels

        Returns
        -------
        self: object


        """
        if isinstance(X,pd.DataFrame):
            df = X
            if not self.dict_feature is None:
                if not self.target_readable is None:
                    self.create_class_id_map(df,self.target,self.target_readable)
                (X,y) = self._load_from_dict(df)
                num_class = len(np.unique(y))
            else:
                (X,y,self.vectorizer) = self.convert_numpy(df)
                num_class = len(y.unique())
        else:
            check_X_y(X,y)
            num_class = len(np.unique(y))

        self.clf = xgb.XGBClassifier(**self.params)
        print(self.clf.get_params(deep=True))
        self.clf.fit(X,y,verbose=True)
        return self
Example #31
def fit(X, cluster='agglomerative', metric='euclidean', linkage='ward', min_clust=2, max_clust=25, Z=None, savemem=False, verbose=3):
    """ Determine optimal number of clusters using dbindex.

    Description
    -----------
    This function returns the cluster labels for the optimal cut-off based on the chosen hierarchical clustering method.

    Parameters
    ----------
    X : Numpy-array.
        The rows are the features and the columns are the samples.
    cluster : str, (default: 'agglomerative')
        Clustering method type for clustering.
            * 'agglomerative'
            * 'kmeans'
    metric : str, (default: 'euclidean').
        Distance measure for the clustering, such as 'euclidean','hamming', etc.
    linkage : str, (default: 'ward')
        Linkage type for the clustering.
        'ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median'.
    min_clust : int, (default: 2)
        Minimum number of clusters (>=).
    max_clust : int, (default: 25)
        Maximum number of clusters (<=).
    Z : Object, (default: None).
        This will speed-up computation if you readily have Z. e.g., Z=linkage(X, method='ward', metric='euclidean').
    savemem : bool, (default: False)
        Save memory when working with large datasets. Note that this option only applies to KMeans.
    verbose : int, optional (default: 3)
        Print message to screen [1-5]. The larger the number, the more information.

    Returns
    -------
    dict with various keys. Note that the keys below can change based on the method used.
    method: str
        Method name that is used for cluster evaluation.
    score: pd.DataFrame()
        The scoring values per clusters.
    labx: list
        Cluster labels.
    fig: list
        Relevant information to make the plot.

    Examples
    --------
    >>> # Import library
    >>> import clusteval.dbindex as dbindex
    >>> from sklearn.datasets import make_blobs
    >>> # Generate demo data
    >>> X, labels_true = make_blobs(n_samples=750, centers=6, n_features=10)
    >>> # Fit with default parameters
    >>> results = dbindex.fit(X)
    >>> # plot
    >>> dbindex.plot(results)
    """
    # Make dictionary to store Parameters
    Param = {}
    Param['verbose'] = verbose
    Param['cluster'] = cluster
    Param['metric'] = metric
    Param['linkage'] = linkage
    Param['min_clust'] = min_clust
    Param['max_clust'] = max_clust
    Param['savemem'] = savemem
    if verbose>=3: print('[clusteval] >Evaluate using dbindex.')

    # Savemem for kmeans
    if Param['cluster']=='kmeans':
        if Param['savemem']:
            kmeansmodel=MiniBatchKMeans
            print('[clusteval] >Save memory enabled for kmeans.')
        else:
            kmeansmodel=KMeans

    # Cluster hierarchical using on metric/linkage
    if (Z is None) and (Param['cluster']!='kmeans'):
        Z = scipy_linkage(X, method=Param['linkage'], metric=Param['metric'])

    # Setup storing parameters
    clustcutt = np.arange(Param['min_clust'], Param['max_clust'])
    scores = np.zeros((len(clustcutt))) * np.nan
    dbclust = np.zeros((len(clustcutt))) * np.nan
    clustlabx = []

    # Run over all cluster cutoffs
    for i in tqdm(range(len(clustcutt))):
        # Cut the dendrogram for i clusters
        if Param['cluster']=='kmeans':
            labx=kmeansmodel(n_clusters=clustcutt[i], verbose=0).fit(X).labels_
        else:
            labx = fcluster(Z, clustcutt[i], criterion='maxclust')

        # Store labx for cluster-cut
        clustlabx.append(labx)
        # Store number of unique clusters
        dbclust[i]=len(np.unique(labx))
        # Compute the Davies-Bouldin index (can only be done if more than 1 cluster)
        if dbclust[i]>1:
            scores[i]=_dbindex_score(X, labx)

    # Convert to array
    clustlabx = np.array(clustlabx)

    # Store only if agrees to restriction of input clusters number
    I1 = np.isnan(scores)==False
    I2 = dbclust>=Param['min_clust']
    I3 = dbclust<=Param['max_clust']
    Iloc = I1 & I2 & I3

    # Get only clusters of interest
    if len(Iloc)>0:
        scores = scores[Iloc]
        dbclust = dbclust[Iloc]
        clustlabx = clustlabx[Iloc, :]
        clustcutt = clustcutt[Iloc]
        idx = np.argmin(scores)
        clustlabx = clustlabx[idx, :] - 1
    else:
        if verbose>=3: print('[clusteval] >No clusters detected.')

    # Store results
    results = {}
    results['method'] = 'dbindex'
    results['score'] = pd.DataFrame(np.array([dbclust, scores]).T, columns=['clusters', 'score'])
    results['score'].clusters = results['score'].clusters.astype(int)
    results['labx'] = clustlabx
    results['fig'] = {}
    results['fig']['dbclust'] = dbclust
    results['fig']['scores'] = scores
    results['fig']['clustcutt'] = clustcutt

    # Return
    return(results)
Example #32
    def train(self):

        # Instantiate the dataset class
        data = dataset_badGAN(
            num_classes=F.num_classes,
            extraction_step=self.extraction_step,
            number_images_training=F.number_train_images,
            batch_size=F.batch_size,
            patch_shape=self.patch_shape,
            number_unlab_images_training=F.number_train_unlab_images,
            data_directory=F.data_directory)

        # Optimizer operations
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            d_optim = tf.train.AdamOptimizer(F.learning_rate_D, beta1=F.beta1D)\
                        .minimize(self.d_loss,var_list=self.d_vars)
            g_optim = tf.train.AdamOptimizer(F.learning_rate_G, beta1=F.beta1G)\
                        .minimize(self.g_loss,var_list=self.g_vars)
            if F.badGAN:
                e_optim = tf.train.AdamOptimizer(F.learning_rate_E, beta1=F.beta1E)\
                          .minimize(self.g_loss,var_list=self.e_vars)

        tf.global_variables_initializer().run()

        # Load checkpoints if required
        if F.load_chkpt:
            try:
                load_model(F.checkpoint_dir, self.sess, self.saver)
                print("\n [*] Checkpoint loaded successfully!")
            except:
                print("\n [!] Checkpoint loading failed!")
        else:
            print("\n [*] Checkpoint load not required.")

        # Load the validation data
        patches_val, labels_val_patch, labels_val = preprocess_dynamic_lab(
            F.data_directory,
            F.num_classes,
            self.extraction_step,
            self.patch_shape,
            F.number_train_images,
            validating=F.training,
            testing=F.testing,
            num_images_testing=F.number_test_images)

        predictions_val = np.zeros((patches_val.shape[0], self.patch_shape[0],
                                    self.patch_shape[1], self.patch_shape[2]),
                                   dtype="uint8")
        max_par = 0.0
        max_loss = 100
        for epoch in range(int(F.epoch)):
            idx = 0
            batch_iter_train = data.batch_train()
            total_val_loss = 0
            total_train_loss_CE = 0
            total_train_loss_UL = 0
            total_train_loss_FK = 0
            total_gen_FMloss = 0

            for patches_lab, patches_unlab, labels in batch_iter_train:
                # Network update
                sample_z_gen = np.random.uniform(
                    -1, 1, [F.batch_size, F.noise_dim]).astype(np.float32)

                _ = self.sess.run(d_optim,
                                  feed_dict={
                                      self.patches_lab: patches_lab,
                                      self.patches_unlab: patches_unlab,
                                      self.z_gen: sample_z_gen,
                                      self.labels: labels,
                                      self.phase: True
                                  })

                if F.badGAN:
                    _, _ = self.sess.run(
                        [e_optim, g_optim],
                        feed_dict={
                            self.patches_unlab: patches_unlab,
                            self.z_gen: sample_z_gen,
                            self.z_gen: sample_z_gen,
                            self.phase: True
                        })
                else:
                    _ = self.sess.run(g_optim,
                                      feed_dict={
                                          self.patches_unlab: patches_unlab,
                                          self.z_gen: sample_z_gen,
                                          self.z_gen: sample_z_gen,
                                          self.phase: True
                                      })

                feed_dict = {
                    self.patches_lab: patches_lab,
                    self.patches_unlab: patches_unlab,
                    self.z_gen: sample_z_gen,
                    self.labels: labels,
                    self.phase: True
                }

                # Evaluate losses for plotting/printing purposes
                d_loss_lab = self.d_loss_lab.eval(feed_dict)
                d_loss_unlab_true = self.true_loss.eval(feed_dict)
                d_loss_unlab_fake = self.fake_loss.eval(feed_dict)
                g_loss_fm = self.g_loss_fm.eval(feed_dict)

                total_train_loss_CE = total_train_loss_CE + d_loss_lab
                total_train_loss_UL = total_train_loss_UL + d_loss_unlab_true
                total_train_loss_FK = total_train_loss_FK + d_loss_unlab_fake
                total_gen_FMloss = total_gen_FMloss + g_loss_fm

                idx += 1
                if F.badGAN:
                    vi_loss = self.vi_loss.eval(feed_dict)
                    print((
                        "Epoch:[%2d] [%4d/%4d] Labeled loss:%.2e Unlabeled loss:%.2e Fake loss:%.2e Generator FM loss:%.8f Generator VI loss:%.8f\n"
                    ) % (epoch, idx, data.num_batches, d_loss_lab,
                         d_loss_unlab_true, d_loss_unlab_fake, g_loss_fm,
                         vi_loss))
                else:
                    print((
                        "Epoch:[%2d] [%4d/%4d] Labeled loss:%.2e Unlabeled loss:%.2e Fake loss:%.2e Generator loss:%.8f \n"
                    ) % (epoch, idx, data.num_batches, d_loss_lab,
                         d_loss_unlab_true, d_loss_unlab_fake, g_loss_fm))

            # Save the current model
            save_model(F.checkpoint_dir, self.sess, self.saver)

            avg_train_loss_CE = total_train_loss_CE / (idx * 1.0)
            avg_train_loss_UL = total_train_loss_UL / (idx * 1.0)
            avg_train_loss_FK = total_train_loss_FK / (idx * 1.0)
            avg_gen_FMloss = total_gen_FMloss / (idx * 1.0)

            print('\n\n')

            total_batches = int(patches_val.shape[0] / F.batch_size)
            print("Total number of batches for validation: ", total_batches)

            # Prediction of validation patches
            for batch in range(total_batches):
                patches_feed = patches_val[batch * F.batch_size:(batch + 1) *
                                           F.batch_size, :, :, :, :]
                labels_feed = labels_val_patch[batch *
                                               F.batch_size:(batch + 1) *
                                               F.batch_size, :, :, :]
                feed_dict = {
                    self.patches_lab: patches_feed,
                    self.labels: labels_feed,
                    self.phase: False
                }
                preds = self.Val_output.eval(feed_dict)
                val_loss = self.d_loss_lab.eval(feed_dict)

                predictions_val[batch * F.batch_size:(batch + 1) *
                                F.batch_size, :, :, :] = preds
                print(("Validated Patch:[%8d/%8d]") % (batch, total_batches))
                total_val_loss = total_val_loss + val_loss

            # To compute the average patch-wise validation loss (cross-entropy loss)
            avg_val_loss = total_val_loss / (total_batches * 1.0)

            print("All validation patches Predicted")

            print("Shape of predictions_val, min and max:",
                  predictions_val.shape, np.min(predictions_val),
                  np.max(predictions_val))

            # To stitch back the patches into an entire image
            val_image_pred = recompose3D_overlap(predictions_val, 144, 192,
                                                 256, self.extraction_step[0],
                                                 self.extraction_step[1],
                                                 self.extraction_step[2])
            val_image_pred = val_image_pred.astype('uint8')

            print("Shape of Predicted Output Groundtruth Images:",
                  val_image_pred.shape, np.unique(val_image_pred),
                  np.unique(labels_val), np.mean(val_image_pred),
                  np.mean(labels_val))

            pred2d = np.reshape(val_image_pred,
                                (val_image_pred.shape[0] * 144 * 192 * 256))
            lab2d = np.reshape(labels_val,
                               (labels_val.shape[0] * 144 * 192 * 256))

            # For printing the validation results
            F1_score = f1_score(lab2d, pred2d, [0, 1, 2, 3], average=None)
            print("Validation Dice Coefficient.... ")
            print("Background:", F1_score[0])
            print("CSF:", F1_score[1])
            print("GM:", F1_score[2])
            print("WM:", F1_score[3])

            # To Save the best model
            if (max_par < (F1_score[2] + F1_score[3])):
                max_par = (F1_score[2] + F1_score[3])
                save_model(F.best_checkpoint_dir, self.sess, self.saver)
                print("Best checkpoint updated from validation results.")

            # To save the losses for plotting
            print("Average Validation Loss:", avg_val_loss)
            with open('Val_loss_GAN.txt', 'a') as f:
                f.write('%.2e \n' % avg_val_loss)
            with open('Train_loss_CE.txt', 'a') as f:
                f.write('%.2e \n' % avg_train_loss_CE)
            with open('Train_loss_UL.txt', 'a') as f:
                f.write('%.2e \n' % avg_train_loss_UL)
            with open('Train_loss_FK.txt', 'a') as f:
                f.write('%.2e \n' % avg_train_loss_FK)
            with open('Train_loss_FM.txt', 'a') as f:
                f.write('%.2e \n' % avg_gen_FMloss)
        return
Example #33
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-NN Classifier (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
Example #34
        self.nfft = nfft
        self.nfeat = nfeat
        self.rate = rate
        self.step = int(rate/10)
        

df = pd.read_csv('office_sounds.csv')
#print(df)
df.set_index('slice_file_name', inplace=True)

for f in df.index:
    rate, signal = wavfile.read('clean/'+f)
    df.at[f, 'length'] = signal.shape[0]/rate
print(df)

classes = list(np.unique(df.class_name))
class_dist = df.groupby(['class_name'])['length'].mean()

n_samples = 2*int(df['length'].sum()/0.1)
prob_dist = class_dist / class_dist.sum()
choices = np.random.choice(class_dist.index, p=prob_dist)

#print(choices)

df.reset_index(inplace=True)

config = Config(mode='conv')

if config.mode == 'conv':
    X, y = build_rand_feat()
    y_flat = np.argmax(y, axis=1)
Example #35
    def _frienemy_pruning(self, neighbors):
        """Implements the Online Pruning method (frienemy) to remove base
        classifiers that do not cross the region of competence. We consider
        that a classifier crosses the region of competence if it correctly
        classifies at least one sample for each different class in the region.

        Returns
        -------
        DFP_mask : array of shape = [n_samples, n_classifiers]
                   Mask containing 1 for the selected base classifier and 0
                   otherwise.

        neighbors : array of shape = [n_samples, n_neighbors]
                    indices of the k nearest neighbors according to each
                    instance

        References
        ----------
        Oliveira, D.V.R., Cavalcanti, G.D.C. and Sabourin, R., Online Pruning
        of Base Classifiers for Dynamic Ensemble Selection,
        Pattern Recognition, vol. 72, December 2017, pp 44-58.
        """
        # using a for loop for processing a batch of samples temporarily.
        # Change later to numpy processing
        if neighbors.ndim < 2:
            neighbors = np.atleast_2d(neighbors)

        n_samples, _ = neighbors.shape
        mask = np.zeros((n_samples, self.n_classifiers_))

        for sample_idx in range(n_samples):
            # Check if query is in a indecision region
            neighbors_y = self.DSEL_target_[
                neighbors[sample_idx, :self.safe_k]]

            if len(set(neighbors_y)) > 1:
                # There is more than one class in the region of competence
                # (So it is an indecision region).

                # Check if the base classifier predict the correct label for
                # a sample belonging to each class.
                for clf_index in range(self.n_classifiers_):
                    predictions = self.DSEL_processed_[
                        neighbors[sample_idx, :self.safe_k], clf_index]
                    correct_class_pred = [self.DSEL_target_[index] for
                                          count, index in
                                          enumerate(neighbors[sample_idx,
                                                    :self.safe_k])
                                          if predictions[count] == 1]

                    # If that is true, it means that it correctly classified
                    # at least one neighbor for each class in
                    # the region of competence
                    if np.unique(correct_class_pred).size > 1:
                        mask[sample_idx, clf_index] = 1.0
                # Check if all classifiers were pruned
                if not np.count_nonzero(mask[sample_idx, :]):
                    # Do not apply the pruning mechanism.
                    mask[sample_idx, :] = 1.0

            else:
                # The sample is located in a safe region. All base classifiers
                # can predict the label
                mask[sample_idx, :] = 1.0

        return mask
 def max_instance(Y):
   unique, counts = np.unique(Y, return_counts=True)
   max_idx = np.argmax(counts)
   return unique[max_idx]
 def makemap(arr):
   tag = np.unique(arr)
   tag_map = Dict(zip(tag, range(len(tag))))
   return tag_map
 def entropy(y):
   N = len(y)
   count = np.array([len(y[y == k]) for k in np.unique(y)])
   entro = -np.sum((count / N) * (np.log2(count / N)))
   return entro
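
Quick checks of the two np.unique-based helpers above (the labels are made up; makemap additionally assumes the Dict alias from the original module):

import numpy as np

y = np.array([0, 0, 1, 1, 1, 2])
print(max_instance(y))  # 1, the most frequent label
print(entropy(y))       # ~1.459 bits for class counts (2, 3, 1)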
  print(raw_traj.shape, raw_pixel.shape)
  raw_traj = np.delete(raw_traj, [3, 6], axis=1)
  raw_pixel = np.delete(raw_pixel, [3, 6], axis=1)
  print(raw_traj.shape, raw_pixel.shape)
  raw_video = cv2.VideoCapture(os.path.join(dataset['data_path'], 'video.avi'))
  # raw_video.set(cv2.CAP_PROP_POS_FRAMES, 1000)
  # flag, image = raw_video.read()

  h_matrix = np.genfromtxt(os.path.join(dataset['data_path'], 'H.txt'))
  obs_len = params['history_num_frames']
  pred_len = params['future_num_frames']
  seq_len = obs_len + pred_len
  print(obs_len, pred_len, seq_len)

  frames = np.unique(raw_traj[:, 0]).tolist()
  frame_data = []
  frame_data_pixel = []
  for frame in frames:
    frame_data.append(raw_traj[frame == raw_traj[:, 0], :])  # regroup rows of raw_traj into frame_data by frame id
    frame_data_pixel.append(raw_pixel[frame == raw_pixel[:, 0], :])
  num_sequences = int(math.ceil(len(frames) - seq_len + 1))  # total number of slices = number of frames minus seq_len
  i = 0
  for idx in range(0, num_sequences + 1):  # iterate over this dataset and extract all pedestrian data per frame
    curr_seq_data = np.concatenate(frame_data[idx:idx + seq_len], axis=0)  # take seq_len consecutive frames starting from the current frame
    curr_seq_data_pixel = np.concatenate(frame_data_pixel[idx:idx + seq_len], axis=0)
    curr_frame = frame_data[idx+obs_len][0][0]
    peds_in_curr_seq = np.unique(curr_seq_data[:, 1])  # unique pedestrian ids in the current window
    # iterate over pedestrians across the seq_len frames and extract each one's coordinates
    for _, ped_id in enumerate(peds_in_curr_seq):
      curr_ped_seq = curr_seq_data[curr_seq_data[:, 1] == ped_id, :]  # trajectory of this pedestrian id
Example #40
0
def main_clf(metric_,
             clf_,
             grid_,
             range_=(2, 7),
             cv_=5,
             verb_=False,
             graphs=False):
    pipe = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf_)])
    max_scoring = 0
    for k in range(*range_):
        denue_wide = pd.read_csv(f"summary/Count/denue_wide_{k}.csv")  ###
        rezago = pd.read_csv("rezago_social/rezago_social.csv")
        rezago_social = rezago[[
            "lgc00_15cl3_2", "Key", "POB_TOTAL", "LAT", "LON"
        ]]
        df = pd.merge(rezago_social, denue_wide, on=['Key'])
        y = rezago_social['lgc00_15cl3_2']
        df.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"], axis=1, inplace=True)
        X = df.div(df.POB_TOTAL, axis=0) * 1000
        X.drop(["POB_TOTAL"], axis=1, inplace=True)
        X["LAT"] = rezago_social["LAT"]
        X["LON"] = rezago_social["LON"]
        print(f'# CLF {k} {X.shape}')
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            test_size=0.20,
                                                            random_state=0)
        clf_cv = GridSearchCV(pipe,
                              grid_,
                              cv=cv_,
                              scoring=metric_,
                              verbose=verb_)  # cv_
        clf_cv.fit(X_train, y_train)
        if np.mean(clf_cv.best_score_) > max_scoring:
            max_scoring = clf_cv.best_score_
            print(f"\t # {k} CLF {clf_cv.best_score_} {clf_cv.best_params_}")
            best_params = clf_cv.best_params_
            best_k = k
            Xtrain, ytrain = X_train, y_train
            Xtest, ytest = X_test, y_test
            X_, y_ = X, y
    best_params_ = {k[5:]: v for k, v in best_params.items()}
    best_clf = clf_.set_params(**best_params_)
    best_pipe = Pipeline(steps=[('sc', StandardScaler()), ('clf', best_clf)])
    print('#BEST', best_pipe, max_scoring)
    best_pipe.fit(Xtrain, ytrain)
    print(f"# {best_k}: Train:{best_pipe.score(Xtrain, ytrain) * 100}")
    print(f"# {best_k}: Test:{best_pipe.score(Xtest, ytest) * 100}")
    scores = cross_val_score(best_pipe,
                             X_,
                             y_,
                             cv=cv_,
                             n_jobs=-1,
                             scoring='accuracy')
    print(f"# {best_k}: Accuracy CV5:{np.mean(scores)} +/- {np.std(scores)}")
    scores_ = cross_val_score(best_pipe,
                              X_,
                              y_,
                              cv=cv_,
                              n_jobs=-1,
                              scoring=metric_)
    print(
        f"# {best_k}: {metric_} CV5:{np.mean(scores_)} +/- {np.std(scores_)}")
    y_pred = cross_val_predict(best_pipe, X_, y_, cv=cv_)
    print(classification_report(y_, y_pred, digits=3))

    print(np.unique(np.array(y_pred), return_counts=True))

    if graphs:
        # plot_multiclass_roc(best_pipe, X_, y_, n_classes=3, figsize=(16, 10))
        probas = cross_val_predict(best_pipe,
                                   X_,
                                   y_,
                                   cv=cv_,
                                   method='predict_proba')
        fig, (ax1, ax2) = plt.subplots(1, 2)
        skplt.metrics.plot_roc(y_, probas, ax=ax1, title='')
        handles, labels = ax1.get_legend_handles_labels()
        # print(labels)
        labels = [
            lb.replace(' 1 ', ' A ').replace(' 2 ',
                                             ' M ').replace(' 3 ', ' B ')
            for lb in labels
        ]
        # print(labels)
        ax1.legend(handles, labels)
        ax1.get_figure()
        ax1.set_xlabel('TFP\n(A)')
        skplt.metrics.plot_precision_recall(y_, probas, ax=ax2, title='')
        handles, labels = ax2.get_legend_handles_labels()
        # print(labels)
        labels = [
            lb.replace(' 1 ', ' A ').replace(' 2 ',
                                             ' M ').replace(' 3 ', ' B ')
            for lb in labels
        ]
        # print(labels)
        ax2.legend(handles, labels)
        ax2.get_figure()
        ax2.set_xlabel('S\n(B)')
        plt.show()

        ### 2016
        denue_2016 = pd.read_csv(
            f"summary/201610/denue_wide_{best_k}.csv")  ###
        df_2016 = pd.merge(rezago_social, denue_2016, on=['Key'])
        df_2016.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"],
                     axis=1,
                     inplace=True)
        X_2016 = df_2016.div(df.POB_TOTAL, axis=0) * 1000
        X_2016.drop(["POB_TOTAL"], axis=1, inplace=True)
        X_2016["LAT"] = rezago_social["LAT"]
        X_2016["LON"] = rezago_social["LON"]
        print(X_2016.columns)
        y_pred_2016 = best_pipe.predict(X_2016)
        ### 2017
        denue_2017 = pd.read_csv(
            f"summary/201711/denue_wide_{best_k}.csv")  ###
        df_2017 = pd.merge(rezago_social, denue_2017, on=['Key'])
        df_2017.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"],
                     axis=1,
                     inplace=True)
        X_2017 = df_2017.div(df.POB_TOTAL, axis=0) * 1000
        X_2017.drop(["POB_TOTAL"], axis=1, inplace=True)
        X_2017["LAT"] = rezago_social["LAT"]
        X_2017["LON"] = rezago_social["LON"]
        y_pred_2017 = best_pipe.predict(X_2017)
        # ### 2018
        # denue_2018 = pd.read_csv(f"summary/201811/denue_wide_{best_k}.csv")  ###
        # df_2018 = pd.merge(rezago_social, denue_2018, on=['Key'])
        # df_2018.drop(["lgc00_15cl3", "Key", "LAT", "LON"], axis=1, inplace=True)
        # X_2018 = df_2018.div(df.POB_TOTAL, axis=0) * 1000
        # X_2018.drop(["POB_TOTAL"], axis=1, inplace=True)
        # X_2018["LAT"] = rezago_social["LAT"]
        # X_2018["LON"] = rezago_social["LON"]
        # y_pred_2018 = best_pipe.predict(X_2018)
        # ### 2019
        # denue_2019 = pd.read_csv(f"summary/201911/denue_wide_{best_k}.csv")  ###
        # df_2019 = pd.merge(rezago_social, denue_2019, on=['Key'])
        # df_2019.drop(["lgc00_15cl3", "Key", "LAT", "LON"], axis=1, inplace=True)
        # X_2019 = df_2019.div(df.POB_TOTAL, axis=0) * 1000
        # X_2019.drop(["POB_TOTAL"], axis=1, inplace=True)
        # X_2019["LAT"] = rezago_social["LAT"]
        # X_2019["LON"] = rezago_social["LON"]
        # y_pred_2019 = best_pipe.predict(X_2019)
        # ### 2020
        # denue_2020 = pd.read_csv(f"summary/202011/denue_wide_{best_k}.csv")  ###
        # df_2020 = pd.merge(rezago_social, denue_2020, on=['Key'])
        # df_2020.drop(["lgc00_15cl3", "Key", "LAT", "LON"], axis=1, inplace=True)
        # X_2020 = df_2020.div(df.POB_TOTAL, axis=0) * 1000
        # X_2020.drop(["POB_TOTAL"], axis=1, inplace=True)
        # X_2020["LAT"] = rezago_social["LAT"]
        # X_2020["LON"] = rezago_social["LON"]
        # y_pred_2020 = best_pipe.predict(X_2020)
        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_,
                                            y_pred,
                                            normalize=True,
                                            title=" ")
        plt.xticks([0, 1, 2], ['B', 'M', 'A'], rotation='horizontal')
        plt.yticks([0, 1, 2], ['B', 'M', 'A'], rotation='horizontal')
        plt.xlabel('Clases predichas')
        plt.ylabel('Clases verdaderas')
        plt.show()
        # Mapa
        rezago_social['Pred'] = y_pred
        rezago_social['Pred_2016'] = y_pred_2016
        rezago_social['Pred_2017'] = y_pred_2017
        # rezago_social['Pred_2018'] = y_pred_2018
        # rezago_social['Pred_2019'] = y_pred_2019
        # rezago_social['Pred_2020'] = y_pred_2020
        rezago_social.to_csv('predictions.csv')  ###
        rezago_social['Key_'] = rezago_social['Key'].astype(str).str.zfill(5)
        gdf = gpd.read_file('municipios/areas_geoestadisticas_municipales.shp')
        gdf['Key_'] = gdf['CVE_ENT'] + gdf['CVE_MUN']
        gdf = gdf.merge(rezago_social, on='Key_')
        legend_elements = [
            Line2D(
                [0],
                [0],
                marker='o',
                color='w',
                label='B',
                markerfacecolor='g',
                markersize=10,
            ),
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='M',
                   markerfacecolor='yellow',
                   markersize=10),
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='A',
                   markerfacecolor='r',
                   markersize=10)
        ]
        csfont = {'fontname': 'Times New Roman'}
        font = font_manager.FontProperties(family='Times New Roman',
                                           weight='normal',
                                           style='normal',
                                           size=12)
        colors = {3: 'green', 2: 'yellow', 1: 'red'}
        models = {
            'RandomForestClassifier': 'RF',
            'SVC': 'SVM',
            'LogisticRegression': 'LR'
        }
        ###
        # gdf.plot(color=gdf['Pred_2016'].map(colors))
        # plt.xticks([])
        # plt.yticks([])
        # txt = f"Categorías predichas por modelo {models.get(clf.__class__.__name__, 'ABC')}, para el año 201X."
        # plt.text(800000, 0.01, txt, wrap=True, horizontalalignment='left', fontsize=12, **csfont)
        # plt.legend(handles=legend_elements, prop=font)
        # plt.show()
        ### Mapa
        fig, (ax1, ax2) = plt.subplots(1, 2)
        gdf.plot(ax=ax1, color=gdf['Pred_2016'].map(colors))
        ax1.set_xticks([])
        ax1.set_yticks([])
        # txt = f"(A) Clases predichas con modelo {models.get(clf.__class__.__name__, 'ABC')} en 2016"
        ax1.set_xlabel("(A)", **csfont)
        # ax1.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax1.legend(handles=legend_elements, prop=font)
        gdf.plot(ax=ax2, color=gdf['Pred_2017'].map(colors))
        ax2.set_xticks([])
        ax2.set_yticks([])
        # txt = f"(B) Clases predichas con modelo {models.get(clf.__class__.__name__, 'ABC')} en 2017"
        ax2.set_xlabel("(B)", **csfont)
        # ax2.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax2.legend(handles=legend_elements, prop=font)
        plt.show()

        ### Mapa
        fig, (ax1, ax2) = plt.subplots(1, 2)
        gdf.plot(ax=ax1, color=gdf['lgc00_15cl3_2'].map(colors), legend=True)
        ax1.set_xticks([])
        ax1.set_yticks([])
        # txt = "(A) Clases de acuerdo a Valdés-Cruz y Vargas-Chanes (2017)"
        ax1.set_xlabel("(A)", **csfont)
        # ax1.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax1.legend(handles=legend_elements, prop=font)
        gdf.plot(ax=ax2, color=gdf['Pred'].map(colors))
        ax2.set_xticks([])
        ax2.set_yticks([])
        # txt = f"(B) Clases predichas con modelo {models.get(clf.__class__.__name__, 'ABC')} en 2015"
        ax2.set_xlabel("(B)", **csfont)
        # ax2.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax2.legend(handles=legend_elements, prop=font)
        plt.show()
        # Curva ROC
        y_bin = label_binarize(y, classes=[1, 2, 3])
        n_classes = y_bin.shape[1]
        y_score = cross_val_predict(best_pipe,
                                    X_,
                                    y_,
                                    cv=cv_,
                                    method='predict_proba')
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        plt.figure()
        plt.plot(fpr["macro"],
                 tpr["macro"],
                 label='ROC macro (AUC = {0:0.3f})'
                 ''.format(roc_auc["macro"]),
                 color='navy',
                 linestyle=':',
                 linewidth=4)
        rezago = {1: 'B', 2: 'M', 3: 'A'}
        colors = cycle(['green', 'yellow', 'red'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i],
                     tpr[i],
                     color=color,
                     lw=2,
                     label='Clase de rezago {0} (AUC = {1:0.3f})'
                     ''.format(rezago[i + 1], roc_auc[i]))
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('TFP', fontsize=12, **csfont)
        plt.ylabel('TVP', fontsize=12, **csfont)
        plt.legend(loc="lower right", prop=font)
        plt.show()
    return scores_
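# Hedged usage sketch for main_clf() above. It assumes the CSV folders referenced
# inside the function (summary/Count/..., rezago_social/...) are present locally,
# and that grid keys carry the pipeline's 'clf__' prefix.
from sklearn.linear_model import LogisticRegression

lr_grid = {'clf__C': [0.1, 1, 10], 'clf__penalty': ['l2']}
cv_scores = main_clf(metric_='f1_macro',
                     clf_=LogisticRegression(max_iter=1000),
                     grid_=lr_grid,
                     range_=(2, 4),   # only try k = 2 and 3
                     cv_=5,
                     graphs=False)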
Example #41
0
sizes_after_trim = list()

targets = list()
cutoffs = list()
aves = list()
sizes = list()

for _ in tqdm(range(400)):
    #choose a random target:
    idx = np.random.choice(y_.shape[1])

    #choose a random cluster size upper limit and cluster:
    clusterSize = np.random.randint(200, 10000)
    clusterer.balanced_cut(clusterSize)

    clabels = np.unique(clusterer.labels_)
    pos_labels = np.unique(clusterer.labels_[y_[:, idx] == 1])
    neg_labels = clabels[~np.isin(clabels, pos_labels)]
    if min(len(pos_labels), len(neg_labels)) < 2:
        print('Not enough positive clusters to split')
        continue

    test_clusters, train_clusters = utils.split_clusters(pos_labels,
                                                         neg_labels,
                                                         0.2, [0.1, 0.1],
                                                         shuffle=True)

    actives_test_idx, actives_train_idx, inactives_test_idx, inactives_train_idx = utils.get_four_matrices(
        y_, idx, clusterer, test_clusters, train_clusters)
    print(actives_test_idx.shape[0], actives_train_idx.shape[0],
          inactives_test_idx.shape[0], inactives_train_idx.shape[0])
Example #42
0
def perm_test_cyto_main(cross_matrix, event, data_all, rand_p, FLAGS):
    """
    This function returns p-values of common cytobands between cross-cancer patients and the patients similar to them, obtained from a permutation test.
    
    .... 
    cross_matrix: DataFrame of cross-cancer patients.
    event: amplification or deletion event.
    rand_p: List of indices of random patients drawn from the cancer type of the cross-cancer patient.
    ....
    
    Output
    all_p: List of dataframes that include p-values of common cytobands in cross-cancer patients.
    """

    all_p = []

    for i, l in zip(cross_matrix.index, range(len(cross_matrix.index.values))):

        can1 = cross_matrix.loc[i, 'Cross-cancer Type']
        can2 = cross_matrix.loc[i, 'Cancer Type of Patients Similar to']
        pairs = cross_matrix.loc[i, "Patients Similar to"]
        pairs = ast.literal_eval(pairs)

        #preprocess of cnv data
        can1_cnv = prep_cytobands(can1, event, data_all, FLAGS)
        can2_cnv = prep_cytobands(can2, event, data_all, FLAGS)

        p_values = []
        obs_stat = []
        common_cnv = []

        if i in can1_cnv.index:

            main_cnv = np.unique(can1_cnv.loc[i, 'Cytoband'])

            if len(pairs) == 1:

                if pairs[0] in can2_cnv.index:

                    pairs_df = can2_cnv[can2_cnv.index.isin([pairs[0]])]
                    common_cnv = list(
                        set(main_cnv).intersection(
                            set(np.unique(pairs_df['Cytoband']))))

                    for j in common_cnv:

                        sub = pairs_df[pairs_df['Cytoband'] == j]
                        o = 1
                        p_value = perm_test_cyto(o, j, main_cnv, can1_cnv,
                                                 rand_p[l], FLAGS)
                        p_values.append(p_value)
                        obs_stat.append(o)

                        print(
                            'P-value of cytoband ' + str(j) + ' in patient ' +
                            str(i) + ':', p_value)

            else:
                rem = list(set(pairs) - set(can2_cnv.index.unique()))

                for r in rem:
                    while r in pairs:
                        pairs.remove(r)

                n = len(pairs)

                pairs_df = can2_cnv[can2_cnv.index.isin(pairs)][[
                    'Cytoband', 'patient'
                ]]
                pairs_df = pairs_df.drop_duplicates()
                gens = list(
                    set(main_cnv).intersection(
                        set(np.unique(pairs_df['Cytoband']))))

                for j in gens:

                    sub = pairs_df[pairs_df['Cytoband'] == j]
                    pts = np.unique(sub['patient'])

                    if len(pts) >= 0.7 * n:

                        common_cnv.append(j)
                        o = len(pts)
                        p_value = perm_test_cyto(o, j, main_cnv, can1_cnv,
                                                 rand_p[l], FLAGS)
                        p_values.append(p_value)
                        obs_stat.append(o)

                        print(
                            'P-value of cytoband ' + str(j) + ' in patient ' +
                            str(i) + ':', p_value)

        p = pd.DataFrame({
            'Cytobands': common_cnv,
            'NumOfPatientswithCytoband': obs_stat,
            'p-value': p_values
        })
        if not p.empty:
            p['Cross-cancer Patient'] = i
            all_p.append(p)

    return all_p
Example #43
0
h, w = config.IMAGE_SHAPE[:2]
proposals = np.around(mrcnn["proposals"][0] * np.array([h, w, h, w])).astype(np.int32)

# Class ID, score, and mask per proposal
roi_class_ids = np.argmax(mrcnn["probs"][0], axis=1)
roi_scores = mrcnn["probs"][0, np.arange(roi_class_ids.shape[0]), roi_class_ids]
roi_class_names = np.array(dataset_val.class_names)[roi_class_ids]
roi_positive_ixs = np.where(roi_class_ids > 0)[0]

# How many ROIs vs empty rows?
print("{} Valid proposals out of {}".format(np.sum(np.any(proposals, axis=1)), proposals.shape[0]))
print("{} Positive ROIs".format(len(roi_positive_ixs)))

# Class counts
print(list(zip(*np.unique(roi_class_names, return_counts=True))))

limit = 200
ixs = np.random.randint(0, proposals.shape[0], limit)
captions = ["{} {:.3f}".format(dataset_val.class_names[c], s) if c > 0 else ""
            for c, s in zip(roi_class_ids[ixs], roi_scores[ixs])]
visualize.draw_boxes(image, boxes=proposals[ixs],
                     visibilities=np.where(roi_class_ids[ixs] > 0, 2, 1),
                     captions=captions, title="ROIs Before Refinement",
                     ax=get_ax())
roi_bbox_specific = mrcnn["deltas"][0, np.arange(proposals.shape[0]), roi_class_ids]
log("roi_bbox_specific", roi_bbox_specific)

# Apply bounding box transformations
# Shape: [N, (y1, x1, y2, x2)]
refined_proposals = utils.apply_box_deltas(
Example #44
0
def linmixmod(xs, treatment, timeunit, model='lmm', RCMD=cran.rcmd):
    '''
    Linear Mixed-Effects Model computation for one fixed effect and one 
    random effect.
    This function uses the R packages "lme4" and "stats".

    The response variable is modeled using two linear mixed effect models 
    (Model and Nullmodel) of the form:
    - xs~treatment+(1+treatment|timeunit)
      (Random intercept + random slope model)
    - xs~(1+treatment|timeunit)
      (Nullmodel without the fixed effect "treatment")

    Both models are compared in R using "anova" (from the R-package "stats")
    which performs a likelihood ratio test to obtain the p-Value for the
    significance of the fixed effect (treatment).

    Optionally differential deformations are computed which are then used in the
    Linear Mixed Model

    Parameters
    ----------
    xs: list of multiple 1D ndarrays
        Each index of `xs` contains an array of response variables.
        (eg. list containing "area_um" data of several measurements)
    treatment: list
        Each item is a description/identifier for a treatment. The
        enumeration matches the index of `xs`.
        treatment[i] can be 'Control', 'Treatment', 'Reservoir Control' or 
        'Reservoir Treatment'. If 'Reservoir ...' is chosen, the algorithm
        will perform a bootstrapping algorithm that removes the median from each
        Channel measurement. That means for each 'Control' or 'Treatment' there has
        to exist a 'Reservoir ...' measurement. The resulting differential deformations
        are then used in the Linear Mixed Model
    timeunit: list
        Each item is a description/identifier for a time. The
        enumeration matches the index of `xs`.
        (e.g. list containing integers "1" and "2" according to the day
        at which the content in `xs` was measured) 
    model: string
        'lmm': A linear mixed model will be applied
        'glmm': A generalized linear mixed model will be applied
    Returns
    -------
    (Generalized) Linear Mixed Effects Model Result: dictionary
    The dictionary contains:
    -Estimate:  the average value of cells that had Treatment 1
    -Fixed Effect: Change of the estimate value due to the Treatment 2
    -Std Error for the Estimate
    -Std Error for the Fixed Effect
    -p-Value

    References
    ----------
    .. [1] R package "lme4":
           Bates D, Maechler M, Bolker B and Walker S (2015). lme4: Linear mixed-
           effects models using Eigen and S4. R package version 1.1-9, 
           https://CRAN.R-project.org/package=lme4.    

    .. [2] R function "anova" from package "stats":
           Chambers, J. M. and Hastie, T. J. (1992) Statistical Models in S, 
           Wadsworth & Brooks/Cole

    Examples
    -------
    import numpy as np
    import pyper
    from nptdms import TdmsFile
    import os

    xs = [
    [100,99,80,120,140,150,100,100,110,111,140,145], #Larger values (Channel1)
    [20,10,5,16,14,22,27,26,5,10,11,8,15,17,20,9], #Smaller values (Reservoir1)
    [115,110,90,110,145,155,110,120,115,120,120,150,100,90,100], #Larger values (Channel2)
    [30,30,15,26,24,32,37,36,15,20,21,18,25,27,30,19], #Smaller values (Reservoir2)
    [150,150,130,170,190,250,150,150,160,161,180,195,130,120,125,130,125],
    [2,1,5,6,4,2,7,6,5,10,1,8,5,7,2,9,11,8,13],
    [155,155,135,175,195,255,155,155,165,165,185, 200,135,125,130,135,140,150,135,140],
    [25,15,19,26,44,42,35,20,15,10,11,28,35,10,25,13]] 
    treatment1 = ['Control', 'Reservoir Control', 'Control', 'Reservoir Control',\
    'Treatment', 'Reservoir Treatment','Treatment', 'Reservoir Treatment']
    timeunit1 = [1, 1, 2, 2, 1, 1, 2, 2]

    #Example 1: linear mixed models on differential deformations
    Result_1 = linmixmod(xs=xs,treatment=treatment1,timeunit=timeunit1,model='lmm')

    #Result_1:Estimate=93.69375 (i.e. the average Control value is 93.69)
    #         FixedEffect=43.93 (i.e. The treatment leads to an increase)         
    #         p-Value(Likelihood Ratio Test)=0.0006026 (i.e. the increase is significant)

    #Example 2: Ordinary Linear mixed models
    #'Reservoir' measurements are now Controls
    #'Channel' measurements are Treatments
    #This does not use differential deformation in linmixmod()
    treatment2 = ['Treatment', 'Control', 'Treatment', 'Control',\
    'Treatment', 'Control','Treatment', 'Control']
    timeunit2 = [1, 1, 2, 2, 3, 3, 4, 4]
    Result_2 = linmixmod(xs=xs,treatment=treatment2,timeunit=timeunit2,model='lmm')

    #Result_2:Estimate=17.17 (i.e. the average Control value is 17.17 )
    #         FixedEffect=120.257 (i.e. The treatment leads to an increase)         
    #         p-Value(Likelihood Ratio Test)=0.00033 (i.e. the deformation
    #         increases significantly)

    #Example 3: Generalized Linear mixed models
    treatment3 = ['Treatment', 'Control', 'Treatment', 'Control',\
    'Treatment', 'Control','Treatment', 'Control']
    timeunit3 = [1, 1, 2, 2, 3, 3, 4, 4]    
    Result_3 = linmixmod(xs=xs,treatment=treatment3,timeunit=timeunit3,model='glmm')

    #Result_3:Estimate=2.71 (i.e. the average Control value is exp(2.71)=15.08)
    #         FixedEffect=2.19 (i.e. The treatment leads to an increase)         
    #         p-Value(Likelihood Ratio Test)=0.00366 (i.e. the deformation
    #         increases significantly)     
    '''

    modelfunc = "xs~treatment+(1+treatment|timeunit)"
    nullmodelfunc = "xs~(1+treatment|timeunit)"

    # Check if all input lists have the same length
    if len(xs) != len(treatment) or len(xs) != len(timeunit):
        msg = "`treatment` and `timeunit` not defined for all variables!"
        raise ValueError(msg)
        
    if len(xs) < 3:
        msg = "Linear Mixed Models require repeated measurements. " +\
              "Please select more treatment repetitions."
        raise ValueError(msg)

    ######################Differential Deformation#############################
    # If the user selected 'Control-Reservoir' and/or 'Treatment-Reservoir'
    Median_DiffDef = []
    TimeUnit, Treatment = [], []
    if 'Reservoir Control' in treatment or 'Reservoir Treatment' in treatment:
        if model == 'glmm':
            Head_string = "GENERALIZED LINEAR MIXED MODEL ON BOOTSTAP-DISTRIBUTIONS: \n" +\
                "---Results are in log space (loglink was used)--- \n"
        if model == 'lmm':
            Head_string = "LINEAR MIXED MODEL ON BOOTSTAP-DISTRIBUTIONS: \n"
        # Find the timeunits for Control
        where_contr_ch = np.where(np.array(treatment) == 'Control')
        timeunit_contr_ch = np.array(timeunit)[where_contr_ch]
        # Find the timeunits for Treatment
        where_treat_ch = np.where(np.array(treatment) == 'Treatment')
        timeunit_treat_ch = np.array(timeunit)[where_treat_ch]

        for n in np.unique(timeunit_contr_ch):
            where_time = np.where(np.array(timeunit) == n)
            xs_n = np.array(xs)[where_time]
            treatment_n = np.array(treatment)[where_time]
            where_contr_ch = np.where(np.array(treatment_n) == 'Control')
            xs_n_contr_ch = xs_n[where_contr_ch]
            where_contr_res = np.where(
                np.array(treatment_n) == 'Reservoir Control')
            xs_n_contr_res = xs_n[where_contr_res]

            # check that corresponding Controls are selected
            if (len(where_contr_ch[0]) != 1 or
                len(where_contr_res[0]) != 1):
                msg = "Controls for channel and reservoir must be given" \
                      +" exactly once (repetition {})!".format(n)
                raise ValueError(msg)

            # Apply the Bootstrapping algorithm to Controls
            y = np.array(xs_n_contr_ch)[0]
            yR = np.array(xs_n_contr_res)[0]
            [Median, MedianR] = diffdef(y, yR)
            Median_DiffDef.append(Median - MedianR)
            # TimeUnit is a number for the day or the number of the repeat
            TimeUnit.extend(np.array(n).repeat(len(Median)))
            Treatment.extend(np.array(['Control']).repeat(len(Median)))

        for n in np.unique(timeunit_treat_ch):
            where_time = np.where(np.array(timeunit) == n)
            xs_n = np.array(xs)[where_time]
            treatment_n = np.array(treatment)[where_time]
            where_treat_ch = np.where(np.array(treatment_n) == 'Treatment')
            xs_n_treat_ch = xs_n[where_treat_ch]
            where_treat_res = np.where(
                np.array(treatment_n) == 'Reservoir Treatment')
            xs_n_treat_res = xs_n[where_treat_res]

            # check that corresponding Treatments are selected
            if (len(where_treat_ch[0]) != 1 or
                len(where_treat_res[0]) != 1):
                msg = "Treatments for channel and reservoir must be given" \
                      +" exactly once (repetition {})!".format(n)
                raise ValueError(msg)

            # Apply the Bootstrapping algorithm to Treatments
            y = np.array(xs_n_treat_ch)[0]
            yR = np.array(xs_n_treat_res)[0]
            [Median, MedianR] = diffdef(y, yR)
            Median_DiffDef.append(Median - MedianR)
            # TimeUnit is a number for the day or the number of the repeat
            TimeUnit.extend(np.array(n).repeat(len(Median)))
            Treatment.extend(np.array(['Treatment']).repeat(len(Median)))

        # Concat all elements in the lists
        xs = np.concatenate(Median_DiffDef)
        xs = np.array(xs).ravel()
        treatment = np.array(Treatment)
        timeunit = np.array(TimeUnit)

    else:  # No 'Reservoir ...' measurements selected, so don't apply bootstrapping
        if model == 'glmm':
            Head_string = "GENERALIZED LINEAR MIXED MODEL: \n" +\
                "---Results are in log space (loglink was used)--- \n"
        if model == 'lmm':
            Head_string = "LINEAR MIXED MODEL: \n"

        for i in range(len(xs)):
            # Expand every unit in treatment and timeunit to the same length as the
            # xs[i] they are supposed to describe
            # Using the "repeat" function also characters can be handled
            treatment[i] = np.array([treatment[i]]).repeat(len(xs[i]), axis=0)
            timeunit[i] = np.array([timeunit[i]]).repeat(len(xs[i]), axis=0)

        # Concat all elements in the lists
        xs = np.concatenate(xs)
        treatment = np.concatenate(treatment)
        timeunit = np.concatenate(timeunit)

    # Open a pyper instance
    r1 = pyper.R(RCMD=RCMD, use_pandas=True)
    # try to fix unicode decode errors by forcing english
    r1('Sys.setenv(LANG = "en")')
    r1.assign("xs", xs)
    # Transfer the vectors to R
    r1.assign("treatment", treatment)
    r1.assign("timeunit", timeunit)
    # Create a dataframe which contains all the data
    r1("RTDC=data.frame(xs,treatment,timeunit)")
    # Load the necessary library for Linear Mixed Models
    lme4resp = r1("library(lme4)").decode("utf-8")
    if lme4resp.count("Error"):
        # Tell the user that something went wrong
        raise OSError("R installation at {}: {}\n".format(RCMD, lme4resp) +
                      """Please install 'lme4' via:
              {} -e "install.packages('lme4', repos='http://cran.r-project.org')
              """.format(RCMD)
                      )

    # Random intercept and random slope model
    if model == 'glmm':
        r1("Model = glmer(" + modelfunc + ",RTDC,family=Gamma(link='log'))")
        r1("NullModel = glmer(" + nullmodelfunc + ",RTDC,family=Gamma(link='log'))")
    if model == 'lmm':
        r1("Model = lmer(" + modelfunc + ",RTDC)")
        r1("NullModel = lmer(" + nullmodelfunc + ",RTDC)")

    r1("Anova = anova(Model,NullModel)")
    Model_string = r1("summary(Model)").decode("utf-8").split("\n", 1)[1]
    Anova_string = r1("Anova").decode("utf-8").split("\n", 1)[1]
    Coef_string = r1("coef(Model)").decode("utf-8").split("\n", 2)[2]
    # Cleanup output
    Coef_string = Coef_string.replace('attr(,"class")\n', '')
    Coef_string = Coef_string.replace('[1] "coef.mer"\n', '')
    #"anova" from R does a likelihood ratio test which gives a p-Value
    p = np.array(r1.get("Anova$Pr[2]"))

    # Obtain p-Value using a normal approximation
    # Extract coefficients
    r1("coefs <- data.frame(coef(summary(Model)))")
    r1("coefs$p.normal=2*(1-pnorm(abs(coefs$t.value)))")

    # Convert to array, depending on platform or R version, this is a DataFrame
    # or a numpy array, so we convert it to an array. Because on Windows the
    # result is an array with subarrays of type np.void, we must access the
    # elements with Coeffs[0][0] instead of Coeffs[0,0].
    Coeffs = np.array(r1.get("coefs"))
    # The Average value of treatment 1
    Estimate = Coeffs[0][0]
    # The Std Error of the average value of treatment 1
    StdErrorEstimate = Coeffs[0][1]
    # treatment 2 leads to a change of the Estimate by the value "FixedEffect"
    FixedEffect = Coeffs[1][0]
    StdErrorFixEffect = Coeffs[1][1]

    # Transform the estimate and effect back for y (glmer applied a log link).
    estim_y = np.exp(Estimate)
    #estim_y_error = abs(np.exp(Estimate+StdErrorEstimate)-np.exp(Estimate-StdErrorEstimate))
    fixef_y = np.exp(Estimate + FixedEffect) - np.exp(Estimate)
    #fixef_y_error = abs(np.exp(Estimate+StdErrorFixEffect)-np.exp(Estimate-StdErrorFixEffect))

    full_summary = Head_string + Model_string +\
        "\nCOEFFICIENT TABLE:\n" + Coef_string +\
        "\nLIKELIHOOD RATIO TEST (MODEL VS.  NULLMODEL): \n" +\
        Anova_string

    if model == "glmm":
        full_summary += "\nESTIMATE AND EFFECT TRANSFORMED BACK FROM LOGSPACE" +\
                        "\nEstimate = \t" + str(estim_y) +\
                        "\nFixed effect = \t" + str(fixef_y)

    results = {"Full Summary": full_summary,
               "p-Value (Likelihood Ratio Test)": p,
               "Estimate": Estimate,
               "Std. Error (Estimate)": StdErrorEstimate,
               "Fixed Effect": FixedEffect,
               "Std. Error (Fixed Effect)": StdErrorFixEffect}
    return results
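# Hedged alternative sketch (NOT what linmixmod() does internally -- it calls
# lme4 through pyper): the same Model-vs-Nullmodel likelihood-ratio test for the
# 'lmm' case, approximated with statsmodels (assumed installed). It expects
# already flattened, equal-length vectors rather than lists of arrays.
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats


def lmm_lrt(xs, treatment, timeunit):
    data = pd.DataFrame({"xs": xs, "treatment": treatment, "timeunit": timeunit})
    # Random intercept + random slope for treatment, grouped by timeunit.
    full = smf.mixedlm("xs ~ treatment", data, groups=data["timeunit"],
                       re_formula="~treatment").fit(reml=False)
    null = smf.mixedlm("xs ~ 1", data, groups=data["timeunit"],
                       re_formula="~treatment").fit(reml=False)
    lr = 2 * (full.llf - null.llf)      # likelihood-ratio statistic
    p_value = stats.chi2.sf(lr, df=1)   # the fixed effect adds one parameter
    return full.params, p_value         # params holds Estimate and Fixed Effect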
Example #45
0
                     low_memory=False)
    # Below is a list of columns to pass if you want the NA values changed to blanks.
    na_cols_to_check = [
        'posteam', 'defteam', 'drive', 'air_yards', 'yards_after_catch',
        'yards_gained', 'cp', 'cpoe', 'ep', 'epa', 'air_epa', 'yac_epa',
        'comp_air_epa', 'comp_yac_epa', 'air_wpa', 'yac_wpa', 'comp_air_wpa',
        'comp_yac_wpa', 'qb_epa', 'xyac_epa', 'xyac_mean_yardage',
        'xyac_median_yardage', 'wp', 'wpa', 'def_wp', 'vegas_wp', 'home_wp',
        'away_wp', 'score_differential', 'pass_attempt', 'pass_touchdown',
        'complete_pass', 'passer_player_name', 'passer', 'pass'
    ]

    DB[na_cols_to_check] = DB[na_cols_to_check].replace('NA',
                                                        np.nan).fillna('')
    # All teams will be included in the sample unless you pass a list of specific teams below.
    Teams = list(np.unique(DB.posteam))
    Teams.remove('')

    data = []
    for Team in Teams:
        teamDF = DB[((DB.posteam == Team) | (DB.defteam == Team))]

        print(Team, year)

        passer_list = list(np.unique(teamDF[(teamDF.posteam == Team)].passer))
        if '' in passer_list:
            passer_list.remove('')
        for qb in passer_list:
            print(qb)
            qbDF = teamDF[(teamDF.passer == qb) & (teamDF['pass'] == 1) &
                          (teamDF.xyac_mean_yardage != '')].infer_objects()
Example #46
0
 def concat_ncapture_yields(self, r_elements, s_elements):
     """Create an array of r- and s-process isotopic yields."""
     nclib = pickle_read(join(self.path_yldgen, 'sneden08.pck'))
     # unique elements arranged by atomic number
     elements = np.unique(r_elements + s_elements)
     at_num = []
     for item in elements:
         at_num.append(nclib[item]['Z'])
     at_num = np.array(at_num)
     elements = elements[np.argsort(at_num)]
     self.nc_sym = []
     self.nc_sym_mass = []
     for item in elements:
         n_tmp = len(nclib[item]['Isotope'])
         for i in range(n_tmp):
             self.nc_sym.append(item + str(nclib[item]['Isotope'][i]))
             self.nc_sym_mass.append(nclib[item]['Isotope'][i])
     self.nc_sym = np.array(self.nc_sym)
     self.nc_sym_mass = np.array(self.nc_sym_mass)
     self.n_nc_sym = len(self.nc_sym)
     u, indices = np.unique(
         [item.rstrip('0123456789') for item in self.nc_sym],
         return_index=True)
     indices_s = np.argsort(indices)
     self.nc_element = u[indices_s]
     # project elemental yields onto relative isotopic abundances
     self.nc_yields = np.zeros((self.n_z, self.n_bins, self.n_nc_sym))
     cnt = 0
     for i in range(len(elements)):
         el = elements[i]
         el_iso = len(nclib[el]['Isotope'])
         if el in r_elements:
             j = np.where(np.array(r_elements) == el)[0]
             self.nc_yields[:, -self.n_bins_high:, cnt:cnt+el_iso] = \
                            (np.ones((self.n_z, self.n_bins_high, el_iso)) *
                             self.rprocess_yields[:, -self.n_bins_high:, j] *
                             nclib[el]['isotopic_fraction[r]'])
         if el in s_elements:
             j = np.where(np.array(s_elements) == el)[0]
             self.nc_yields[:, :self.n_bins_low, cnt:cnt+el_iso] = \
                             (np.ones((self.n_z, self.n_bins_low, el_iso)) *
                              self.sprocess_yields[:, :, j] *
                              nclib[el]['isotopic_fraction[s]'])
         cnt += el_iso
     # update arrays
     self.sym = np.append(self.snii_sym, self.nc_sym)
     self.sym_mass = np.append(self.snii_sym_mass, self.nc_sym_mass)
     self.n_sym = len(self.sym)
     self.element = np.append(self.element, self.nc_element)
     self.n_elements = len(self.element)
     self.bbmf = np.append(self.bbmf, np.zeros(self.n_nc_sym))
     if len(self.snia_yields.shape) == 1:
         # metallicity-independent SNIa yields
         self.snia_yields = np.append(self.snia_yields,
                                      np.zeros(self.n_nc_sym))
     elif len(self.snia_yields.shape) == 2:
         # metallicity-dependent SNIa yields
         self.snia_yields = np.append(self.snia_yields,
                                      np.zeros((self.nc_yields.shape[0],
                                                self.nc_yields.shape[2])),
                                      axis=1)
     self.snii_yields = np.append(self.snii_yields,
                                  self.nc_yields[:, self.ind8:],
                                  axis=2)
     self.agb_yields = np.append(self.agb_yields,
                                 self.nc_yields[:, :self.ind8],
                                 axis=2)
     self.snii_agb_rem = np.concatenate((self.agb_rem, self.snii_rem),
                                        axis=1)
Example #47
0
File: flow.py Project: omker04/MNL
        'price': 'mean',
        'market_size': 'mean',
        'outside': 'median'
    }
    df = df.groupby(sum([['upc_nbr'], attributes],
                        [])).agg(aggregate).reset_index()
    df['dollar'] = df['dollar'] / on_hand_wks
    df['quantity'] = df['quantity'] / on_hand_wks
    df['lprice'] = np.log(df['price'])
    df['lnsr'] = np.log(df['quantity'] / df['outside'])
    df.drop_duplicates(inplace=True)
    return df


allFunc3 = lapply(allFunc, lambda y: lapply(y, lambda x: get_allFunc3(x, 52)))
all_segments = [k for k in np.unique(attribute_data[segment_by[0]])]


def check_segments(df):
    absent_segments = [x for i, x in enumerate(all_segments) if x not in df]
    if len(absent_segments) > 0:
        for k in absent_segments:
            df[k] = pd.DataFrame(
                columns=sum([['upc_nbr'], attributes,
                             [
                                 'price', 'dollar', 'store_nbr', 'outside',
                                 'quantity', 'market_size', 'lprice', 'lnsr'
                             ]], []))
    else:
        pass
    return df
Example #48
0
File: flow.py Project: omker04/MNL
def MNL_withoutSimilarity_Prediction_Multiple_Drop(df_all,
                                                   model,
                                                   deleted,
                                                   segmented=True):
    attributeVariable = attributes
    if segmented:
        temp = pd.DataFrame(attributes).merge(pd.DataFrame(segment_by),
                                              how='outer',
                                              indicator=True)
        attributeVariable = list(temp.query('_merge == "left_only"')[0])
    if 'rollback' in df_all.columns.values:
        df_all['rollback'].fillna(0)
    else:
        df_all['rollback'] = 0
    if 'lprice' in df_all.columns.values:
        df_all = df_all.drop('lprice', axis=1)
    df_all['lprice'] = np.log(df_all['price'])
    if df_all['market_size'].isnull().sum() > 0:
        marketSize = np.unique(df_all.market_size)[[
            math.isnan(x) for x in np.unique(df_all.market_size)
        ].index(False)]
        df_all.drop('market_size', axis=1, inplace=True)
        df_all['market_size'] = marketSize
    if df_all['store_nbr'].isnull().sum() > 0:
        storeNbr = np.unique(df_all.store_nbr)[[
            math.isnan(x) for x in np.unique(df_all.store_nbr)
        ].index(False)]
        df_all.drop('store_nbr', axis=1, inplace=True)
        df_all['store_nbr'] = storeNbr
    whichStore = np.unique(df_all.store_nbr)[0]
    model = model[whichStore]
    selected_mnl = model
    if segmented:
        selected_mnl = model[np.unique(df_all[segment_by])[0]]
    dummyList = {'other_columns': df_all.loc[:, ['rollback', 'lprice']]}
    for k in attributeVariable:
        dummyList[k] = pd.get_dummies(df_all[k], prefix=k, prefix_sep='__')
        dummyList[k] = dummyList[k].T.reindex(
            k + '__' + attribute_data_factor[k]).T.fillna(0)
    getDummy = pd.concat(dummyList, axis=1)
    getDummy['market_size'] = df_all['market_size']
    pred_no_drop_mnl = predict_mnl(getDummy, selected_mnl)
    df_all['predicted_outside_good'] = pred_no_drop_mnl[
        'predicted_outside_good']
    df_all['predicted_quantity'] = pred_no_drop_mnl['predicted_quantity']
    no_drop_df = deepcopy(df_all)
    if len(deleted) == 0 or len(deleted) > len(df_all):
        output_df = pd.DataFrame(columns=sum(
            [['store_nbr', 'upc_nbr'], attributes,
             [
                 'rollback', 'dollar', 'quantity', 'price', 'lprice',
                 'market_size', 'add', 'adjusted_predicted_quantity_post_drop',
                 'predicted_demand_transfer', 'walkoff'
             ]], []))
        output = {'output': output_df}
    elif len(deleted) == len(df_all):
        output_df = df_all.loc[:,
                               sum([['store_nbr', 'upc_nbr'], attributes,
                                    [
                                        'rollback', 'dollar', 'quantity',
                                        'price', 'lprice', 'market_size', 'add'
                                    ]], [])]
        output_df['adjusted_predicted_quantity_post_drop'] = float('nan')
        output_df['predicted_demand_transfer'] = float('nan')
        output_df['walkoff'] = 100
        output = {'output': output_df}
    else:
        df_all = df_all.drop(df_all.index[deleted])
        dummyList = {'other_columns': df_all.loc[:, ['rollback', 'lprice']]}
        for k in attributeVariable:
            dummyList[k] = pd.get_dummies(df_all[k], prefix=k, prefix_sep='__')
            dummyList[k] = dummyList[k].T.reindex(
                k + '__' + attribute_data_factor[k]).T.fillna(0)
        getDummy = pd.concat(dummyList, axis=1)
        getDummy['market_size'] = df_all['market_size']
        pred_drop_mnl = predict_mnl(getDummy, selected_mnl)
        df_all['predicted_quantity_post_drop'] = pred_drop_mnl[
            'predicted_quantity']
        no_drop_df = no_drop_df.merge(df_all, how='left')
        whichAdded = [
            i for i, x in enumerate(no_drop_df.quantity) if math.isnan(x)
        ]
        no_drop_df.quantity.iloc[
            whichAdded] = no_drop_df.predicted_quantity.iloc[whichAdded]
        # if sum(no_drop_df['predicted_outside_good']) == 0 :
        #     for i in range(len(no_drop_df)) :
        #         k = no_drop_df.predicted_quantity_post_drop.iloc[i]
        #         if math.isnan(k) :
        #             k = no_drop_df.quantity.iloc[i] + 1
        #         elif k <= no_drop_df.quantity.iloc[i] :
        #             k = k + 1
        #         else :
        #             k = k
        #         rand = np.random.uniform(no_drop_df.quantity.iloc[i], k)
        #         print(rand)
        #         no_drop_df.predicted_quantity.iloc[i] = rand
        no_drop_df[
            'adjusted_predicted_quantity_post_drop'] = no_drop_df.quantity * no_drop_df.predicted_quantity_post_drop / no_drop_df.predicted_quantity
        deleted_total = sum(no_drop_df.quantity.iloc[deleted])
        no_drop_df['predicted_demand_transfer'] = 100 * (
            no_drop_df['adjusted_predicted_quantity_post_drop'].fillna(0) -
            no_drop_df['quantity'].fillna(0)) / deleted_total
        no_drop_df['predicted_demand_transfer'].iloc[deleted] = float('nan')
        walkoff = 100 - sum(no_drop_df['predicted_demand_transfer'].fillna(0))
        no_drop_df['walkoff'] = walkoff
        no_drop_df.quantity.iloc[whichAdded] = float('nan')
        return_df_cols = sum(
            [['store_nbr', 'upc_nbr'], attributes,
             [
                 'rollback', 'dollar', 'quantity', 'price', 'lprice',
                 'market_size', 'add', 'adjusted_predicted_quantity_post_drop',
                 'predicted_demand_transfer', 'walkoff'
             ]], [])
        output = {
            'output': no_drop_df.loc[:, return_df_cols],
            'no_drop_df': no_drop_df,
            'df_all': df_all,
            'walkoff': walkoff
        }
    return output
Example #49
0
def compute_prior(annotations, centroids, image_size, superpixel_grid, n=100):
    '''
    Compute relative location prior according to Gould et al. (2008)

    Parameters
    ----------
    annotations : array-like
        Class label of each superpixel (length nx * ny).
    centroids : iterable of (m, n) tuples
        Centroid coordinates of the superpixels in image coordinates.
    image_size : tuple of int
        Image size (nm, nn) in pixels.
    superpixel_grid : tuple of int
        Shape (nx, ny) of the superpixel grid.

    Returns
    -------
    maps : dict of dict
        Nested dictionary with the relative location prior maps:
        maps[<other class>][<given class>] gives a 2n*2n array representing
        the dimensionless image map.

    Other parameters
    ----------------
    n : integer
        Half the size of the dimensionless image map

    '''

    nm, nn = image_size
    nx, ny = superpixel_grid

    annotations = np.reshape(annotations, (nx, ny))
    classes = np.unique(annotations)

    # allocate normalized relation maps
    mapi = {
        c1: {c2: np.zeros((2 * n, 2 * n))
             for c2 in classes}
        for c1 in classes
    }

    # get centroids
    centroids = list(zip(*centroids))  # make subscriptable under Python 3
    m_mat = np.array(centroids[0]).reshape((nx, ny))
    n_mat = np.array(centroids[1]).reshape((nx, ny))

    # normalize centroid coordinates to map grid
    m_mat = np.round(m_mat.astype(np.float32) / (nm - 1) * (n - 1))
    n_mat = np.round(n_mat.astype(np.float32) / (nn - 1) * (n - 1))

    # loop over all superpixels
    for i in range(nx):
        for j in range(ny):
            # list indices in dimensionless image map that match the superpixel centroids
            ind_m = (m_mat - m_mat[i, j] + n - 1).ravel().astype(np.uint)
            ind_n = (n_mat - n_mat[i, j] + n - 1).ravel().astype(np.uint)

            # determine class of current superpixel
            c0 = annotations[i, j]

            # loop over classes in current image
            for c in classes:

                # add score matrix with offset to relation map
                mapi[c][c0][ind_m, ind_n] += (annotations == c).astype(
                    np.uint).ravel()

    return mapi
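# Hedged usage sketch for compute_prior() with a toy 2x2 superpixel grid; all
# values below are made up for illustration.
import numpy as np

toy_annotations = [0, 0, 1, 1]                            # class per superpixel
toy_centroids = [(10, 10), (10, 30), (30, 10), (30, 30)]  # (row, col) centroids
maps = compute_prior(toy_annotations, toy_centroids,
                     image_size=(40, 40), superpixel_grid=(2, 2), n=8)
# maps[c][c0] counts where class c tends to occur relative to a class-c0 superpixel
print(maps[1][0].shape)   # (16, 16) dimensionless map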
Example #50
0
    
    
    #%%
    oas = []
    f1s = []
    errs = []

    #%%
    mode = 1  # change here as needed
    opendic = 0.5
    seedi = 0
    for seedi in range(10):
#    for seedi in [0,2,3,5,7,8,9]:
        gt1file = glob.glob('data/'+key+data+'*')[0]
        gt1 = np.load(gt1file)
        inclass = np.unique(gt1)
        unknown = gt1.max()+1
        gt2 = np.load(gt2file)
        gt1[np.logical_and(gt1==0,gt2!=0)] = unknown
        
        # OA
        gt = copy.deepcopy(gt1)
        pre = read(fp,mode,key,seedi,opendic=opendic,cls1=unknown,num=int(num))
        cfm = rscls.gtcfm(pre,gt,unknown)
        oas.append(cfm[-1,0])
        
        # F1
        gt = gt.reshape(-1)
        pre = pre.reshape(-1)
        pre = pre[gt!=0]
        gt = gt[gt!=0]
Example #51
0
import numpy as np

a = [-1, -1, 1]
(values, counts) = np.unique(a,return_counts=True)
ind=np.argmax(counts)
print(values[ind])
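# Equivalent result using only the standard library, assuming the same list `a`:
from collections import Counter

print(Counter(a).most_common(1)[0][0])   # -1, the most frequent value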
Example #52
0
comm_url = "http://web.stanford.edu/group/poldracklab/myconnectome-data/base" \
           "/parcellation/parcel_data.txt"
comm = np.loadtxt(urlopen(comm_url).readlines(), delimiter='\t', dtype=str)

# The ninth column of this array specifies the Yeo 7-network affiliation of the
# nodes with e.g., "7Network_1," "7Network_2." We'd prefer to just have a
# numerical vector specifying network assignments, so we convert it here. We'll
# also list the actual Yeo networks, as well, so that we can plot them.

# the last two "communities" are the Freesurfer medial wall and subcortex
comm_labels = [
    'visual', 'somatomotor', 'dorsal attention', 'ventral attention', 'limbic',
    'frontoparietal', 'default', '', ''
]
comm_ids = np.unique(comm[:, 8], return_inverse=True)[-1]

# Now we can actually plot things! First, make a little grid of plots to
# approximately match the layout from Figure 1.

from matplotlib.gridspec import GridSpec

fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2, figure=fig)
ax1 = plt.subplot(gs[0, 0])
ax2 = plt.subplot(gs[0, 1])
ax3 = plt.subplot(gs[1, :])

# Convert the bootstrap ratios into a node x node matrix of functional weights
# and plot them, sorting by community assignment. This will give us an idea of
# which communities / networks are contributing most.
Example #53
0
def Roc_curve(y_test, y_score, n_classes):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from itertools import cycle
    from sklearn.metrics import roc_curve, auc

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    l = len(set(y_score))

    for i in range(l):
        fpr[i], tpr[i], _ = roc_curve(
            np.array(pd.get_dummies(y_test))[:, i],
            np.array(pd.get_dummies(y_score))[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(l)]))

    mean_tpr = np.zeros_like(all_fpr)
    for i in range(l):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])  # np.interp replaces the removed scipy.interp alias

    mean_tpr = mean_tpr / l  # average over the l per-class curves

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    lw = 2
    plt.figure(figsize=(8, 5))
    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='green',
             linestyle=':',
             linewidth=4)

    colors = cycle([
        'chocolate', 'aqua', 'darkorange', 'cornflowerblue', 'cadetblue',
        'sienna', 'cornflowerblue'
    ])
    for i, color in zip(range(l), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', color='red', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.annotate('Random Guess', (.5, .48), color='red')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic for Multi Layered Perceptron')
    plt.legend(loc="lower right")
    plt.show()
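# Hedged usage sketch for Roc_curve() above. Note that y_score is treated as hard
# class predictions (it is one-hot encoded via pd.get_dummies), not probabilities;
# the labels below are made up.
y_true = [0, 1, 2, 1, 0, 2, 2, 1, 0]
y_pred = [0, 2, 2, 1, 0, 1, 2, 1, 0]
Roc_curve(y_true, y_pred, n_classes=3)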
Example #54
0
# Load the means, stds, folder_names, file_names from file "mean_std_names.npz"
loaded = np.load("mean_std_names.npz")
means = loaded['means']
stds = loaded['stds']
folder_names = loaded['folder_names']

# Your code here #
# Compute squared distance
distance_to_target = (means - 110)**2 + (stds - 81)**2
# Get indices of data points, sorted by distance
distance_to_target_inds = np.argsort(distance_to_target)
# Get folder_names of 100 closest data point indices
close_folder_names = folder_names[distance_to_target_inds[:100]]
# Check which folder name occurred a lot in the closest points
close_folder_names, counts = np.unique(close_folder_names, return_counts=True)
# Print folder names and counts
print(f"Close to cluster: {list(zip(close_folder_names, counts))}")
# Answer: Folder 080 is the culprit!

#
# Task 03
#

# Use UMAP to cluster the data instead of t-SNE.
# Simply follow the guide on
# https://umap-learn.readthedocs.io/en/latest/clustering.html
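# Minimal sketch for Task 03, assuming the `umap-learn` package is installed and
# that `data` holds the (n_samples, n_features) array that was clustered with
# t-SNE earlier in this script (not shown here).
import umap
from sklearn.cluster import KMeans

reducer = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42)
embedding = reducer.fit_transform(data)                    # 2D embedding
cluster_ids = KMeans(n_clusters=10, n_init=10).fit_predict(embedding)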

#
# Task 04
#
Example #55
0
# train
train(directories, dataset_settings, network_settings, train_settings)

#%%
""" Train on OSCD-dataset, fixed patches """
from train import train
directories['data_path'] = os.path.join(directories['intermediate_dir_cd'],
                                        directories['data_dir_oscd'])

directories['labels_path'] = os.path.join(directories['intermediate_dir_cd'],
                                          directories['labels_dir_oscd'])

dataset_train = pd.read_csv(
    os.path.join(directories['results_dir'], 'train_patches_oscd.csv'))
indices = np.unique(dataset_train['im_idx'].values)

# shuffle
np.random.seed(567)
dataset = np.random.choice(indices, len(indices), replace=False)

# splits for cross-validation
k = 2  # 'full' for complete training set | 0, ... , n
folds = 3
if k != 'full':
    assert k < folds
    ims_per_fold = int(len(dataset) / folds)
    # train / val split
    val_indices = dataset[k * ims_per_fold:(k + 1) * ims_per_fold]
    train_indices = dataset[np.isin(dataset, val_indices) == False]
else:
Example #56
0
def melt_plot_facet_grid(input_df, dim: str, sexe: str,
                         hue_var: str, groupby_var: str,
                         plot_facet_grid=None) -> pd.DataFrame:
    """
    Function to melt pd.Dataframe for plotting purposes
    :param input_df: pd.DataFrame to be melted
    :param dim: dimension to plot: choose "i" for ingeschrevenen (enrolled) or "d" for gediplomeerden (graduates)
    :param sexe: sexe to plot: "man", "vrouw" or "total"
    :param hue_var: HUE variable on wich dimension to split in each plot
    :param groupby_var: value to groupby on
    :return: melted pd.DataFrame or FacetGrid
    """
    dim = dim
    # The available years differ between ingeschrevenen and gediplomeerden and per data update, so derive them from the column names:
    dim_list = input_df.filter(regex=rf'_{dim}$').columns.tolist()
    years = np.unique(list(map(lambda sub: int(''.join(
        [x for x in sub if x.isnumeric()])), dim_list)))

    if sexe == 'man':
        dim_cols = [f"{years[0]}_man_{dim}", f"{years[1]}_man_{dim}", f"{years[2]}_man_{dim}",
                    f"{years[3]}_man_{dim}", f"{years[4]}_man_{dim}"]
    elif sexe == 'vrouw':
        dim_cols = [f"{years[0]}_vrouw_{dim}", f"{years[1]}_vrouw_{dim}", f"{years[2]}_vrouw_{dim}",
                    f"{years[3]}_vrouw_{dim}", f"{years[4]}_vrouw_{dim}"]
    elif sexe == 'total':
        dim_cols = [f"{years[0]}_tot_{dim}", f"{years[1]}_tot_{dim}", f"{years[2]}_tot_{dim}",
                    f"{years[3]}_tot_{dim}", f"{years[4]}_tot_{dim}"]

    id_vars = [groupby_var, hue_var]
    value_name = "".join(["ingeschreven" if x == 'i' else 'gediplomeerden' for x in dim])

    melt_frame = (
        pd.melt(
            frame=input_df,
            id_vars=id_vars,
            value_vars=dim_cols,
            var_name=sexe,
            value_name=value_name
        )
            .sort_values(
            by=[
                groupby_var,
                hue_var,
                sexe,
            ]
        )
            .groupby([groupby_var, hue_var, sexe])
            .agg({value_name: "sum"})
            .reset_index()
    )
    melt_frame[sexe] = (melt_frame[sexe]
                        .apply(lambda x: re.sub("[^0-9]", " ", x)))
    melt_frame[sexe] = melt_frame[sexe].astype(int)

    if plot_facet_grid:

        no = len(melt_frame[hue_var].unique())
        palette = dict(zip(melt_frame[hue_var].unique(), sns.color_palette("rocket_r", no)))

        grid = sns.FacetGrid(
            melt_frame,
            col=groupby_var,
            palette=palette,
            col_wrap=4,
            hue=hue_var,
            sharex=False,
            sharey=False,
            height=5,
            aspect=1.5,
        )

        grid.map(plt.axhline, y=melt_frame[value_name].mean(), ls=":", c=".5")
        grid.map(plt.plot, sexe, value_name, marker="o")
        grid.add_legend()

        for ax in grid.axes.flat:
            # make the x tick labels visible and readable on every facet
            _ = plt.setp(ax.get_xticklabels(), visible=True, size=12)
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))

        return grid

    return melt_frame
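# Hedged usage sketch (an assumption, not part of the original example): build a tiny
# DataFrame with five year columns per sexe -- the shape the column-name logic above
# expects -- and melt it without plotting. The column names 'instelling' and
# 'opleiding' are illustrative placeholders.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({'instelling': ['A', 'A', 'B', 'B'],
                    'opleiding': ['x', 'y', 'x', 'y']})
for yr in range(2016, 2021):
    toy[f'{yr}_man_i'] = rng.integers(0, 100, size=4)
    toy[f'{yr}_vrouw_i'] = rng.integers(0, 100, size=4)
    toy[f'{yr}_tot_i'] = toy[f'{yr}_man_i'] + toy[f'{yr}_vrouw_i']

melted = melt_plot_facet_grid(toy, dim='i', sexe='man',
                              hue_var='opleiding', groupby_var='instelling')
print(melted.head())
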
import numpy as np
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans

digits = load_digits()
data = scale(digits.data)
y = digits.target

k = len(np.unique(y))
samples, features = data.shape
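# Hedged continuation (an assumption; the original snippet stops here): the imports
# above suggest fitting KMeans with one cluster per digit class.
model = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0)
model.fit(data)
print("inertia:", model.inertia_)
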
import numpy as np
import math as mth
import pynbody
import h5py
import sys

sim_type=sys.argv[1]           #'dm_only' 'dm_gas'
cosmology=sys.argv[2]          #DM_ONLY: 'lcdm' 'cde0' 'wdm2' | DM_GAS: 'lcdm' 'cde000' 'cde050' 'cde099'
snapshot=int(sys.argv[3])      #e.g. '12', '11', ...
den_type='my_den'              #not an option for this code
grid_nodes=1250                #density resolution

#Assigns v to angular momentum of halos
f=pynbody.load("/scratch/GAMNSCM2/%s/%s/snapshot_0%s/snapshot_0%s"%(sim_type,cosmology,snapshot,snapshot))
a=f.dm['pos']
dm_mass=np.unique(f.dm['mass'])
if sim_type=='dm_gas': 
    a=np.vstack((a,f.gas['pos']))
    gas_mass=np.unique(f.gas['mass'])

Xc=np.asarray(a[:,0]).astype(float)
Yc=np.asarray(a[:,1]).astype(float)
Zc=np.asarray(a[:,2]).astype(float)

Xc_min=np.min(Xc)
Xc_max=np.max(Xc)
Yc_min=np.min(Yc)
Yc_max=np.max(Yc)
Zc_min=np.min(Zc)
Zc_max=np.max(Zc)
Example #59
0
def run_tones(rs=None, 
    output_dir='/media/dendrite',
    all_channels_file='../DO1_ALL_CHANNELS', 
    channel_groups_file='../DO1_CHANNEL_GROUPS', 
    analog_channels_file='../ANALOG_7_8',
    ns5_filename=None,
    remove_from_TC=None,
    soft_time_limits=(-1.0, 1.0),
    hard_time_limits=(-.04, .15),
    do_add_bcontrol=True,
    bcontrol_folder='/media/hippocampus/TRWINRIG_DATA/Data/SPEAKERCAL/',
    bcontrol_files=None,
    n_tones=None,
    n_tones_per_bout=200,
    TR_NDAQ_offset_sec=420,
    start_offset=0,
    stop_offset=0,
    do_timestamps=True,
    plot_spectrograms=True,
    break_at_spectrograms=False,
    force_put_neural_data=False,
    do_avg_plots=True,
    do_extract_spikes=True,
    detection_kwargs=None,
    CAR=True,
    save_to_klusters=True,
    do_MUA_grand_plot=True,
    group_multiplier=100,
    psth_time_limits=(None,None),
    do_tuning_curve=True,
    **kwargs):
    """Daily run script for tones (tuning curve)
    
    rs : RS if it exists. If it doesn't, provide the following:
        output_dir, all_channels_file, channel_groups_file, 
        analog_channels_file, ns5_filename, remove_from_TC, soft_time_limits,
        hard_time_limits
    
    do_add_bcontrol: if True, will find bcontrol files, extract information,
        write "tones" and "attens" to directory. (If those files already exist,
        then this block is skipped, so delete them if you want to force.)

        You can specify explicitly, or else it will search.
        
        bcontrol_files : a list of bcontrol files that you've selected.
        
        bcontrol_folder : If bcontrol_files is None, then will look here
            for them. Will try to guess from ns5 time which are appropriate.
            It will keep grabbing files until it finds at least `n_tones`
            tones. If n_tones is None, uses the number of timestamps.

        In either case, the mat files are copied into the directory, and then
        the `tones` and `attens` files are written. Those files are used
        for all subsequent analyses.
    
    plot_spectrograms: if True, will plot spectrograms of the audio stimulus
        for every 200 tones, in order to check that the tones and attens 
        are correctly lined up.
        
        break_at_spectrograms : if True, errors immediately after plotting
        spectrograms, for debugging
    
    do_tuning_curve : if True, plots tuning curve
    
    n_tones_per_bout : int, or list of ints
        Number of tones expected in each bout.
        This is used for calculating timestamps of each tone, from timestamps
        of each trial (which corresponds to a single bcontrol file).
    
    Other parameters should be same as other Dailies.
    """
    if len(kwargs) > 0:
        print("unexpected kwargs")
        print(kwargs)
    
    # Make session
    if rs is None:
        printnow("creating recording session %s" % ns5_filename)
        rsm = rswrap.RecordingSessionMaker(
            data_analysis_dir=output_dir,
            all_channels_file=all_channels_file,
            channel_groups_file=channel_groups_file,
            analog_channels_file=analog_channels_file)

        if remove_from_TC is None:
            remove_from_TC = []

        rs = rsm.make_session(
            ns5_filename=ns5_filename,
            remove_from_TC=remove_from_TC)
    
        rs.write_time_limits(soft_time_limits, hard_time_limits)
    rs.group_multiplier = group_multiplier
    printnow("RS %s" % rs.full_path)

    # add timestamps
    if do_timestamps:
        printnow("adding timestamps")
        # write timestamps to directory
        # have to force, otherwise will sub-time it twice
        times, numbers = rswrap.add_timestamps_to_session(rs, verbose=True, 
            force=True, meth='digital_trial_number')
        
        # Right now one time per trial (bcontrol file)
        # Need to decimate by number of trials per bout
        # First figure out whether int or list of ints
        if not hasattr(n_tones_per_bout, '__len__'):
            n_tones_per_bout = [n_tones_per_bout] * len(times)

        # Now decimate each bout
        # This command works for 200 tones, extrapolate correct formula from it
        # subtimes = np.rint(np.linspace(3300, 1194210, 200)).astype(np.int)
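        # i.e. a spacing of roughly (1194210 - 3300) / (200 - 1) ~= 5984.5 samples per
        # tone, which is where the 5984.5 step used below comes from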
        alltimes = []
        for time, n_tones in zip(times, n_tones_per_bout):
            istart = 3300
            istop = istart + np.rint((n_tones - 1) * 5984.5).astype(int)
            subtimes = np.rint(np.linspace(istart, istop, n_tones)).astype(int)
            alltimes.append(time + subtimes)
        alltimes = np.concatenate(alltimes)
        rs.add_timestamps(alltimes)        
    
    # add bcontrol
    tone_filename = os.path.join(rs.full_path, 'tones')
    atten_filename = os.path.join(rs.full_path, 'attens')
    if do_add_bcontrol and (not os.path.exists(tone_filename) or not \
        os.path.exists(atten_filename)):
        printnow('adding bcontrol')
        
        # First find out how many tones there probably are
        if n_tones is None:
            n_tones = len(rs.read_timestamps())
        
        if bcontrol_files is None:
            # Guess by ns5 filetime
            ns5_stoptime = gettime(rs.get_ns5_filename())
            ns5_startime = ns5_stoptime - datetime.timedelta(seconds=
                old_div(rs.get_ns5_loader().header.n_samples, rs.get_sampling_rate()))
            ns5_stoptime += datetime.timedelta(seconds=TR_NDAQ_offset_sec)
            ns5_startime += datetime.timedelta(seconds=TR_NDAQ_offset_sec)
            mintime = ns5_startime + datetime.timedelta(seconds=start_offset)
            maxtime = ns5_stoptime + datetime.timedelta(seconds=stop_offset)
            
            # Find the bcontrol files that were saved during the recording
            # And sort by time
            allfiles = np.asarray(glob.glob(os.path.join(
                bcontrol_folder, 'speakercal*.mat')))
            bcontrol_filetimes = np.asarray(list(map(gettime, allfiles)))
            sidxs = np.argsort(bcontrol_filetimes)
            bcontrol_filetimes = bcontrol_filetimes[sidxs]
            allfiles = allfiles[sidxs]
            
            # Choose the files within the window
            check_idxs = np.where(
                (bcontrol_filetimes > mintime) & 
                (bcontrol_filetimes < maxtime))[0]
            
            # Iterate through the found files until a sufficient number
            # of tones have been found
            n_found_tones = 0
            found_files = []
            for check_idx in check_idxs:
                # Load file
                filename = allfiles[check_idx]
                tl = myutils.ToneLoader(filename)
                
                # Skip if WN
                if not np.all(tl.tones == 0):
                    found_files.append(filename)
                    n_found_tones += len(tl.tones)
                
                # Break if enough found
                if n_found_tones >= n_tones:
                    break

            # Output debugging info
            print("I found %d tones in %d files" % (
                n_found_tones, len(found_files)))
            if n_found_tones < n_tones:
                print("insufficient tones found ... try increasing start delta")
            
            # More debugging info about first file
            print("Using general offset of " + str(TR_NDAQ_offset_sec) + " ....")
            idx1 = np.where(allfiles == found_files[0])[0]
            offsets = bcontrol_filetimes[idx1-1:idx1+2] - ns5_startime
            poffsets1 = [offset.seconds if offset > datetime.timedelta(0) 
                else -(-offset).seconds for offset in offsets]
            print("First file (prev,curr,next) offsets from start: %d %d %d" % \
                (poffsets1[0], poffsets1[1], poffsets1[2]))
            
            # And last file
            idx1 = np.where(allfiles == found_files[-1])[0]
            offsets = bcontrol_filetimes[idx1-1:idx1+2] - ns5_stoptime
            poffsets2 = [offset.seconds if offset > datetime.timedelta(0) 
                else -(-offset).seconds for offset in offsets]
            print("Last file (prev,curr,next) offsets from stop: %d %d %d" % \
                (poffsets2[0], poffsets2[1], poffsets2[2]))

            # Now put in forward order
            bcontrol_files = np.asarray(found_files)
            
            # Debugging output
            print("Like these results? Here's how to replicate:")
            print("<speakercal_files>")
            for bcf in bcontrol_files:
                print(os.path.split(bcf)[1])
            print("</speakercal_files>")
            print("clock_offset='%d' start_offset='%d %d %d' stop_offset='%d %d %d'" % (
                TR_NDAQ_offset_sec, 
                poffsets1[0], start_offset, poffsets1[1], 
                poffsets2[1], stop_offset, poffsets2[2]))
        
        # Add to RS
        if bcontrol_files is not None:
            for file in bcontrol_files:
                rs.add_file(file)
    
        # Now that we've settled on a canonical bcontrol file ordering,
        # dump tones and attens
        tls = [myutils.ToneLoader(file) for file in bcontrol_files]
        tones = np.concatenate([tl.tones for tl in tls])
        attens = np.concatenate([tl.attens for tl in tls])  
        np.savetxt(tone_filename, tones)
        np.savetxt(atten_filename, attens, fmt='%d')
        

    if plot_spectrograms:
        tones = np.loadtxt(tone_filename)
        attens = np.loadtxt(atten_filename, dtype=int)
        
        # verify timestamps
        timestamps = rs.read_timestamps()
        if len(timestamps) < len(tones):
            print("warning not enough timestamps, discarding tones: " + \
                "%d timestamps but %d tones" % (
                len(timestamps), len(tones)))
            tones = tones[:len(timestamps)]
            attens = attens[:len(timestamps)]
        elif len(timestamps) > len(tones):
            print("warning too many timestamps, provide more tones: " + \
                "%d timestamps but %d tones" % (
                len(timestamps), len(tones)))
        
        # check spectrograms
        # plot debugging spectrograms of audio
        l = rs.get_ns5_loader()
        raw = l.get_chunk_by_channel()
        ain135 = raw[135]
        ain136 = raw[136]
        
        # Spectrogrammer object
        sg = myutils.Spectrogrammer(NFFT=1024, Fs=30e3, max_freq=30e3, 
            min_freq=0, noverlap=512, downsample_ratio=1)
        
        # Fake toneloader to calculate aliased tones
        tl = myutils.ToneLoader()
        tl.tones = tones
        aliased_tones = tl.aliased_tones()
        
        for n in range(0, len(tones) - 5, 200):
            ts = timestamps[n]
            known_tones = aliased_tones[n:n+5]
            slc1 = ain135[ts:ts + int(30e3)]
            slc2 = ain136[ts:ts + int(30e3)]
            
            # Transform and plot
            Pxx, freqs, t = sg.transform(np.mean([slc1, slc2], axis=0))
            myutils.my_imshow(Pxx, t, freqs)
            plt.axis('auto')
            
            # Title with known tones
            plt.title('tl%d %0.1f %0.1f %0.1f %0.1f %0.1f' % (
                n, known_tones[0], known_tones[1], known_tones[2], 
                known_tones[3], known_tones[4]))
            
            # Save to RS
            plt.savefig(os.path.join(rs.full_path, 'tones_%d.png' % n))
            plt.close()
        
        if break_at_spectrograms:
            raise RuntimeError("break_at_spectrograms: stopping after spectrogram plots")

    # put in neural db (does nothing if exists unless forced)
    printnow('putting neural data')
    rs.put_neural_data_into_db(verbose=True, force=force_put_neural_data)

    # plot averages
    if do_avg_plots:
        printnow("avg plots")
        rswrap.plot_avg_lfp(rs, savefig=True)
        rswrap.plot_avg_audio(rs, savefig=True)

    # spike extract
    if do_extract_spikes:
        printnow('extracting spikes')
        rs.generate_spike_block(CAR=CAR, smooth_spikes=False, verbose=True)
        rs.run_spikesorter(save_to_db=True, save_to_klusters=save_to_klusters,
            detection_kwargs=detection_kwargs)
        rs.spiketime_dump()

    # plot MUA stuff
    if do_MUA_grand_plot:
        printnow('mua grand psths')        
        rswrap.plot_all_spike_psths(rs, savefig=True)
        
    
    # make a tuning curve
    if do_tuning_curve:
        # extract tones and attens from each bcontrol file
        tones = np.loadtxt(tone_filename)
        attens = np.loadtxt(atten_filename, dtype=int)
        timestamps = rs.read_timestamps()
        if len(timestamps) < len(tones):
            print("warning not enough timestamps, discarding tones: " + \
                "%d timestamps but %d tones" % (
                len(timestamps), len(tones)))
            tones = tones[:len(timestamps)]
            attens = attens[:len(timestamps)]
        elif len(timestamps) > len(tones):
            print("warning too many timestamps, provide more tones: " + \
                "%d timestamps but %d tones" % (
                len(timestamps), len(tones)))        
        
        
        # parameters for tuning curve
        tc_freqs = 10 ** np.linspace(np.log10(5e3), np.log10(50e3), 15)
        tc_attens = np.unique(attens)

        # Determine which bin each trial belongs to
        tone_freq_bin = np.searchsorted(tc_freqs, tones, side='right') - 1
        tone_atten_bin = np.searchsorted(tc_attens, attens, side='right') - 1
        
        # spike count for each trial
        group = 5
        spike_time_file = os.path.join(rs.last_klusters_dir(),
            '%s.res.%d' % (rs.session_name, group))
        spike_times = np.loadtxt(spike_time_file, dtype=int)
        spike_counts = count_within_window(timestamps, spike_times,
            .005*30e3, .030*30e3)
        
        # reshape into tuning curve
        tc_mean = np.zeros((len(tc_attens), len(tc_freqs) - 1))
        tc_std = np.zeros((len(tc_attens), len(tc_freqs) - 1))        
        tc_median = np.zeros((len(tc_attens), len(tc_freqs) - 1))        
        for n, tc_freq in enumerate(tc_freqs[:-1]):
            for m, tc_atten in enumerate(tc_attens):
                # Which tones go into this bin
                tone_idxs = np.where(
                    (tone_freq_bin == n) & (tone_atten_bin == m))[0]
                if len(tone_idxs) == 0:
                    print("none in this bin %f %d" % (tc_freq, tc_atten))
                    continue        
                
                tc_mean[m, n] = np.mean(spike_counts[tone_idxs])
                tc_median[m, n] = np.median(spike_counts[tone_idxs])
                tc_std[m, n] = np.std(spike_counts[tone_idxs])
        
        # plot
        np.savez('data', tc_mean=tc_mean, tc_freqs=tc_freqs, tc_attens=tc_attens)
        myutils.my_imshow(tc_mean, tc_freqs, tc_attens, cmap=plt.cm.Purples)
        plt.axis('tight')
        plt.colorbar()
        myutils.my_imshow(tc_median, tc_freqs, tc_attens, cmap=plt.cm.Purples)
        plt.colorbar()
        myutils.my_imshow(tc_std, tc_freqs, tc_attens, cmap=plt.cm.gray)
        plt.colorbar()
        plt.show()
    
    return rs
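# Hedged sketch (an assumption): count_within_window is called in the tuning-curve block
# above but is not defined in this snippet. A plausible implementation counts, for each
# trial timestamp, the spikes whose times fall in [t + start, t + stop), all in samples.
import numpy as np

def count_within_window(timestamps, spike_times, start, stop):
    spike_times = np.sort(np.asarray(spike_times))
    starts = np.searchsorted(spike_times, np.asarray(timestamps) + start, side='left')
    stops = np.searchsorted(spike_times, np.asarray(timestamps) + stop, side='left')
    return stops - starts
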
def waverecn(coeffs, wavelet, mode='symmetric', axes=None):
    """
    Multilevel nD Inverse Discrete Wavelet Transform.

    coeffs : array_like
        Coefficients list [cAn, {details_level_n}, ... {details_level_1}]
    wavelet : Wavelet object or name string
        Wavelet to use
    mode : str, optional
        Signal extension mode, see Modes (default: 'symmetric')
    axes : sequence of ints, optional
        Axes over which to compute the IDWT.  Axes may not be repeated.

    Returns
    -------
    nD array of reconstructed data.

    Examples
    --------
    >>> import numpy as np
    >>> from pywt import wavedecn, waverecn
    >>> coeffs = wavedecn(np.ones((4, 4, 4)), 'db1')
    >>> # Levels:
    >>> len(coeffs)-1
    2
    >>> waverecn(coeffs, 'db1')  # doctest: +NORMALIZE_WHITESPACE
    array([[[ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.]],
           [[ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.]],
           [[ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.]],
           [[ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.],
            [ 1.,  1.,  1.,  1.]]])

    """
    if len(coeffs) < 1:
        raise ValueError(
            "Coefficient list too short (minimum 1 array required).")

    a, ds = coeffs[0], coeffs[1:]

    # Raise error for invalid key combinations
    ds = list(map(_fix_coeffs, ds))

    if not ds:
        # level 0 transform (just returns the approximation coefficients)
        return coeffs[0]
    if a is None and not any(ds):
        raise ValueError("At least one coefficient must contain a valid value.")

    coeff_ndims = []
    if a is not None:
        a = np.asarray(a)
        coeff_ndims.append(a.ndim)
    for d in ds:
        coeff_ndims += [v.ndim for k, v in d.items()]

    # test that all coefficients have a matching number of dimensions
    unique_coeff_ndims = np.unique(coeff_ndims)
    if len(unique_coeff_ndims) == 1:
        ndim = unique_coeff_ndims[0]
    else:
        raise ValueError(
            "All coefficients must have a matching number of dimensions")

    if np.isscalar(axes):
        axes = (axes, )
    if axes is None:
        axes = range(ndim)
    else:
        axes = tuple(axes)
    if len(axes) != len(set(axes)):
        raise ValueError("The axes passed to waverecn must be unique.")
    ndim_transform = len(axes)

    for idx, d in enumerate(ds):
        if a is None and not d:
            continue
        # The following if statement handles the case where the approximation
        # coefficient returned at the previous level may exceed the size of the
        # stored detail coefficients by 1 on any given axis.
        if idx > 0:
            a = _match_coeff_dims(a, d)
        d['a' * ndim_transform] = a
        a = idwtn(d, wavelet, mode, axes)

    return a