    def calculateEntropy(self, Y, mship):
        """
            Calculates the split entropy using Y and mship (a logical array
            indicating which child each example is being split into).

            Input:
            ---------
                Y: a label array
                mship: logical array indicating which child each example is split into,
                        i.e. whether it is assigned to the left split or the right one.
            Returns:
            ---------
                entropy: split entropy of the split
        """

        lexam = Y[mship]
        rexam = Y[np.logical_not(mship)]

        pleft = len(lexam) / float(len(Y))
        pright = 1 - pleft

        pl = stats.itemfreq(lexam)[:, 1] / float(len(lexam)) + np.spacing(1)
        pr = stats.itemfreq(rexam)[:, 1] / float(len(rexam)) + np.spacing(1)

        hl = -np.sum(pl * np.log2(pl))
        hr = -np.sum(pr * np.log2(pr))

        sentropy = pleft * hl + pright * hr

        return sentropy
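
Note: scipy.stats.itemfreq, used throughout these examples, was deprecated in SciPy 1.0 and removed in SciPy 1.3; np.unique(..., return_counts=True) is the usual replacement. A minimal sketch of the same split-entropy computation written that way, run on a hypothetical toy label array:

import numpy as np

def split_entropy(Y, mship):
    """Weighted entropy of a split (same idea as calculateEntropy above)."""
    def entropy(labels):
        _, counts = np.unique(labels, return_counts=True)
        p = counts / float(len(labels)) + np.spacing(1)
        return -np.sum(p * np.log2(p))

    lexam, rexam = Y[mship], Y[~mship]
    pleft = len(lexam) / float(len(Y))
    return pleft * entropy(lexam) + (1 - pleft) * entropy(rexam)

# toy check: a perfectly pure split has (near) zero entropy
Y = np.array([0, 0, 1, 1])
mship = np.array([True, True, False, False])
print(split_entropy(Y, mship))  # ~0.0
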
Example #2
def dll_type_2(tree, name_only = False):
    dll_list = {"name": [], "addr": []} 

    for el in tree.iter():
        # obtain DLL target in element
        if el.tag == "load_dll":
            dll_name = el.get('filename')
            #split filename and file_address
            key_bag = dll_name.split("\\")
            
            dll_name = key_bag[len(key_bag) - 1]
            dll_addr = "//".join(key_bag[:(len(key_bag) - 1)])
            #TODO: convert to lower case

            dll_list["name"].append(dll_name)
            dll_list["addr"].append(dll_addr)

    dll_list["name"] = stats.itemfreq(dll_list["name"])
    dll_list["addr"] = stats.itemfreq(dll_list["addr"])    
    dll_list_join = concatenate([dll_list["name"], dll_list["addr"]])

    dll_name_counter = Counter()
    for item in dll_list_join: 
        dll_name_counter[item[0]] = int(item[1])

    return dll_name_counter
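
The same DLL counting can also be done directly with collections.Counter, which avoids casting itemfreq's object-array counts back to int; a small sketch assuming a hypothetical list of filenames as they would come from el.get('filename'):

from collections import Counter

filenames = [r"C:\Windows\System32\kernel32.dll",
             r"C:\Windows\System32\user32.dll",
             r"C:\Windows\System32\kernel32.dll"]

names = [f.split("\\")[-1] for f in filenames]                # DLL base names
addrs = ["//".join(f.split("\\")[:-1]) for f in filenames]    # directory parts, joined as above

dll_name_counter = Counter(names) + Counter(addrs)
print(dll_name_counter)  # Counter({'C://Windows//System32': 3, 'kernel32.dll': 2, 'user32.dll': 1})
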
Example #3
def land_sic_overlap_timeseries(instrument,
                                title="Land-Sea Ice Border Variations"):
    """
        Time series showing the percentage variation of the land mask
        border given the expansion of sea ice in VIRS.
    """

    files = data.file_names(instrument_id=data.INSTRUMENT_MAP.get(instrument))
    out = []

    for idx, mat in enumerate(data.mat_generator(files)):

        sic = SIC(files[idx])
        lm = LM(files[idx])

        sic_surface = sic.surface(boolean=False)
        lm_surface = lm.silhoutte()

        silhoutte_freq = itemfreq(lm_surface)
        border = silhoutte_freq[1][1]

        merge = np.add(sic_surface, lm_surface)
        merge_freq = itemfreq(merge)
        intercept = merge_freq[2][1]

        land_ice_overlap = (float(intercept) / border) * 100
        temp = {'timestamp': lm.title, 'intercept': land_ice_overlap}
        out.append(temp)

    index = [elem['timestamp'] for elem in out]
    df = DataFrame(out, index=index)
    sdf = df.sort_values(by='timestamp')
    sdf.plot(title=title)
    plt.show()
Example #4
def getDomColour(image):
    # use k-means clustering to create a palette with the n_colours (here 5) most representative colours of the image
    arr = np.float32(image)
    pixels = arr.reshape((-1, 3))

    n_colours = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 25, 0.5)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colours, None, criteria, 10,
                                      flags)

    palette = np.uint16(centroids)

    #     quantized = palette[labels.flatten()]
    #     quantized = quantized.reshape(image.shape)

    # the dominant colour is the palette colour which occurs most frequently on the quantised image:

    index_domcol = np.argmax(itemfreq(labels)[:, -1])
    domcol = palette[index_domcol]
    freq_domcol = itemfreq(labels)[:, 1][index_domcol]
    interestingness_domcol = howInteresting(domcol)
    lightness_domcol = getLightness(domcol)
    dominance_domcol = interestingness_domcol * freq_domcol * lightness_domcol
    # compare each palette colour against the current best, skipping dark colours
    # and keeping track of the highest dominance score seen so far
    for index, colour in enumerate(palette):
        interestingness = howInteresting(colour)
        freq = itemfreq(labels)[:, 1][index]
        lightness = getLightness(colour)
        dominance = interestingness * freq * lightness
        if dominance > dominance_domcol and not isDark(colour):
            domcol = palette[index]
            dominance_domcol = dominance
    return domcol
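
For integer k-means labels, the per-cluster pixel counts given by itemfreq(labels)[:, 1] can also be obtained with np.bincount; a minimal sketch under that assumption:

import numpy as np

labels = np.array([0, 2, 1, 0, 0, 2])       # hypothetical cluster label per pixel
counts = np.bincount(labels.flatten())      # counts[i] = number of pixels in cluster i
index_domcol = np.argmax(counts)            # most frequent cluster -> dominant colour index
print(counts, index_domcol)                 # [3 1 2] 0
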
Example #5
    def calculateEntropy(self, Y, mship):
        """
            Calculates the split entropy using Y and mship (a logical array
            indicating which child each example is being split into).

            Input:
            ---------
                Y: a label array
                mship: logical array indicating which child each example is split into,
                        i.e. whether it is assigned to the left split or the right one.
            Returns:
            ---------
                entropy: split entropy of the split
        """

        lexam = Y[mship]
        rexam = Y[np.logical_not(mship)]

        pleft = len(lexam) / float(len(Y))
        pright = 1 - pleft

        pl = stats.itemfreq(lexam)[:, 1] / float(len(lexam)) + np.spacing(1)
        pr = stats.itemfreq(rexam)[:, 1] / float(len(rexam)) + np.spacing(1)

        hl = -np.sum(pl * np.log2(pl))
        hr = -np.sum(pr * np.log2(pr))

        sentropy = pleft * hl + pright * hr

        return sentropy
def plotStratification(gCNdata,tCNdata,tRnaData,newGeneName,nameGOI,theAxes,strat):
    # color, shape, and alpha schemes for the stratification
    if strat == 1:
        myColorScheme = ['c','b','g','r','m']*5
        myShapeScheme = ['<']*5+['v']*5+['o']*5+['^']*5+['>']*5
        myAlphaScheme = [0.1]*5+[0.1]*5+[0.1]*5+[0.1]*5+[0.1]*5
    elif strat == 2:
        myColorScheme = ['c']*5 + ['b']*5 + ['g']*5 + ['r']*5 + ['m']*5
        myShapeScheme = ['<','v','o','^','>']*5
        myAlphaScheme = [0.1]*5+[0.1]*5+[0.1]*5+[0.1]*5+[0.1]*5
    
    sumCNdata = 10*tCNdata + 2*gCNdata
    theColorCn = sumCNdata + 24
    colorDist = ss.itemfreq(theColorCn)
    colorDist = colorDist[:,0]
    for level in colorDist:
        thisIndex = level/2
        theAxes.scatter(sumCNdata[theColorCn == level],tRnaData[theColorCn == level],s=100,alpha=0.3,color=myColorScheme[int(thisIndex)],marker=myShapeScheme[int(thisIndex)])
    sumCnDist = ss.itemfreq(sumCNdata)
    sumCnLevels = sumCnDist[:,0]
    sumCnCounts = sumCnDist[:,1]
    meanCN = np.mean(gCNdata)
    stdCN = np.std(gCNdata)
    tMeanCN = np.mean(tCNdata)
    tStdCN = np.std(tCNdata)
    sumMeanExpT = [np.mean(tRnaData[sumCNdata == i]) for i in sumCnLevels]
    sumStdExpT = [np.std(tRnaData[sumCNdata == sumCnLevels[i]])/np.sqrt(sumCnCounts[i]) for i in range(len(sumCnLevels))]
    theAxes.errorbar(sumCnLevels,sumMeanExpT,sumStdExpT,marker='_',markersize=15,markeredgewidth=2,color='k',elinewidth=3,capsize=4)
    theAxes.errorbar(10*tMeanCN,min(tRnaData),xerr=10*tStdCN,marker='^',markersize=10,elinewidth=3,color='k')
    theAxes.errorbar(10*meanCN,min(tRnaData)-0.1,xerr=10*stdCN,marker='^',markersize=10,elinewidth=3,color='k')
    theAxes.set_xticks(np.arange(-25,25,5))
    theAxes.grid()
    theAxes.set_ylabel('RNA Expression of %s' %newGeneName)
Example #7
def learn_with_test(dataset, testset):
    matrix_ds = np.asarray(dataset)
    matrix_ds_test = np.asarray(testset)
    # clf = linear_model.LogisticRegression()
    training_target = matrix_ds[:, TYPE_INDEX]
    training_dataset = matrix_ds[:, 1:TYPE_INDEX].astype(np.float)
    testing_target = matrix_ds_test[:, TYPE_INDEX]
    testing_dataset = matrix_ds_test[:, 1:TYPE_INDEX].astype(np.float)

    # Parameter selection
    # Set the parameters by cross-validation

    # cv = StratifiedShuffleSplit(training_target, n_iter=5, test_size=0.2, random_state=42)
    # C_range = 10.0 ** np.arange(-3, 3)
    # gamma_range = 10.0 ** np.arange(-3, 3 )
    # param_grid = dict(gamma=gamma_range, C=C_range)

    # clf = GridSearchCV(SVC(), param_grid, cv=cv)
    # clf.fit(training_dataset, training_target)
    # print("The best parameters are %s with a score of %0.2f"
    # % (clf.best_params_, clf.best_score_))
    clf = SVC(kernel="rbf", C=10, gamma=0.1)

    clf.fit(training_dataset, training_target)
    # print clf.score(testing_dataset, testing_target)
    predictions = clf.predict(testing_dataset)
    print itemfreq(predictions)
    def test_scale(self):
        dataset = loader.load_kanade(shared=False,
                                     n=2,
                                     pre={'scale2unit': True})

        self.assertTrue(len(dataset[0]) == 2 and len(dataset[1]) == 2)

        print dataset[0]
        print itemfreq(dataset[0])
Example #9
    def get_feature_distribution(self):
        '''

        get feature distribution on given dataset.

        '''

        return ({
            'y_train': itemfreq(self.y_train),
            'y_test': itemfreq(self.y_test),
        })
def run_example(data_path):
    """
    method to demonstrate the usage of grbm.
    :param: data_path path of dataset
    :type: String
    """

    print('... loading data')

    # Load the dataset
    f = gzip.open(data_path, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    X_train, Y_train = train_set
    print('train x: ', X_train.shape)
    print('train Y: ', Y_train.shape)
    #valid_set_x, valid_set_y = datasets[1]
    X_test, Y_test = test_set
    print('test X: ', X_test.shape)
    print('test Y: ', Y_test.shape)
    print('label count for training data:')
    print(itemfreq(Y_train))
    print('label count for test data:')
    print(itemfreq(Y_test))

    parameters_GRBM = [[200, 2, 10, 0.01, 0.9, 1, 'None',0.1, 0.1, 0.0],
                       [200, 100, 10, 0.01, 0.9, 1, 'L1', 0.2, 0.2, 0.0],
                       [200, 10, 10, 0.01, 0.9, 1, 'L2', 0.3, 0.3, 0.5]]

    for param_grbm in parameters_GRBM:
        grbm = GRBM(random_state=0)
        grbm.n_hidden = param_grbm[0]  
        grbm.grbm_n_iter = param_grbm[1]
        grbm.grbm_batch_size = param_grbm[2]
        grbm.grbm_learning_rate = param_grbm[3]  # fitting time
        grbm.grbm_momentum = param_grbm[4]
        grbm.grbm_n_gibbs_steps = param_grbm[5]
        grbm.penalty = param_grbm[6]
        grbm.C1 = param_grbm[7]
        grbm.C2 = param_grbm[8]
        grbm.pdrop = param_grbm[9]
        
        grbm.fit(X_train, Y_train)

        Y_pred = grbm.predict(X_test)

        score = metrics.accuracy_score(Y_test, Y_pred)

        print('Acc score for test set:', score)
        print("GRBM report:\n%s\n" % (
            metrics.classification_report(
                Y_test,
                Y_pred)))
def num_cluster(data_bipart):
    random.seed(17)
    # integer division so n_clusters stays an int
    num = len(data_bipart) // 300 + 1
    kmeans_bipart = KMeans(n_clusters=num, random_state=0).fit(data_bipart)
    labels_bipart = kmeans_bipart.labels_
    max_group = max(itemfreq(labels_bipart)[:, 1])
    while max_group > 350:
        num += 2
        kmeans_bipart = KMeans(n_clusters=num, random_state=0).fit(data_bipart)
        labels_bipart = kmeans_bipart.labels_
        max_group = max(itemfreq(labels_bipart)[:, 1])
    return num
    def segmenter(self, data, labels):
        best_impurity = float('inf')
        best_left, best_right, best_rule = None, None, None
        data_size, num_features = data.shape

        # Random forest
        if self.rf:
            for i in range(self.trees):
                # sample a random subset of 10 candidate features for this tree
                m = random.sample(range(num_features), 10)
                for j in m:
                    feature = data[:, j]
                    n = random.sample(range(data_size), int(data_size / 50))
                    for val in n:
                        left_indices = np.nonzero(feature < val)[0]
                        right_indices = np.nonzero(feature >= val)[0]
                        split_rule = (j, val)
                        left_labels = labels[left_indices]
                        right_labels = labels[right_indices]
                        if left_labels.size == 0 or right_labels.size == 0:
                            continue
                        impurity = self.impurity(itemfreq(left_labels),
                                                 itemfreq(right_labels))
                        if impurity < best_impurity:
                            best_impurity = impurity
                            best_rule = split_rule
                            best_left = left_indices
                            best_right = right_indices
        # Normal DT
        else:
            for i in range(num_features):
                feature = data[:, i]
                for val in feature:
                    # mean = np.mean(feature)
                    # left_indices = np.nonzero(feature < mean)[0]
                    # right_indices = np.nonzero(feature >= mean)[0]
                    # split_rule = (i, mean)
                    left_indices = np.nonzero(feature < val)[0]
                    right_indices = np.nonzero(feature >= val)[0]
                    split_rule = (i, val)
                    left_labels = labels[left_indices]
                    right_labels = labels[right_indices]
                    if left_labels.size == 0 or right_labels.size == 0:
                        continue
                    impurity = self.impurity(itemfreq(left_labels),
                                             itemfreq(right_labels))
                    if impurity < best_impurity:
                        best_impurity = impurity
                        best_rule = split_rule
                        best_left = left_indices
                        best_right = right_indices

        return best_rule, best_left, best_right
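
The impurity function itself is not shown in this snippet; as an assumed illustration, a weighted-entropy impurity that accepts the two itemfreq-style (value, count) tables might look like this:

import numpy as np

def impurity(left_freq, right_freq):
    """Weighted entropy of a split, given (value, count) tables for each child."""
    def entropy(freq):
        counts = freq[:, 1].astype(float)
        p = counts / counts.sum()
        return -np.sum(p * np.log2(p + np.spacing(1)))

    n_left = left_freq[:, 1].sum()
    n_right = right_freq[:, 1].sum()
    n = float(n_left + n_right)
    return (n_left / n) * entropy(left_freq) + (n_right / n) * entropy(right_freq)
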
def main():
    u_data = csv.reader(open('ml-100k/u.data', 'rb'), delimiter='\t')
    columns = list(zip(*u_data))
    # column 1: user id
    col1 = np.array(columns[0]).astype(np.int)
    # column 2: item id
    col2 = np.array(columns[1]).astype(np.int)

    # review_list[u] = a list of movies that were reviewed by user u + 1
    review_list = user_review_list(col1, col2)[1:]

    mat = np.zeros(U * (U - 1))
    cnt = 0
    for i in range(U):
        for j in range(U):
            if i != j:
                mat[cnt] = len(np.intersect1d(review_list[i], review_list[j]))
                cnt = cnt + 1

    # on average, how many movies are commonly reviewed by a user pair?
    mean = np.mean(mat)
    # median number of movies commonly reviewed by a user pair
    median = np.median(mat)
    # how many user pairs have rated that many movies?
    freq_table1 = stats.itemfreq(mat)
    maximum = freq_table1[-1, 0]
    minimum = freq_table1[0, 0]

    # display results
    print 'mean:', mean
    print 'median:', median
    # print freq_table1[:, 0]
    interval = 10
    plot_hist(freq_table1[:, 0], freq_table1[:, 1])

    # measure how many reviews each movie has
    freq_table2 = stats.itemfreq(col2)
    # which movies have the most/fewest reviews?
    most = reviews(freq_table2, np.amax)
    fewest = reviews(freq_table2, np.amin)
    # display results
    print 'movies that have the most reviews:', most[0]
    print 'number of reviews:', most[1]
    print 'movies that have the fewest reviews:', fewest[0]
    print 'number of reviews:', fewest[1]

    # sort the movies based on their number of reviews
    sorted_movies = sort_movies(freq_table2)
    # display results
    plot_line(np.arange(len(sorted_movies[:, 0])), sorted_movies[:, 1])
Example #14
 def check_subset(data, subset):
     """frequency of each element than compare them"""   
     if all(elem in data for elem in subset):
         data_freq = itemfreq(data)
         subset_freq = itemfreq(subset)
         for elem in subset_freq: 
             if elem[0] in data_freq[:, 0]:
                 itemindex = np.where(data_freq[:, 0] == elem[0])
                 if (len(elem[0]) != len(data_freq[itemindex][0][0])) or \
                         (int(data_freq[itemindex][0][1]) < int(elem[1])):
                     return False
             else:
                 return False
         return True
     return False
Example #15
def api_id():
    # Check if an ID was provided as part of the URL.
    # If ID is provided, assign it to a variable.
    # If no ID is provided, display an error in the browser.
    if 'p1' in request.args and 'p2' in request.args:  # and 'cat' in request.args:
        p1 = request.args.get('p1')
        p2 = request.args.get('p2')
        #cat = request.args.get('cat')
    else:
        return "Input error."

    # get images from URL
    ssl._create_default_https_context = ssl._create_unverified_context
    req1 = urllib.request.urlopen(p1)
    arr1 = np.asarray(bytearray(req1.read()), dtype=np.uint8)
    img1 = cv2.imdecode(arr1, cv2.IMREAD_COLOR)
    req2 = urllib.request.urlopen(p2)
    arr2 = np.asarray(bytearray(req2.read()), dtype=np.uint8)
    img2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)

    # calc color correlation
    img1blk = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2blk = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
    hist1 = cv2.calcHist([img1blk], [0], None, [256], [0, 256])
    hist2 = cv2.calcHist([img2blk], [0], None, [256], [0, 256])
    colorDiff = cv2.compareHist(
        hist1, hist2,
        cv2.HISTCMP_BHATTACHARYYA)  # in [0, 1]; the higher this is, the less similar the images are
    colorDiff = 1 - colorDiff

    # texture comparison
    lbp1 = local_binary_pattern(img1blk, 24, 3, method='uniform')
    freq1 = itemfreq(lbp1.ravel())
    text_hist1 = freq1[:, 1] / sum(freq1[:, 1])  # normalize
    lbp2 = local_binary_pattern(img2blk, 24, 3, method='uniform')
    freq2 = itemfreq(lbp2.ravel())
    text_hist2 = freq2[:, 1] / sum(freq2[:, 1])
    textDiff = cv2.compareHist(np.array(text_hist1, dtype=np.float32),
                               np.array(text_hist2, dtype=np.float32),
                               cv2.HISTCMP_BHATTACHARYYA)
    textDiff = 1 - textDiff

    # feature matching
    #sift = cv2.xfeatures2d.SIFT_create()
    #kp_1, desc_1 = sift.detectAndCompute(img1, None)
    #kp_2, desc_2 = sift.detectAndCompute(img2, None)

    return str(round(colorDiff * 0.5 + textDiff * 0.5, 2) * 100)
Example #16
def compute_histogram(data, labels):
    histogram = itemfreq(sorted(data))
    for label in labels:
        if label not in histogram[:, 0]:
            histogram = np.vstack((histogram, np.array([[label, 0]], dtype=object)))
    histogram = histogram[histogram[:, 0].argsort()]
    return histogram
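
A small usage sketch (hypothetical data, SciPy < 1.3 so that itemfreq is still available): labels that never occur in the data are padded in with a zero count and the table is re-sorted by label:

data = [2, 0, 2, 2]
labels = [0, 1, 2]
print(compute_histogram(data, labels))
# [[0 1]
#  [1 0]
#  [2 3]]
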
def call_freq(tree, name_only=False):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made. 
      (in other words, it returns a dictionary indicating what the first and 
      last system calls made by an executable were.)
    """
    callz = []
    in_all_section = False
    first = True  # is this the first system call
    last_call = None  # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            callz.append(el.tag)

    # finally, count the frequencies
    freqList = stats.itemfreq(callz)

    if name_only == True:
        c = set(callz)
    else:
        c = Counter()
        for item in freqList:
            c["sys_call-" + item[0]] = int(item[1])

    return c
Example #18
    def dominant_color(cls, img, k):
        """Return an RGB tuple of the dominant color in an image

        Performs k-means clustering on the image's pixels, then selects the centroid
            of the largest cluster to be the dominant color of the image

        Uses kmeans++ for cluster initialization

        :param img: The image to analyze, read in via cv2.imread()
        :param k: The number of clusters to use
        :return: The RGB tuple of the dominant color in the image
        """

        img_as_float32 = np.float32(img)
        pixels = img_as_float32.reshape((-1, 3))
        # Stop after MAX_ITER iterations OR accuracy EPS (epsilon) is reached
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200,
                    0.1)
        # Use kmeans++ center initialization
        flags = cv2.KMEANS_PP_CENTERS
        # Amount of times the algorithm is attempted
        attempts = 10

        _, labels, centroids = cv2.kmeans(pixels, k, None, criteria, attempts, flags)

        candidate_dominant_colors = np.uint8(centroids)
        # The dominant color is the cluster with the largest number of members/pixels
        dominant_color_idx = np.argmax(itemfreq(labels)[:, -1])
        dominant_color_tuple = candidate_dominant_colors[dominant_color_idx]

        d_blue = dominant_color_tuple[0]
        d_green = dominant_color_tuple[1]
        d_red = dominant_color_tuple[2]

        return (d_red, d_green, d_blue)
Example #19
    def calc_accuracy(self, x, y, method='repmet'):
        """
        Calculate the accuracy of reps based on the current clusters
        :param x: embedding vectors of the samples
        :param y: ground-truth labels for x
        :param method: 'unsupervised' for cluster-assignment accuracy, otherwise passed to self.predict
        :return: accuracy as a float in [0, 1]
        """

        if method == 'unsupervised':
            # Tries finding a cluster for each class, and then assigns cluster labels to each cluster based on the max
            # samples of a particular class in that cluster
            k = np.unique(y).size
            kmeans = KMeans(n_clusters=k, max_iter=35, n_init=15,
                            n_jobs=-1).fit(x)
            emb_labels = kmeans.labels_
            G = np.zeros((k, k))
            for i in range(k):
                lbl = y[emb_labels == i]
                uc = itemfreq(lbl)
                for uu, cc in uc:
                    G[i, uu] = -cc
            A = linear_assignment_.linear_assignment(G)
            acc = 0.0
            for (cluster, best) in A:
                acc -= G[cluster, best]
            return acc / float(len(y))
        else:
            predictions = self.predict(x, method=method)
            correct = predictions == y
            return correct.astype(float).mean()
    def _fit_model(self, fcol, dis):
        """Determine the best fit for one feature column given distribution name

        Parameters
        ----------
        fcol: feature column, array
        dis: distribution name, String


        Returns
        ----------
        function: fit model with feature as argument

        """
        if dis == 'ratio':
            itfreq = itemfreq(fcol)
            uniqueVars = itfreq[:, 0]
            freq = itfreq[:, 1]
            rat = freq / sum(freq)
            rat = dict(zip(uniqueVars, rat.T))
            func = lambda x: self.funcs[dis](x, rat)
        if dis == 'poisson':
            lamb = np.nanmean(fcol, axis=0)
            func = lambda x: self.funcs[dis](x, lamb)
        if dis == 'norm':
            sigma = np.nanvar(fcol, axis=0)
            theta = np.nanmean(fcol, axis=0)
            func = lambda x: self.funcs[dis](x, sigma, theta)
        return np.vectorize(func)
Example #21
 def ClusterSizes(self):
     """Returns an array containing the number of points in each cluster."""
     if not any(self.__clsizes):
         self.__clsizes = np.zeros(self.NClusters())
         tmp = itemfreq(self.__ClusterID)
         self.__clsizes[tmp[:, 0]] = tmp[:, 1]
     return self.__clsizes
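
For non-negative integer cluster IDs the same size table can be built with np.bincount, which fills in zero-sized clusters automatically; a minimal sketch under that assumption:

import numpy as np

cluster_ids = np.array([0, 0, 1, 3, 3, 3])        # hypothetical __ClusterID array
clsizes = np.bincount(cluster_ids, minlength=4)   # one entry per cluster, zeros included
print(clsizes)  # [2 1 0 3]
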
Example #22
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # sd_scaler = StandardScaler()
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    for g in genes:
        # scaled = sd_scaler.fit(data.loc[g, :])
        fit = db.fit(np.reshape(data.loc[g, :], (196, 1)))

        candidates = itemfreq(fit.labels_)

        try:
            class_zero = candidates[0][1]
            class_one = candidates[1][1]

            support = min(class_one, class_zero)

            if min_samples < support <= max_samples:
                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename, "type": info.genetype, "samples": str(support),
                                  "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except:
            pass
Example #23
def cluster_outliers(data, genes, max_samples, min_dist=0.8, mining_id=1, as_json=True):
    estimator = cluster.KMeans(2)  # init kmeans
    samples_from_perc = round(max_samples * len(data.columns) / 100)
    print(samples_from_perc)
    ens = False
    info = None

    if str(genes[0]).startswith("ENSG"):
        res = dr.get_dataset_ensembl_info()
        ens = True
    outliers_id = []
    # debug_count = 0
    if as_json:
        yield (u"{\"outliers\":[")
    for g in genes:
        # if debug_count > 10:
        # break
        try:
            gene_row = data.loc[g, :].dropna()
            gene_row = gene_row.to_frame()

            estimator.fit(gene_row)  # conversion to dframe for model fit
            candidates = itemfreq(estimator.labels_)
            class_zero = candidates[0][1]
            class_one = candidates[1][1]
            support = min(class_one, class_zero)
            majority_class = class_one > class_zero
            dist = abs(max(gene_row[estimator.labels_ == majority_class]) - max(
                gene_row[estimator.labels_ == 1 - majority_class]))
            ran = gene_row.max() - gene_row.min()

            ndist = dist / float(ran)
            print(ndist)
            if 0 < support <= samples_from_perc and min_dist < ndist < 1:
                # debug_count += 1
                if ens:
                    info = [gene for gene in res if gene.ensemblgeneid == g][0]
                    formatted_info = {"identifier": g, "name": info.genename, "type": info.genetype, "samples": str(support),
                                      "distance": str(ndist), "range": str(ran)}
                else:
                    formatted_info = {"identifier": g, "name": "Not available", "type": "Not available", "samples": str(support),
                                      "distance": str(ndist), "range": str(ran)}

                outliers_id.append(formatted_info)
                print("outlier found :" + g)
                if as_json:
                    jinfo = json.dumps(formatted_info)
                    jinfo += u","
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except:
            # If clustering fails for a gene (e.g. no variation, so the majority-class
            # selection breaks), catch the error and continue with the next gene;
            # no harm done, since a gene with identical values has no outliers.
            pass

    if len(outliers_id) > 0:
        pr.save_outliers(mining_id, outliers_id)
        yield(str(u"]}"))
Example #24
    def pixelize_at_target_nside(self, nside):
        # this also effectively applies the mask to the data
        self.nside = nside
        # so averages are computed in downgrade
        self.mask[self.mask == hp.UNSEEN] = 0  # step 1
        mask_targetnside = hp.pixelfunc.ud_grade(
            self.mask, pess=False, nside_out=self.nside)
        gal_index_targetnside = radec_to_index(
            self.data['DEC'], self.data['RA'], self.nside)
        mask_targetnside[mask_targetnside == 0] = hp.UNSEEN
        self.mask[self.mask == 0] = hp.UNSEEN  # undo step 1

        # prune data that's in a bad part of the mask
        self.data = self.data[mask_targetnside[
            gal_index_targetnside] != hp.UNSEEN]

        counts = itemfreq(gal_index_targetnside)
        full_map_counts = np.zeros(hp.nside2npix(self.nside))
        full_map_counts[counts[:, 0]] = counts[:, 1]

        good_counts = full_map_counts[np.where(mask_targetnside != hp.UNSEEN)]
        good_fracs = mask_targetnside[np.where(mask_targetnside != hp.UNSEEN)]

        self.nbar = np.average(good_counts, weights=good_fracs)
        print 'nbar is', self.nbar, 'galaxies per pixel'
        pixels_to_count = np.where(mask_targetnside != hp.UNSEEN)
        dec, ra = index_to_radec(pixels_to_count, self.nside)
        final_counts = full_map_counts[pixels_to_count]
        self.pixelized = (ra[0], dec[0], final_counts)
Example #25
def land_sic_overlap(lm_image, sic_image):
    """
        Show Sea Ice Concentration and Land Mask together. This figure shows
        the overlaps between mw_sic and lm.
    """

    lm = lm_image
    sic = sic_image

    sic_surface = sic.surface(boolean=False)
    lm_surface = lm.image()
    condlist = [lm_surface == 1]
    choicelist = [3]
    merge = np.add(sic_surface, np.select(condlist, choicelist))
    freqs = itemfreq(merge)

    # Pie Chart config params
    labels = "Sea Water", "Sea Ice", "Land", "Land - Sea Ice Overlap"
    colors = ["blue", "lightblue", "yellow", "red"]
    values = [freqs[0][1], freqs[1][1], freqs[2][1], freqs[3][1]]

    # Make and configure the figure to be displayed
    fig, axes = plt.subplots(1, 2)

    fig.subplots_adjust(hspace=0.3, wspace=0.05)

    #populate each axis of the figure
    axes[0].imshow(merge)
    axes[0].set_title("Sea Ice and Land Mask")
    axes[1].pie(values, explode=[0.1, 0.1, 0.1, 0.4], labels=labels,
                colors=colors, shadow=True, autopct='%1.2f%%')
    plt.show()
    def CombinedMeanShift(self, h, alpha,
                          PrincComp=None,
                          njobs=-2,
                          mbf=1):
        """Performs the scikit-learn Mean Shift clustering.

        Arguments:

        h -- the bandwidth
        alpha -- the weight of the principal components as compared
        to the spatial data.
        PrincComp -- used to pass already-computed principal components
        njobs -- the number of processes to be used (default: n. of CPU - 1)
        mbf -- the minimum number of items in a seed"""

        MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                       min_bin_freq=mbf, n_jobs=njobs)
        if PrincComp is None:
            PrincComp = self.ShapePCA(2)
        print("Starting sklearn Mean Shift... ")
        stdout.flush()
        fourvector = np.vstack((self.__data, alpha * PrincComp))
        MS.fit_predict(fourvector.T)
        self.__ClusterID = MS.labels_
        self.__c = MS.cluster_centers_.T
        self.__clsizes = itemfreq(self.__ClusterID)[:, 1]
        print("done.")
        stdout.flush()
 def ClusterSizes(self):
     """Returns an array containing the number of points in each cluster."""
     if not any(self.__clsizes):
         self.__clsizes = np.zeros(self.NClusters())
         tmp = itemfreq(self.__ClusterID)
         self.__clsizes[tmp[:, 0]] = tmp[:, 1]
     return self.__clsizes
Example #28
def lbf(infile):
    result = []
    label = []
    ix = 0

    for here, i in enumerate(open(infile).readlines()):
        ix +=1
        imgpath, l = i.split(',')
        if os.path.exists(imgpath):

            im = cv2.imread(imgpath)
            im  = cv2.resize(im, (200, 200))
            im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            radius = 3
            no_points = 8 * radius
            lbp = local_binary_pattern(im_gray, no_points, radius, method='uniform')
            x = itemfreq(lbp.ravel())
            hist = x[:, 1]/sum(x[:, 1])
            result.append(hist)
            if "FE" in l:
                label.append(1) 
            else:
                label.append(-1) 
                
    print len(result)
    print len(label)
    def _fit_model(self, fcol, dis):

        """Determine the best fit for one feature column given distribution name

        Parameters
        ----------
        fcol: feature column, array
        dis: distribution name, String


        Returns
        ----------
        function: fit model with feature as argument

        """
        if dis == 'ratio':
            itfreq = itemfreq(fcol)
            uniqueVars = itfreq[:, 0]
            freq = itfreq[:, 1]
            rat = freq / sum(freq)
            rat = dict(zip(uniqueVars, rat.T))
            func = lambda x: self.funcs[dis](x, rat)
        if dis == 'poisson':
            lamb = np.nanmean(fcol, axis=0)
            func = lambda x: self.funcs[dis](x, lamb)
        if dis == 'norm':
            sigma = np.nanvar(fcol, axis=0)
            theta = np.nanmean(fcol, axis=0)
            func = lambda x: self.funcs[dis](x, sigma, theta)
        return np.vectorize(func)
Example #30
    def calculateLBP(self):
        paramList = list()
        with open(self.paramTxt) as f:
            for line in f:
                paramList.append(int(line.strip()))
        print(paramList)
        for image in self.trainDict.iterkeys():
            print(image)
            img = cv2.imread(image)
            imgGray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # radius = 3
            # noPoints = 8 * radius
            radius = paramList[0]
            noPoints = paramList[1] * radius
            print(radius)
            print(noPoints)
            lbpImage = local_binary_pattern(imgGray, noPoints, radius, method='uniform')

            # Calculate the histogram
            x = itemfreq(lbpImage.ravel())
            # normalize the histogram
            hist = x[:, 1] / sum(x[:, 1])

            # hist = cv2.calcHist(lbp, [0], None, [256], [0, 256])
            # cv2.normalize(hist,hist)
            # hist = hist.flatten()

            self.addrImg.append(image)
            self.lbpHistogram.append(hist)
            self.tagNo.append(self.trainDict.get(image))
            joblib.dump((self.addrImg, self.lbpHistogram, self.tagNo), "lbp.pkl", compress=3)
Example #31
    def predict(self):
        print "Predicting"
        predicted_labels = np.zeros(len(self.test_data))
        for i in range(0, len(self.test_data)):
            # calculate the distance between this target point and all train data
            dist = np.linalg.norm(self.train_data - self.test_data[i],
                                  axis=1,
                                  ord=2)

            # find k smallest distance from train. This outputs a list of (index, distance)
            smallest_k_distances_index_pair = hq.nsmallest(self.k,
                                                           enumerate(dist),
                                                           key=lambda d: d[1])

            # extract the labels
            nearest_labels = [
                self.train_label[pair[0]]
                for pair in smallest_k_distances_index_pair
            ]
            majority_label = max(set(nearest_labels), key=nearest_labels.count)
            predicted_labels[i] = majority_label
            print majority_label
            # populate frequency table
            freq = itemfreq(nearest_labels)
            self.label_frequency_table[i, :][freq[:, 0].astype(
                dtype=int)] = freq[:, 1]
        return predicted_labels
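
For integer class labels, the per-query frequency row can also be tallied with np.bincount; a small sketch under that assumption:

import numpy as np

nearest_labels = [2, 0, 2, 2, 1]                    # hypothetical labels of the k nearest neighbours
freq_row = np.bincount(nearest_labels, minlength=4)
print(freq_row)             # [1 1 3 0]
print(np.argmax(freq_row))  # majority label -> 2
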
Example #32
  def get_model_prediction(self, clf, where=None):
    with stopwatch("getting model predictions"):
      with pd.HDFStore(self.store_path, mode='a') as store:
        n_rows = len(store.select_as_coordinates(self.tables[0],
                                                 where=where)
                     )

        chunksize = n_rows // 1

        new_table_name = self.tables[0] + "_pred"
        if new_table_name in store.keys():
          store.remove(new_table_name)

        for chunk in store.select(self.tables[0],
                                  chunksize=chunksize,
                                  ):
          indexer = chunk.ANH
          y_pred = clf.predict(chunk.loc[indexer,self.feat_cols])
          chunk.loc[indexer, "predicted_label"] = \
            self.label_encoder.inverse_transform(y_pred)
          freq = stats.itemfreq(self.label_encoder.inverse_transform(y_pred))
          for f in freq:
            print(f[0], f[1])

          store.append(new_table_name,
                       chunk,
                       format='table',
                       append=False)
          return y_pred
Example #33
File: test.py  Project: ebVu/AI
def main_breeds(labels_raw, Nber_breeds, all_breeds='TRUE'):
    labels_freq_pd = itemfreq(
        labels_raw["breed"]
    )  # get frequency of each label, shape(120, 2) 120 label
    # argsort() returns the indices that would sort the array from low to high.
    # Example: df = [9, 3, 5, 3, 1] => df.argsort() = [4, 1, 3, 2, 0]
    # verify with df[df.argsort()]

    labels_freq_pd = labels_freq_pd[labels_freq_pd[:, 1].argsort()
                                    [::-1]]  # sort by frequency from high to low

    if all_breeds == 'FALSE':
        main_labels = labels_freq_pd[:, 0][
            0:Nber_breeds]  # get the first Nber_breeds items of the column (breed type)
    else:
        main_labels = labels_freq_pd[:, 0][:]

    labels_raw_np = labels_raw["breed"].as_matrix(
    )  # convert series of breed type to array, shape as 10222
    labels_raw_np = labels_raw_np.reshape(
        labels_raw_np.shape[0], 1)  # convert to array 2D shape (102222, 1)

    # np.where returns two arrays of equal length (922 here): one with indices into
    # labels_raw_np and one with indices into main_labels at which the values match
    labels_filtered_index = np.where(labels_raw_np == main_labels)

    return labels_filtered_index
Example #34
    def _predict(self, candidate_mask):
        """ Generate prediction vectors for the unlabelled candidates. """
        n_samples = len(self.pool[candidate_mask])
        n_classes = len(self.committee.classes_)
        avg_probs = np.zeros((n_samples, n_classes))
        prob_list = []
        class_freq = itemfreq(self.labels[~self.labels.mask])

        for member in self.committee.estimators_:
            member_prob = member.predict_proba(self.pool[candidate_mask])
            member_n_classes = member_prob.shape[1]

            if n_classes == member_n_classes:
                avg_probs += member_prob
                prob_list.append(member_prob)

            else:
                member_classes = class_freq[:,1].argsort()[::-1]
                member_classes = member_classes[:member_n_classes]
                full_member_prob = np.zeros((n_samples, n_classes))
                full_member_prob[:, member_classes] += member_prob[:, range(member_n_classes)]
                avg_probs += full_member_prob
                prob_list.append(full_member_prob)

        # average out the probabilities
        avg_probs /= len(self.committee.estimators_)

        return (avg_probs, prob_list)
Example #35
    def CombinedMeanShift(self, h, alpha,
                          PrincComp=None,
                          njobs=-2,
                          mbf=1):
        """Performs the scikit-learn Mean Shift clustering.

        Arguments:

        h -- the bandwidth
        alpha -- the weight of the principal components as compared
        to the spatial data.
        PrincComp -- used to pass already-computed principal components
        njobs -- the number of processes to be used (default: n. of CPU - 1)
        mbf -- the minimum number of items in a seed"""

        MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                       min_bin_freq=mbf, n_jobs=njobs)
        if PrincComp is None:
            PrincComp = self.ShapePCA(2)
        print("Starting sklearn Mean Shift... ")
        stdout.flush()
        fourvector = np.vstack((self.__data, alpha * PrincComp))
        MS.fit_predict(fourvector.T)
        self.__ClusterID = MS.labels_
        self.__c = MS.cluster_centers_.T
        self.__clsizes = itemfreq(self.__ClusterID)[:, 1]
        print("done.")
        stdout.flush()
def replace_lower_by_higher_prob(s,p0=0.3):
    # input: s: 1D numpy array; threshold p0
    # output: s in which elements having p < p0 are replaced by elements with p > p0, sampled according to their probabilities
    
    f = itemfreq(s)
    # element and number of occurence
    a,p = f[:,0],f[:,1].astype(float)

    # probabilities    
    p /= float(p.sum())

    # find elements having p > p0:
    iapmax = np.argwhere(p>p0).reshape((-1,))  # position
                        
    apmax = a[iapmax].reshape((-1,))           # name of aminoacid
    pmax = p[iapmax].reshape((-1,))            # probability
            
    # find elements having p < p0
    apmin = a[np.argwhere(p < p0)].reshape((-1,))

    if apmin.shape[0] > 0:
        for a in apmin:
            ia = np.argwhere(s==a).reshape((-1,))
            for iia in ia:
                s[iia] = value_with_prob(apmax,pmax)
            
    return s
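
A toy run of the frequency step (hypothetical data, with np.unique standing in for itemfreq) to show which elements end up as replacement candidates and which get replaced:

import numpy as np

s = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3])   # hypothetical sequence
values, counts = np.unique(s, return_counts=True)
p = counts / float(counts.sum())               # [0.4, 0.4, 0.2]
p0 = 0.3
print(values[p > p0])   # kept as replacement candidates -> [1 2]
print(values[p < p0])   # to be replaced by sampling from the candidates -> [3]
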
Example #37
    def _predict(self, candidate_mask):
        """ Generate prediction vectors for the unlabelled candidates. """
        n_samples = len(self.pool[candidate_mask])
        n_classes = len(self.committee.classes_)
        avg_probs = np.zeros((n_samples, n_classes))
        prob_list = []
        class_freq = itemfreq(self.labels[~self.labels.mask])

        for member in self.committee.estimators_:
            member_prob = member.predict_proba(self.pool[candidate_mask])
            member_n_classes = member_prob.shape[1]

            if n_classes == member_n_classes:
                avg_probs += member_prob
                prob_list.append(member_prob)

            else:
                member_classes = class_freq[:, 1].argsort()[::-1]
                member_classes = member_classes[:member_n_classes]
                full_member_prob = np.zeros((n_samples, n_classes))
                full_member_prob[:, member_classes] += member_prob[:, range(member_n_classes)]
                avg_probs += full_member_prob
                prob_list.append(full_member_prob)

        # average out the probabilities
        avg_probs /= len(self.committee.estimators_)

        return (avg_probs, prob_list)
def export_corpus(corpus, outfolder, context_type='document'):
    """
    Converts a vsm.corpus.Corpus object into a lda-c compatible data file.
    Creates two files:
    1.  "vocab.txt" - contains the integer-word mappings
    2.  "corpus.dat" - contains the corpus object in the format described in 
        [lda-c documentation](http://www.cs.princeton.edu/~blei/lda-c/readme.txt):

            Under LDA, the words of each document are assumed exchangeable.  Thus,
            each document is succinctly represented as a sparse vector of word
            counts. The data is a file where each line is of the form:
        
                [M] [term_1]:[count] [term_2]:[count] ...  [term_N]:[count]
        
            where [M] is the number of unique terms in the document, and the
            [count] associated with each term is how many times that term appeared
            in the document.  Note that [term_1] is an integer which indexes the
            term; it is not a string.

    :param corpus: VSM Corpus object to convert to lda-c file
    :type corpus: vsm.corpus.Corpus

    :param outfolder: Directory to output "vocab.txt" and "corpus.dat"
    :type string: path
    """
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    vocabfilename = os.path.join(outfolder, 'vocab.txt')
    with codecs.open(vocabfilename,'w','utf8') as vocabfile:
        for word in corpus.words:
            vocabfile.write(word+'\n')

    corpusfilename = os.path.join(outfolder, 'corpus.dat')
    corpusitemnames = os.path.join(outfolder,'names.dat')

    #print "METADATA",len(corpus.view_metadata(context_type))
    #print len(corpus.view_contexts(context_type))
    #vw_ctx = corpus.view_contexts(context_type)
    #vw_mtd = corpus.view_metadata(context_type)
    #for i,item in enumerate(vw_mtd):
    #    if i < 1: 
    #        print vw_mtd[i][1],vw_mtd[i][0],len(vw_ctx[i])
    #    else:
    #        print vw_mtd[i][1], vw_mtd[i][0]-vw_mtd[i-1][0],len(vw_ctx[i])


    #vw_mtd = corpus.view_metadata(context_type)
    #names_file = open(corpusitemnames,'w')
    with open(corpusfilename,'w') as corpusfile:
        for i,ctx in enumerate(corpus.view_contexts(context_type)):
            M = len(np.unique(ctx))
            corpusfile.write("{0}".format(M))
            #names_file.write("{0} {1}\n".format(vw_mtd[i][1],vw_mtd[i][0]))

            for token in itemfreq(ctx):
                corpusfile.write(" {term}:{count}".format(
                    term=token[0],count=token[1]))

            corpusfile.write("\n")
Example #39
def p_list(connpnts, p):
    plist = []
    freq = itemfreq(connpnts)
    for i in range(freq.shape[0]):
        if freq[i][1] == p:
            plist.append(i)
    return plist
Example #40
    def generateHist(self):
        temp = []
        for x in self.unfilledp_percent:
            temp.append(hist(x))
 
        up = np.array(temp)
        print "unfilled:"
        print itemfreq(up)

        ftemp = []
        for x in self.filledp_percent:
           ftemp.append(hist(x))

        fup = np.array(ftemp)
        print "filled:"
        print itemfreq(fup)
Example #41
    def _unscheduled_penalty(self):
        """
        Each course has a predetermined number of lectures that must
        be given. As many of these lectures as possible must be scheduled.
        Each course that has a lecture which is not scheduled gives
        a penalty of 10 points.

        A whole individual is checked in order to calculate the penalty
        """

        individual = self.schedule
        penalty = 10
        value = 0

        occurrences_scheduled = dict(
            (entry[0], entry[1]) for entry in itemfreq(individual.flatten()))
        occurrences_desired = dict(
            (int(course[1:]), info["number_of_lectures"])
            for (course, info) in self.data["courses"].iteritems())

        for key in occurrences_desired:
            value += occurrences_desired[key]
            if key in occurrences_scheduled:
                value -= occurrences_scheduled[key]

        return value * penalty
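
A worked toy example of the rule (assumed numbers): if course 1 needs 3 lectures and course 2 needs 2, but the schedule only contains two lectures of course 1 and one of course 2, then 2 lectures are missing and the penalty is 2 * 10 = 20.

import numpy as np

schedule = np.array([1, 1, 2, 0, 0, 0])   # hypothetical flattened schedule, 0 = empty slot
desired = {1: 3, 2: 2}                    # lectures each course must receive

values, counts = np.unique(schedule, return_counts=True)
scheduled = dict(zip(values.tolist(), counts.tolist()))

missing = sum(n - scheduled.get(c, 0) for c, n in desired.items())
print(missing * 10)  # 20
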
def compare_position_booking(booked, clicked):
    arr_position = np.load("data_numpy/train_position.npy")
    for position in itemfreq(arr_position)[:,0]:
        booked_subset = booked[ arr_position == position ]
        num_of_booked = np.sum(booked_subset)
        num_of_instances = len(booked_subset)
        print position, num_of_booked, num_of_instances, num_of_booked/float(num_of_instances)*100.0
Example #43
def lbp(im_gray):

    """Returns LBP histogram of an image"""
    
    global SIZE, WINDOW, UNIFORM_PATTERNS, BASE
    im_gray = cv2.resize(im_gray, (SIZE, SIZE))
    lbp_hist = np.array([])
    for i1 in range(0,SIZE,WINDOW):
        for j1 in range(0,SIZE,WINDOW):
            box = im_gray[j1:j1+WINDOW, i1:i1+WINDOW]
#            figure()
#            imshow(box)
#            gray()
#            show()
            lbp = my_lbp(box)
#            print lbp.shape
            lbp = lbp.ravel()
            map_array = np.zeros((lbp.shape[0] + 1))
            i = 0
            for x in np.nditer(lbp):
                try:
                    map_array[i] = np.where(UNIFORM_PATTERNS==x)[0][0]            
                except:
                    map_array[i] = 58
                i += 1
            map_array = np.concatenate((map_array, BASE))
            x = itemfreq(map_array)
            hist = np.array(x[:,1] - 1).astype('int')
        #    print x
        #    print type(x)
        #    print sum(hist)
#            print hist
            lbp_hist = np.concatenate((lbp_hist, hist))
    return lbp_hist
Example #44
def out_put_pixel():
    im = Image.open('33/beer2.png')
    im_data = np.array(list(im.getdata()))
    #print(list(im.getdata()))
    #print(len(list(im.getdata())))
    #print(im_data) #[ 1 43  7 ... 19  1  7]
    #print(im_data.shape) #(19044,)
    #print(im.getpixel((0,0)))
    #print(im.getpixel((0,1)))

    im_data_stat = itemfreq(im_data)
    #pprint(im_data_stat)
    #pprint(im_data_stat[:, 1])
    #print(im_data_stat.shape)
    #pprint([i for i in np.cumsum(im_data_stat[:, 1])])
    pprint([np.sqrt(i) for i in np.cumsum(im_data_stat[:, 1])])

    for i in range(im_data_stat.shape[0] - 1, 0, -2):
        newIm_data = im_data[np.where(im_data <= im_data_stat[i, 0])]
        idx_0 = np.where(newIm_data == newIm_data.max())
        idx_1 = np.where(newIm_data != newIm_data.max())
        newIm_data[idx_0] = 0
        newIm_data[idx_1] = 1
        size = int(np.sqrt(len(newIm_data)))
        newIm = Image.new('1', (size, size))
        newIm.putdata(newIm_data)
        newIm.save('33/%i.png' % i)
Example #45
    def evaluate(self, X):
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        flatX = []
        for x in X:
            for xi in x:
                flatX.append(xi)
        flatX = np.array(flatX)
        counts = flatX.size
        items = np.unique(flatX)
        itemfreqs = itemfreq(flatX)

        freqitems = itemfreqs[itemfreqs[:, 1] >= counts * self.min_support][:,
                                                                            0]
        freqs = list(freqitems)  # a plain list, so new frequent itemsets can be appended below

        itemnum = 1
        while (itemnum <= len(freqitems)):
            candidates = self._get_candidates(freqs, freqitems, itemnum)
            itemnum += 1
            if len(candidates) == 0:
                break
            for candidate in candidates:
                count = 0
                for x in X:
                    idx = 0
                    for xi in x:
                        if xi == candidate[idx]:
                            idx += 1

                    if idx == itemnum:
                        count += 1
                if count >= counts * self.min_support:
                    freqs.append(candidate)
        print freqs
def get_x_train(x_raw_train_input, y_val):
    x_train = np.empty([0,BINS+26+BINS], int)
    y_train = np.empty([0], str)
    
    for x_raw_train in x_raw_train_input[0:SAMPLES]:
        # Build grayscale hist feature data
        x_train_gray_scale = x_raw_train[0] 
        #hist2 = plt.hist(x, bins=BINS)
        hist_gs = np.histogram(x_train_gray_scale, bins=BINS)
        x_reduced = hist_gs[0].reshape(1,-1)

        # Build LBP hist feature data
        lbp = x_raw_train[1]
        x = itemfreq(lbp.ravel())
        hist_lbp = x[:, 1]/sum(x[:, 1])
        x_reduced = np.append(x_reduced, hist_lbp.reshape(1, hist_lbp.shape[0]))[np.newaxis]
    
        # Build HSV hist feature data
        x_train_hsv = x_raw_train[2]
        hist_hsv = np.histogram(x_train_hsv[:,2].ravel(), bins=BINS)
        x_reduced = np.append(x_reduced, hist_hsv[0].reshape(1,-1))[np.newaxis]
    
        x_train = np.append(x_train, x_reduced, axis=0)
        y_train = np.append(y_train, y_val)

    return x_train, y_train
Example #47
def call_freq(tree, name_only = False):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made. 
      (in other words, it returns a dictionary indicating what the first and 
      last system calls made by an executable were.)
    """
    callz = []
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            callz.append(el.tag)

    # finally, count the frequencies
    freqList = stats.itemfreq(callz)

    if name_only == True:
        c = set(callz)
    else:
        c = Counter()
        for item in freqList:
            c["sys_call-" + item[0]] = int(item[1])

    return c
def plotGaussian(X, y, obj, featureNames):
	"""Plot Gausian fit on top of X.
	"""
	save_path = '../MSPrediction-Python/plots/'+obj+'/'+'BayesGaussian2'
	clf = classifiers["BayesGaussian2"]
	clf,_,_ = fitAlgo(clf, X,y, opt= True, param_dict = param_dist_dict["BayesGaussian2"])
	unique_y = np.unique(y)
	theta = clf.theta_
	sigma = clf.sigma_
	class_prior = clf.class_prior_
	norm_func = lambda x, sigma, theta: 1 if np.isnan(x) else -0.5 * np.log(2 * np.pi*sigma) - 0.5 * ((x - theta)**2/sigma) 
	norm_func = np.vectorize(norm_func)
	n_samples = X.shape[0]
	for j in range(X.shape[1]):
		fcol = X[:,j]
		jfeature = featureNames[j]
		jpath = save_path +'_'+jfeature+'.pdf'
		fig = pl.figure(figsize=(8,6),dpi=150)
		for i, y_i in enumerate(unique_y):
			fcoli = fcol[y == y_i]
			itfreq = itemfreq(fcoli)
			uniqueVars = itfreq[:,0]
			freq = itfreq[:,1]
			freq = freq/sum(freq)
			the = theta[i, j]
			sig = sigma[i,j]
			pred = np.exp(norm_func(uniqueVars, sig, the))
			pl.plot(uniqueVars, pred, label= str(y_i)+'_model')
			pl.plot(uniqueVars, freq, label= str(y_i) +'_true')
		pl.xlabel(jfeature)
		pl.ylabel("density")
		pl.legend(loc='best')
		pl.tight_layout()
		# pl.show()
		fig.savefig(jpath)
Example #49
def mating_selection(population, Range, n):
    """
    Mating selection in RSEA
    :param population: current population
    :param n: number of selected individuals
    :param Range: the range of the objective vectors
    :return: next generation population
    """
    pop_obj = population[1]
    N = np.shape(pop_obj)[0]
    pop_obj = (pop_obj - np.tile(Range[0], (N, 1))) / \
              np.tile(Range[1] - Range[0], (N, 1))
    con = np.sqrt(np.sum(pop_obj**2, axis=1))
    site, _ = radar_grid(pop_obj, np.ceil(np.sqrt(N)))
    crowd_g = itemfreq(site)[:, 1]

    mating_pool = np.zeros(np.ceil(N / 2).astype(int) * 2)
    grids = tournament(2, len(mating_pool), crowd_g.reshape((crowd_g.size, 1)))
    for i in range(len(mating_pool)):
        current = np.nonzero(site == grids[i])[0]
        if current.size == 0:  # no individual falls into this grid cell
            mating_pool[i] = np.random.randint(0, N, 1)
        else:
            parents = current[np.random.randint(0, len(current), 4)]
            best = np.argmin(con[parents])
            mating_pool[i] = parents[best]
    return mating_pool.astype(int)
Example #50
                 def dP(dml,m0,m0_min,m0_max):
                     ProbL=[] 
                     m0L=[]     
                     j=0
                     for m0 in np.arange( m0_min, m0_max+0.10,0.10):
 
                         m0 = round(m0,2)
                         dml = np.array(dml)
                         data1f = np.array(dml[dml<m0])
                         xf,list1f = ecdf(data1f,norm=False)
                         
                         # using the scipy.stats itemfreq function:
                         freq = itemfreq(xf)
                         # take the number of occurrences (itemfreq returns two columns:
                         # column 1 contains the sorted, unique values from the data,
                         # column 2 contains their respective counts):
                         counts1 = freq[:,1]
                         prob1 = np.sum(counts1/( len(dml) ) )
                         probt = [ prob1 for x in [j] ]
                         dplist = [(probt)]
                         ProbL.extend(dplist)
                         
                         mt = [ m0 for x in [j] ]
                         m0list = [(mt)]
                         m0L.extend(m0list)
                         j+=1   
                         
                     if (INIT==1):
                         global kk
                         kk=1
                         cdfs(data1f)
                         
                     return m0L,ProbL
Example #51
def plot_histogram(times, time_range, t_step=1.):
    """
    For visual comparison to Poisson distribution
    """

    counts = count_events(times, time_range, t_step=t_step)[0]
    observed = np.array(stats.itemfreq(counts))
    bins = np.arange(np.ceil(1.5 * observed[-1, 0]))
    ideal = len(times) / counts.mean() * stats.poisson.pmf(bins, counts.mean())

    color_cycle = plt.gca()._get_lines.color_cycle
    plt.vlines(bins,
               0,
               ideal,
               label='Poisson',
               color=next(color_cycle),
               lw=12,
               alpha=0.3)
    plt.vlines(observed[:, 0],
               0,
               observed[:, 1],
               label='observed',
               color=next(color_cycle),
               lw=4)
    plt.xlim(bins[0], bins[-1])
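plot_histogram depends on an external count_events helper. Below is a self-contained sketch of the same visual check, with simulated Poisson arrivals and per-interval counts compared against the fitted Poisson pmf; all names in it are our own and not part of the original project.

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
rate, t_max = 3.0, 1000.0
# Poisson arrivals: cumulative sums of exponential inter-arrival times.
times = np.cumsum(rng.exponential(1.0 / rate, size=int(rate * t_max * 1.2)))
times = times[times < t_max]

# Events per unit interval, then the observed frequency of each count value.
counts, _ = np.histogram(times, bins=np.arange(0.0, t_max + 1.0, 1.0))
values, freqs = np.unique(counts, return_counts=True)
bins = np.arange(values.max() + 2)
ideal = len(counts) * stats.poisson.pmf(bins, counts.mean())

plt.vlines(bins, 0, ideal, lw=12, alpha=0.3, label='Poisson')
plt.vlines(values, 0, freqs, lw=4, label='observed')
plt.legend()
plt.show()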
def silencePerClass(y_pred, y_GT, classesDict, silenceClassNum):
    """
    Calculate and print percentage of silence for each class

    """
    silenceCountPerClass = np.zeros(len(classesDict) // 2)  # classesDict is assumed to map both index->name and name->index, hence the division by 2
    
    for i in range(y_pred.shape[0]):
        if y_pred[i] == silenceClassNum:
            # increment each class that was the ground truth for that point: 
            for j in range(y_GT.shape[1]):
                if int(y_GT[i,j]) != -1:
                    # ignore invalid (-1) class entries
                    silenceCountPerClass[int(y_GT[i,j])] += 1
    
    gtFreq = itemfreq(y_GT.flat).astype(int)

    for i in range(silenceCountPerClass.shape[0]):
        if i in gtFreq[:,0]:
            silencePercentage = round(silenceCountPerClass[i] / float(
            gtFreq[np.where(gtFreq[:,0] == i)[0][0], 1]) * 100, 2)
            print("Class " + classesDict[i] + " contains " + str(silencePercentage) + 
            "% silence")
    
    print("-----")
Exemple #53
0
 def GetFreqsAttns(self, freqTuningHisto):  # Frequency Tuning Curve method
     """ Helper method for ShowSTH() to organize the frequencies in ascending order separated for each intensity.
     :param freqTuningHisto: dict of pandas.DataFrames with spike data
     :type freqTuningHisto: dict
     :returns: ordered list of frequencies (DataFrame keys())
               numpy array of frequencies
               numpy array of intensities
     """
     freqs = np.array([])
     attns = np.array([])
     for histoKey in list(freqTuningHisto):
         if histoKey != 'None_None':
             freq = histoKey.split('_')[0]
             freqs = np.hstack([freqs, float(freq) / 1000])
             attn = histoKey.split('_')[1]
             attns = np.hstack([attns, float(attn)])
     attnCount = stats.itemfreq(attns)
     freqs = np.unique(freqs)
     attns = np.unique(attns)
     if np.max(attnCount[:, 1]) != np.min(attnCount[:, 1]):
         abortedAttnIdx = np.where(
             attnCount[:, 1] != np.max(attnCount[:, 1]))
         attns = np.delete(attns, abortedAttnIdx)
     orderedKeys = []
     for attn in attns:
         freqList = []
         for freq in freqs:
             key = str(int(freq * 1000)) + '_' + str(int(attn))
             freqList.append(key)
         orderedKeys.append(freqList)
     return orderedKeys, freqs, attns
def removeInvalids(y_pred, y_GT, silenceClassNum):
    """
    Remove all points from the prediction and the ground truth arrays where
    no ground truth was provided or where the point was predicted as silent.
    
    @return: y_pred, y_GT: the updated arrays
    """
    # Positions where no silence predicted:
    maskValid = (y_pred != silenceClassNum)

    y_GT = y_GT[maskValid]
    y_pred = y_pred[maskValid]

    # We also ignore those samples where no ground truth was provided,
    # i.e. we delete those entries from the GT and the prediction array:
    invalidRow = np.array([-1,-1,-1,-1,-1]) # length has to match the maximum number of allowed GT labels per point
    maskValid = ~np.all(y_GT==invalidRow,axis=1)

    y_GT = y_GT[maskValid]
    y_pred = y_pred[maskValid]

    # Print for how many points no ground truth was provided:
    noGtFreq = itemfreq(maskValid).astype(int)
    validCount = noGtFreq[np.where(noGtFreq[:,0] == 1)[0][0], 1]
    totalCount = sum(noGtFreq[:,1])
    percentValid = round(validCount/float(totalCount) * 100, 2)

    print("GT was provided for " + str(percentValid) + "% of all (non-silent) samples")
    print("-----")

    return y_pred, y_GT
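Because maskValid is a boolean array, the bookkeeping at the end does not need itemfreq at all; np.count_nonzero gives the same percentage and also avoids the IndexError the itemfreq lookup raises when no valid samples remain. A minimal sketch:

import numpy as np

def valid_percentage(maskValid):
    # Percentage of (non-silent) samples for which ground truth was provided.
    maskValid = np.asarray(maskValid, dtype=bool)
    if maskValid.size == 0:
        return 0.0
    return round(np.count_nonzero(maskValid) / float(maskValid.size) * 100, 2)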
Exemple #55
0
def search_dimer(oneframe, crit):
    #pick random residue
    nmon, natoms, topology = oneframe.n_residues, oneframe.n_atoms, oneframe.topology
    m2 = -1
    #if first guess fails, need to seek another. So make as while loop
    while True:
        m1 = int(random.random() * nmon)
        atomids = topology.select('resid ' + str(m1))
        search = [x for x in range(natoms) if x not in atomids]
        neigh = md.compute_neighbors(oneframe,
                                     crit[0],
                                     atomids,
                                     haystack_indices=search,
                                     periodic=False)
        resilist = []
        for x in neigh[0]:
            resilist.append(topology.atom(x).residue.index)
        freq = itemfreq(resilist)
        for f in freq:
            if f[1] >= crit[1]:
                m2 = f[0]
                break
        if m2 != -1: break

    #print(m1,m2)
    dimer = oneframe.atom_slice(
        topology.select('resid ' + str(m1) + ' or resid ' + str(m2)))
    atomids = topology.select('resid ' + str(m1))
    #dimer=dimer.image_molecules(inplace=False,anchor_molecules=atomids)
    return dimer
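The contact-counting step in search_dimer (itemfreq over the residue indices of the neighbouring atoms, then thresholding with crit[1]) can also be written with collections.Counter. The sketch below assumes the same mdtraj topology object; unlike the loop above, it returns the most-contacted residue rather than the lowest-numbered one that passes the threshold.

from collections import Counter

def find_contact_residue(neigh_atoms, topology, min_contacts):
    # Count neighbouring atoms per residue and return the residue index with
    # the most contacts, provided it has at least min_contacts atoms; else -1.
    contacts = Counter(topology.atom(int(x)).residue.index for x in neigh_atoms)
    if not contacts:
        return -1
    resid, n = contacts.most_common(1)[0]
    return resid if n >= min_contacts else -1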
Exemple #56
0
    def getROIColors(self):
        def camelToUnderscore(word):
            new = ""
            for l in word:
                if l.isupper(): new += " %s" % l.lower()
                else: new += l
            return new

        arr = np.float32(self.ROI)
        pixels = arr.reshape((-1, 3))

        n_colors = 3
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200,
                    .1)
        flags = cv2.KMEANS_RANDOM_CENTERS
        _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10,
                                          flags)

        palette = np.uint8(centroids)
        quantized = palette[labels.flatten()]
        quantized = quantized.reshape(self.ROI.shape)
        dominant_color = palette[np.argmax(stats.itemfreq(labels)[:, -1])]
        r, g, b = dominant_color[2], dominant_color[1], dominant_color[0]
        dom_col = camelToUnderscore(
            colors.ColorNames.findNearestColorName(
                r, g, b, colors.ColorNames.WebColorMap))
        cr, cg, cb = colors.ColorNames.complement(r, g, b)
        com_col = camelToUnderscore(
            colors.ColorNames.findNearestColorName(
                cr, cg, cb, colors.ColorNames.WebColorMap))
        self.colors.add(dom_col)
        self.colors.add(com_col)
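For the dominant-colour lookup, np.bincount over the k-means labels is a scipy-free alternative to itemfreq. It also indexes counts by label value directly, whereas argmax over the itemfreq rows returns a row position, which only coincides with the label when every cluster is non-empty. A minimal sketch:

import numpy as np

def dominant_palette_entry(palette, labels):
    # palette: (n_colors, 3) array built from the cv2.kmeans centroids;
    # labels: per-pixel cluster label array, also from cv2.kmeans.
    counts = np.bincount(np.asarray(labels).flatten())
    return palette[np.argmax(counts)]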
 def GetFreqsAttns(self, freqTuningHisto):  # Frequency Tuning Curve method
     """ Helper method for ShowSTH() to organize the frequencies in ascending order separated for each intensity.
     :param freqTuningHisto: dict of pandas.DataFrames with spike data
     :type freqTuningHisto: str
     :returns: ordered list of frequencies (DataFrame keys())
               numpy array of frequencies
               numpy array of intensities
     """
     freqs = np.array([])
     attns = np.array([])
     for histoKey in list(freqTuningHisto):
         if histoKey != 'None_None':
             freq = histoKey.split('_')[0]
             freqs = np.hstack([freqs, float(freq) / 1000])
             attn = histoKey.split('_')[1]
             attns = np.hstack([attns, float(attn)])
     attnCount = stats.itemfreq(attns)
     freqs = np.unique(freqs)
     attns = np.unique(attns)
     if np.max(attnCount[:, 1]) != np.min(attnCount[:, 1]):
         abortedAttnIdx = np.where(attnCount[:, 1] != np.max(attnCount[:, 1]))
         attns = np.delete(attns, abortedAttnIdx)
     orderedKeys = []
     for attn in attns:
         freqList = []
         for freq in freqs:
             key = str(int(freq * 1000)) + '_' + str(int(attn))
             freqList.append(key)
         orderedKeys.append(freqList)
     return orderedKeys, freqs, attns
def solid_coord_fulldomain(id_field_file):
    # 'id_field_file' is the file name of the full ID field in *nc format
    # NOTE: This input *nc file is not the raw CT file (which has 1->solid; 0->fluid)
    # Assume the fulldomain is raw, i.e. no postprocessed reservoir layers or porous plate
    # Assume the node type convention for LBPM-WIA simulations, which says
    #        id = 0 -> solid phase
    #        id = 1 -> non-wetting phase
    #        id = 2 -> wetting phase
    # -------------------------------------
    print "**Info: Load the image file: "+id_field_file
    domain = read_NetCDF_file_py(id_field_file,'segmented')
    print "**Info: Start analysing the solid coordinate number ......"
    domain = np.logical_not(domain) # Now 1 -> solid nodes; 0 -> fluid nodes
    domain = domain.astype(np.int8)
    
    # Define the D3Q19 lattice
    cx=np.array([0, 1, -1, 0,  0, 0,  0, 1, -1,  1, -1, 1, -1,  1, -1, 0,  0,  0, 0])
    cy=np.array([0, 0,  0, 1, -1, 0,  0, 1, -1, -1,  1, 0,  0,  0,  0, 1, -1,  1,-1])
    cz=np.array([0, 0,  0, 0,  0, 1, -1, 0,  0,  0,  0, 1, -1, -1,  1, 1, -1, -1, 1])
    
    z_axis = 2
    y_axis = 1
    x_axis = 0
    domain_temp = np.zeros_like(domain)
    for idx in np.arange(1,19):
        domain_temp += np.roll(np.roll(np.roll(domain,cx[idx],axis=x_axis),cy[idx],axis=y_axis),cz[idx],axis=z_axis)
    #end for
    domain_temp = domain_temp[domain==0] # only extract the coordination number for pore space nodes
                                         # NOTE that we have 0 -> fluid nodes and 1 -> solid nodes in domain
    return stats.itemfreq(domain_temp)
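A small self-contained sanity check of the rolling-sum neighbour count used above, on a 3x3x3 toy domain with a single solid voxel in the centre (the final itemfreq call is replaced with np.unique here):

import numpy as np

# Each of the centre voxel's 18 D3Q19 neighbours should end up with a
# solid-coordination number of 1; the remaining 8 fluid voxels with 0.
solid = np.zeros((3, 3, 3), dtype=np.int8)
solid[1, 1, 1] = 1

cx = np.array([0, 1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0])
cy = np.array([0, 0, 0, 1, -1, 0, 0, 1, -1, -1, 1, 0, 0, 0, 0, 1, -1, 1, -1])
cz = np.array([0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, -1, -1, 1, 1, -1, -1, 1])

coord = np.zeros_like(solid)
for k in range(1, 19):
    coord += np.roll(np.roll(np.roll(solid, cx[k], axis=0), cy[k], axis=1), cz[k], axis=2)

values, counts = np.unique(coord[solid == 0], return_counts=True)
print(values, counts)   # expected: [0 1] [ 8 18]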
def plotMixNB(X, y, obj, featureNames, whichMix):
	"""Plot MixNB fit on top of X.
	"""
	save_path = '../MSPrediction-Python/plots/'+obj+'/'+whichMix
	clf = classifiers[whichMix]
	clf,_,_ = fitAlgo(clf, X,y, opt= True, param_dict = param_dist_dict[whichMix])
	unique_y = np.unique(y)
	# norm_func = lambda x, sigma, theta: 1 if np.isnan(x) else -0.5 * np.log(2 * np.pi*sigma) - 0.5 * ((x - theta)**2/sigma) 
	# norm_func = np.vectorize(norm_func)
	n_samples = X.shape[0]
	for j in range(X.shape[1]):
		fcol = X[:,j]
		optmodel = clf.optmodels[:,j]
		distname = clf.distnames[j]
		jfeature = featureNames[j]
		jpath = save_path +'_'+jfeature+'.pdf'
		fig = pl.figure(figsize=(8,6),dpi=150)
		for i, y_i in enumerate(unique_y):
			fcoli = fcol[y == y_i]
			itfreq = itemfreq(fcoli)
			uniqueVars = itfreq[:,0]
			freq = itfreq[:,1]
			freq = freq/sum(freq)
			pred = np.exp(optmodel[i](uniqueVars))
			# print pred
			# print pred
			pl.plot(uniqueVars, pred, label= str(y_i)+'_model')
			pl.plot(uniqueVars, freq, label= str(y_i) +'_true')
		pl.xlabel(jfeature)
		pl.ylabel("density")
		pl.title(distname)
		pl.legend(loc='best')
		pl.tight_layout()
		# pl.show()
		fig.savefig(jpath)
Exemple #60
0
 def getC(self):
     """
     Return C: the columns of X whose indices appear in idxC, each scaled by
     the square root of the number of times its index occurs.
     """
     idx = itemfreq(self.idxC)
     C = self.X[:, idx[:, 0]]
     return C * np.sqrt(idx[:, 1])
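getC selects the columns of X referenced in idxC (one copy per distinct index) and scales each by the square root of its multiplicity. An equivalent standalone sketch using np.unique, assuming idxC holds integer column indices:

import numpy as np

def get_scaled_columns(X, idxC):
    # Select each distinct column referenced in idxC and weight it by the
    # square root of how many times that index occurs.
    cols, counts = np.unique(np.asarray(idxC, dtype=int), return_counts=True)
    return X[:, cols] * np.sqrt(counts)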