Example #1
0
def cluster_outliers(data, genes, max_samples, min_dist=0.8, mining_id=1, as_json=True):
    """Stream genes whose values split into a small, well-separated cluster.

    For every gene, the non-missing values of its row in ``data`` are split
    into two groups with 2-means. The gene is reported when the smaller
    group is non-empty, holds at most ``max_samples`` percent of the columns
    of ``data``, and the gap between the two groups' maxima, divided by the
    gene's value range, lies strictly between ``min_dist`` and 1.

    Args:
        data: pandas DataFrame whose rows are addressed by gene identifier
            (presumably one column per sample -- confirm with the caller).
        genes: Indexable collection of row labels of ``data`` to analyse.
        max_samples: Upper bound on the size of the smaller cluster, given
            as a percentage of ``len(data.columns)``.
        min_dist: Lower bound (exclusive) on the normalised cluster gap.
        mining_id: Forwarded to ``pr.save_outliers`` with the found outliers.
        as_json: When true, yield text chunks (``{"outliers":[``, one JSON
            object followed by a comma per outlier, then ``]}``); otherwise
            yield one dict per outlier.

    Yields:
        Text chunks or dicts, as selected by ``as_json``.
    """
    estimator = cluster.KMeans(2)
    samples_from_perc = round(max_samples * len(data.columns) / 100)

    # Annotations are only fetched when the identifiers look like Ensembl
    # gene ids. They are indexed once instead of being scanned per outlier;
    # setdefault keeps the first record per id, like a first-match scan.
    info_by_id = {}
    if len(genes) > 0 and str(genes[0]).startswith("ENSG"):
        for gene in dr.get_dataset_ensembl_info():
            info_by_id.setdefault(gene.ensemblgeneid, gene)

    outliers_id = []
    if as_json:
        yield u"{\"outliers\":["

    for g in genes:
        # Only the numeric work is guarded; the yields below stay outside the
        # try block so that closing the generator early is never swallowed.
        try:
            values = data.loc[g, :].dropna()
            # The estimator needs a 2-D input, hence the one-column frame.
            estimator.fit(values.to_frame())
            labels = estimator.labels_

            candidates = itemfreq(labels)
            class_zero = candidates[0][1]
            class_one = candidates[1][1]  # IndexError if only one cluster exists
            support = min(class_one, class_zero)
            majority_label = int(class_one > class_zero)

            # Work on the Series: max() over a DataFrame would return a
            # column label instead of a value.
            majority_max = values[labels == majority_label].max()
            minority_max = values[labels == 1 - majority_label].max()
            dist = abs(majority_max - minority_max)
            ran = values.max() - values.min()
            ndist = dist / float(ran)
        except (KeyError, IndexError, ValueError, TypeError, ZeroDivisionError):
            # No variation, too few values or an unknown gene: clustering is
            # impossible, so there is no outlier to report for this gene.
            continue

        if not (0 < support <= samples_from_perc and min_dist < ndist < 1):
            continue

        # Genes without an annotation are still reported, with placeholders.
        info = info_by_id.get(g)
        formatted_info = {
            "identifier": g,
            "name": info.genename if info is not None else "Not available",
            "type": info.genetype if info is not None else "Not available",
            "samples": str(support),
            "distance": str(ndist),
            "range": str(ran),
        }
        outliers_id.append(formatted_info)
        print("outlier found :" + str(g))

        if as_json:
            # NOTE(review): every item carries a trailing comma, so the
            # concatenated stream ends in ",]}" -- kept as-is because
            # consumers may rely on the chunk format; confirm before changing.
            yield json.dumps(formatted_info) + u","
        else:
            yield formatted_info

    if outliers_id:
        pr.save_outliers(mining_id, outliers_id)

    # Always close the envelope opened above, and never emit it in dict mode.
    if as_json:
        yield u"]}"
Example #2
0
def mad_outliers(data, genes, threshold, percentile=95, as_json=True, mining_id=1):
    """Stream genes whose scaled deviations from the median are unusually high.

    Each gene's row of ``data`` is centred on its median and divided by the
    value returned by the ``mad`` helper (presumably the median absolute
    deviation -- it is defined outside this block). Rows for which that
    value is zero are skipped. A gene is reported when the ``percentile``-th
    percentile of the scaled values exceeds ``threshold``.

    Args:
        data: pandas DataFrame whose rows are addressed by gene identifier.
        genes: Iterable of row labels of ``data`` to analyse.
        threshold: Cut-off applied to the scaled values.
        percentile: Percentile of the scaled values compared to ``threshold``.
        as_json: When true, yield text chunks (``{"outliers":[``, one JSON
            object followed by a comma per outlier, then ``]}``); otherwise
            yield one dict per outlier.
        mining_id: Forwarded to ``pr.save_outliers`` with the outlier ids.

    Yields:
        Text chunks or dicts, as selected by ``as_json``. ``"samples"`` is
        the number of scaled values above ``threshold``.
    """
    # Index the annotations once instead of scanning them for every outlier;
    # setdefault keeps the first record per id, like a first-match scan.
    info_by_id = {}
    for gene in dr.get_dataset_ensembl_info():
        info_by_id.setdefault(gene.ensemblgeneid, gene)

    outliers_id = []
    if as_json:
        yield "{\"outliers\":["

    for g in genes:
        row_values = data.loc[g, :]
        row_mad = mad(row_values)
        if row_mad == 0.0:
            # Scaling is undefined when the deviation estimate is zero.
            continue

        filtered = (row_values - row_values.median()) / row_mad
        if not scoreatpercentile(filtered, percentile) > threshold:
            continue
        support = len(filtered[filtered > threshold])

        # Genes without an annotation are still reported, with placeholders.
        info = info_by_id.get(g)
        formatted_info = {
            "id": g,
            "name": info.genename if info is not None else "Not available",
            "type": info.genetype if info is not None else "Not available",
            "samples": str(support),
            "distance": "NA",
        }
        outliers_id.append(g)
        print("outlier found :" + str(g))

        if as_json:
            # NOTE(review): every item carries a trailing comma, so the
            # concatenated stream ends in ",]}" -- kept as-is because
            # consumers may rely on the chunk format; confirm before changing.
            yield json.dumps(formatted_info) + ","
        else:
            yield formatted_info

    if outliers_id:
        pr.save_outliers(mining_id, outliers_id)

    if as_json:
        yield "]}"
Example #3
0
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    """Stream genes for which DBSCAN isolates a small group of values.

    Each gene's row of ``data`` is clustered with DBSCAN. The two lowest
    labels are compared and the size of the smaller group is the gene's
    support; the gene is reported when
    ``min_samples < support <= max_samples``.

    Args:
        data: pandas DataFrame whose rows are addressed by gene identifier.
        genes: Iterable of row labels of ``data`` to analyse.
        eps: Passed to ``DBSCAN`` as ``eps``.
        min_samples: Passed to ``DBSCAN`` as ``min_samples`` and used as the
            exclusive lower bound on the support.
        max_samples: Inclusive upper bound on the support.
        as_json: When true, yield one JSON object followed by a comma per
            outlier; otherwise yield one dict per outlier.

    Yields:
        Text chunks or dicts, as selected by ``as_json``.
    """
    db = DBSCAN(eps=eps, min_samples=min_samples)

    # Index the annotations once instead of scanning them for every outlier;
    # setdefault keeps the first record per id, like a first-match scan.
    info_by_id = {}
    for gene in dr.get_dataset_ensembl_info():
        info_by_id.setdefault(gene.ensemblgeneid, gene)

    for g in genes:
        # One column, as many rows as the gene has values (no fixed sample
        # count is assumed).
        values = np.asarray(data.loc[g, :]).reshape(-1, 1)
        fit = db.fit(values)

        candidates = itemfreq(fit.labels_)
        if len(candidates) < 2:
            # A single label means nothing was separated from the rest.
            continue

        # NOTE(review): itemfreq rows are sorted by label, so the first row
        # may be DBSCAN's noise label (-1) rather than cluster 0, and any
        # label beyond the second is ignored -- confirm this is intended.
        support = min(candidates[0][1], candidates[1][1])
        if not min_samples < support <= max_samples:
            continue

        # Genes without an annotation are still reported, with placeholders.
        info = info_by_id.get(g)
        formatted_info = {
            "id": g,
            "name": info.genename if info is not None else "Not available",
            "type": info.genetype if info is not None else "Not available",
            "samples": str(support),
            "distance": "NA",
        }
        print("outlier found :" + str(g))

        if as_json:
            # NOTE(review): unlike the sibling detectors, no surrounding
            # '{"outliers":[' ... ']}' envelope is emitted here and nothing
            # is persisted; items keep their trailing comma. Confirm what the
            # consumer expects before changing the stream format.
            yield json.dumps(formatted_info) + ","
        else:
            yield formatted_info