Example #1
def plot_intersection(ins_dict, save_fig=False):
    """
    Visualize an UpSet plot displaying the number of unique subjects found
    simultaneously in each combination of instruments.

    Parameters
    ----------
    ins_dict: dict
        Mapping from instrument name to a table indexed by subject ID.
    save_fig: bool
        If True, save the figure as a PDF instead of showing it.
    """

    ins_names = list(ins_dict.keys())
    # All combinations of instruments, from single instruments up to all of them.
    list_comb = sum([
        list(map(list, combinations(ins_names, i + 1)))
        for i in range(len(ins_names))
    ], [])
    list_uniquesubj = []
    for lc in list_comb:
        list_uniquesubj.append([set(ins_dict[n].index) for n in lc])
    int_counts = list(map(_count_intersection, list_uniquesubj))
    inter_plot = from_memberships(list_comb, data=int_counts)
    plot(inter_plot,
         show_counts='%d',
         element_size=50,
         orientation='horizontal')
    if save_fig:
        plt.savefig(os.path.join(ut.out_folder, 'intersection_plot'),
                    format='pdf')
    else:
        plt.show()
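# Example #1 calls a helper `_count_intersection` that is not shown. A minimal
# sketch consistent with its use above (it receives a list of subject-ID sets
# and should return the size of their common intersection); hypothetical, not
# the original implementation:
def _count_intersection(sets):
    # Number of subjects present in every instrument's index.
    return len(set.intersection(*sets))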
Example #2
    def plot_species_intersections(self, color, ignore_counts=0, orientation='horizontal'):
        memberships = []
        data = []

        species_groups, _ = self.orthogroups_sets()

        for k in species_groups:
            memberships.append(k)
            data.append(len(set(species_groups[k])))

        structured_data = from_memberships(memberships, data=data)

        species_dict = {'P8084_finalAssembly': 'P. betacei',
                        'P_cactorum_10300': 'P. cactorum',
                        'P_infestans_RefSeq': 'P. infestans',
                        'P_palmivora_LILI_trCDS': 'P. palmivora',
                        'P_parasitica_INRA310': 'P. parasitica',
                        'P_ramorum_Pr102': 'P. ramorum',
                        'P_sojae_V3': 'P. sojae'}

        new_names = [species_dict[old_name] for old_name in structured_data.index.names]
        structured_data.index.names = new_names

        structured_data = structured_data[structured_data > ignore_counts].copy()
        p = plot(structured_data,
                 orientation=orientation,
                 show_counts=True,
                 facecolor=color,
                 element_size=40)

        return p
Example #3
def prepare_intersection_data(data: CardLiveData,
                              type_value: str) -> upsetplot.UpSet:
    """
    Prepare the CardLiveData to generate intersection plots, specifcally
    convert into an UpSet object containing all intersections and cardinalities
    :param data: a CardLiveData object from which the rgi_parser is called
    :param type_value: The category in RGI to plot set membersips for
    :return: An upsetplot.UpSet class containing the intersections and category
             memberships for creating a plotly based UpSet plot
    """

    totals_df = data.rgi_parser.get_column_values(data_type=type_value,
                                                  values_name='categories',
                                                  drop_duplicates=True)
    totals_df = totals_df.dropna()
    category_sets = totals_df.reset_index().groupby('filename')\
                                   .agg(lambda x: tuple(x)).applymap(list)
    category_sets = category_sets['categories']\
                    .apply(lambda x: sorted(x)).sort_values().apply(tuple)
    category_sets = category_sets.value_counts()

    # convert to upset data
    upset_data = upsetplot.from_memberships(category_sets.index,
                                            category_sets.values)
    upset_data = upsetplot.UpSet(upset_data, sort_by='cardinality')
    return upset_data
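# A hedged sketch of how the returned object might be consumed downstream,
# assuming (as in recent upsetplot versions) that the UpSet instance exposes
# the computed subset sizes on an `intersections` attribute:
#
#   upset_data = prepare_intersection_data(data, 'drug_class')
#   for membership, count in upset_data.intersections.items():
#       ...  # build one plotly bar per intersection
#
# 'drug_class' is a hypothetical type_value; any RGI category column works.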
Example #4
def create_plot(gnps_task, metadata_column, metadata_terms,
                intensity_threshold):
    data_df = _get_task_df(gnps_task)

    metadata_terms = set(metadata_terms)

    INTENSITY_THRESHOLD = float(intensity_threshold)

    data_df = data_df[data_df["featurearea"] > INTENSITY_THRESHOLD]

    membership = []
    grouped_df = data_df.groupby("featureid")
    for _, group_df in grouped_df:  # groupby yields (featureid, frame) pairs
        try:
            groups = set(group_df[metadata_column])
            groups = list(groups & metadata_terms)

            membership.append(groups)
        except Exception:
            print("ERROR")
            raise

    upset_data_df = from_memberships(membership)

    plotting_object = plot(upset_data_df,
                           subset_size="count",
                           sort_by="cardinality",
                           orientation="horizontal",
                           show_counts=True)

    uuid_save = str(uuid.uuid4())
    pyplot.savefig("./output/{}.svg".format(uuid_save))

    return [html.Img(src="/plot/{}".format(uuid_save))]
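# The returned Img points at "/plot/<uuid>", which implies a server route that
# maps the UUID back to the SVG saved under ./output/. A hypothetical
# Flask-style sketch (the `server` object and route are assumptions, not part
# of this example):
import flask

server = flask.Flask(__name__)  # in a Dash app this would be app.server

@server.route("/plot/<uuid_save>")
def serve_plot(uuid_save):
    # Serve the figure saved by create_plot back to the browser.
    return flask.send_file("./output/{}.svg".format(uuid_save))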
Example #5
def getLevels(R, L, k):
    n = 1
    while True:
        tempR = []
        tempL = []
        upsetD = []
        for i in range(len(R[n])):
            for j in range(len(R[1])):
                if (checkExists(
                        R[1][j],
                        R[n][i]) == False):  # Fix this to work with lists
                    intersectionTID = intersection(L[n][i], L[1][j])
                    if (len(intersectionTID) >= k):
                        if (n == 1):
                            tempR.append([R[n][i], R[1][j]])
                        else:
                            tempR.append(R[n][i] + [R[1][j]])
                        tempL.append(intersectionTID)
        if (len(tempR) == 0):
            return
        R.append(tempR)
        L.append(tempL)
        R[n + 1], L[n + 1] = checkDuplicates(R[n + 1], L[n + 1])
        for i in range(len(L[n + 1])):
            upsetD.append(len(L[n + 1][i]))
        print("\nLevel ", n + 1, "-->  Number of itemsets = ", len(R[n + 1]))
        #print(R[n+1])
        print("\n")
        upset = from_memberships(R[n + 1], data=upsetD)
        plot(upset)
        pyplot.show()
        n += 1
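# getLevels (and printList in Example #17) depend on helpers that are not
# shown. Minimal sketches inferred from how they are called; hypothetical, not
# the original implementations:
def checkExists(item, itemset):
    # True if `item` already occurs in the candidate itemset (a list).
    return item in itemset

def intersection(tids_a, tids_b):
    # Transaction IDs shared by two itemsets.
    return list(set(tids_a) & set(tids_b))

def checkDuplicates(itemsets, tid_lists):
    # Drop candidate itemsets that differ only in element order.
    seen, out_r, out_l = set(), [], []
    for items, tids in zip(itemsets, tid_lists):
        key = frozenset(items)
        if key not in seen:
            seen.add(key)
            out_r.append(items)
            out_l.append(tids)
    return out_r, out_l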
Example #6
def test_from_memberships_with_data(data, ndim):
    memberships = [[], ['hello'], ['world'], ['hello', 'world']]
    out = from_memberships(memberships, data=data)
    assert out is not data  # make sure frame is copied
    if hasattr(data, 'loc') and np.asarray(data).dtype.kind in 'ifb':
        # but not deepcopied when possible
        if LooseVersion(pd.__version__) > LooseVersion('0.35'):
            assert out.values.base is np.asarray(data).base
    if ndim == 1:
        assert isinstance(out, pd.Series)
    else:
        assert isinstance(out, pd.DataFrame)
    assert_frame_equal(
        pd.DataFrame(out).reset_index(drop=True),
        pd.DataFrame(data).reset_index(drop=True))
    no_data = from_memberships(memberships=memberships)
    assert_index_equal(out.index, no_data.index)

    with pytest.raises(ValueError, match='length'):
        from_memberships(memberships[:-1], data=data)
Example #7
    def run(self, output):
        dcount = 0
        dbstr = " ".join(self.dbs)

        if os.path.exists(output + ".raw.tab"):
            print("Starting from previous task")
            with open(output + ".raw.tab", 'r') as infile:
                for line in infile:
                    s = line.rstrip().split()
                    self.counter[s[0]] = int(s[1])
        else:
            with sp.Popen(f'{self.meryl} print venn {dbstr}',
                          shell=True,
                          stdout=sp.PIPE,
                          bufsize=1,
                          universal_newlines=True) as sf:
                for h in sf.stdout:
                    s = h.split()
                    self.counter[s[1]] += 1
                    dcount += 1
                    if dcount % 10000000 == 0:
                        print(f'Progress: {dcount}')

            # print out raw data
            with open(output + ".raw.tab", 'w') as out:
                for w in sorted(self.counter,
                                key=self.counter.get,
                                reverse=True):
                    out.write(f'{w}\t{self.counter[w]}\n')

            print("Created raw output file")

        # Prepare membership df
        array = list()
        data = list()
        for k, v in self.counter.items():
            tlist = list()
            for i, e in enumerate(self.dbs):
                if (int(k) & (1 << int(i))):
                    # The bit is set, add the file name
                    tlist.append(basename(e).split('.')[0])
            array.append(tlist)
            data.append(v)

        # Plot things out
        dataset = upsetplot.from_memberships(array, data=data)
        print(dataset)

        upset = upsetplot.UpSet(dataset,
                                sort_by='cardinality',
                                show_percentages=True)
        upset.plot()

        plt.savefig(output + ".pdf")
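# In the membership loop above, each meryl venn key is a bitmask over the
# input databases: bit i set means the k-mer occurs in self.dbs[i]. A tiny
# worked demo of that decoding (db names are hypothetical):
dbs = ['child.meryl', 'sire.meryl', 'dam.meryl']
k = 5  # binary 101 -> present in dbs[0] and dbs[2]
members = [name for i, name in enumerate(dbs) if int(k) & (1 << i)]
print(members)  # ['child.meryl', 'dam.meryl']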
Example #8
def plot_upset(ax):
    data = np.array([795., 27., 182., 7.])
    # plt.rcParams.update({'font.size': fontsize})
    example = from_memberships(
        [[' TP53 WT', ' MDM4 WT'], [' TP53 WT', ' MDM4 amp.'],
         [' TP53 mutant', ' MDM4 WT'], [' TP53 mutant', ' MDM4 amp.']],
        data=data)
    intersections, matrix, shading, totals = plot(example,
                                                  with_lines=True,
                                                  show_counts=True,
                                                  element_size=50)
    # `fontproperties` must be passed by keyword (and be a FontProperties
    # object defined elsewhere); passing it positionally fills the fontdict slot.
    plt.ylabel('Number of patients', fontproperties=fontproperties)
Example #9
def upset(index):
    selection = clusters[np.where(sets[:, index] > 0)]
    items, counts = np.unique(selection, return_counts=True)

    subset = from_memberships(items, data=counts)
    sub_classes = np.unique([item for sublist in items for item in sublist])

    print("Root Class: ", unique_clusters[index])
    print("# Papers: ", len(selection))
    print("# Labels: ", len(sub_classes))
    print("# Classes: ", len(items))
    if len(items) > 40 or len(sub_classes) > 20:
        print("Too many items")
    else:
        plot(subset)
Example #10
def test_from_contents_vs_memberships(data, typ, id_column):
    contents = OrderedDict([('cat1', typ(['aa', 'bb', 'cc'])),
                            ('cat2', typ(['cc', 'dd'])),
                            ('cat3', typ(['ee']))])
    # Note that ff is not present in contents
    data_df = pd.DataFrame(data, index=['aa', 'bb', 'cc', 'dd', 'ee', 'ff'])
    baseline = from_contents(contents, data=data_df, id_column=id_column)
    # compare from_contents to from_memberships
    expected = from_memberships(memberships=[{'cat1'}, {'cat1'},
                                             {'cat1', 'cat2'}, {'cat2'},
                                             {'cat3'}, []],
                                data=data_df)
    assert_series_equal(
        baseline[id_column].reset_index(drop=True),
        pd.Series(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], name=id_column))
    assert_frame_equal(baseline.drop([id_column], axis=1), expected)
Example #11
def run():
    fig = plt.figure(figsize=(8, 8))
    data = np.array([795., 27., 182., 7.])
    plt.rcParams.update({'font.size': 14})
    example = from_memberships(
        [
            [' TP53 WT', ' MDM4 WT'],
            [' TP53 WT', ' MDM4 amp.'],
            [' TP53 mutant', ' MDM4 WT'],
            [' TP53 mutant', ' MDM4 amp.']],
        data=data
    )
    intersections, matrix, shading, totals = plot(example, fig=fig, with_lines=True, show_counts=True, element_size=50)
    plt.ylabel('Number of patients', fontdict=dict(weight='bold', fontsize=16))

    filename = join(saving_dir, 'upset_MDM4_TP53.png')
    plt.savefig(filename)
Example #12
    def plot_clades_intersections(self, color):
        memberships = []
        data = []

        clades_groups, _ = self.orthogroups_sets_clades()

        for k in clades_groups:
            memberships.append(k)
            data.append(len(set(clades_groups[k])))

        structured_data = from_memberships(memberships, data=data)

        p = plot(structured_data,
                 orientation='vertical',
                 show_counts=True,
                 facecolor=color,
                 element_size=100)
        return p
Example #13
def load_venn_from_fredy(venn_file_name: str, base_name: str = 'geno'):
    """
    From a venn file generated by fredy, such as:
    #Venn:
    0000 111875
    1000 346
    0100 357
    1100 272
    0010 398
    1010 0
    0110 31
    1110 107
    0001 362
    1001 1
    0101 9
    1101 89
    0011 199
    1011 76
    0111 54
    1111 2057

    generate data usable by upsetplot:
    venn_data = from_memberships(
        [[],
        ['geno1'],
        ['geno2'],
        ['geno1', 'geno2'],
        ['geno3'],
        ['geno1', 'geno3'],
        ...
        ],
        data=[111875, 346, 357, 272, 398, 0, ...]
    )
    """
    with open(venn_file_name) as venn_file:
        members = []     # arrays of memberships
        abundances = []  # corresponding counts
        for line in venn_file.readlines():
            if line[0] == '#': continue
            sline = line.strip().split()
            members.append(membership_line_to_array(sline[0], base_name))
            abundances.append(int(sline[1]))
    return from_memberships(members, data=abundances)
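# The helper membership_line_to_array is not shown above. A minimal sketch
# matching the documented mapping ('1010' with base_name 'geno' ->
# ['geno1', 'geno3']); hypothetical, not the original implementation:
def membership_line_to_array(bits: str, base_name: str = 'geno'):
    # Bit i (left to right) set means membership in f'{base_name}{i+1}'.
    return [f'{base_name}{i + 1}' for i, b in enumerate(bits) if b == '1']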
Example #14
def plot_graph(res, path):
    """From upset plot data, draw the upset plots and store the corresponding data."""

    import os
    path_figures = f"{path}/figures"
    os.makedirs(path_figures, exist_ok=True)

    for typ_res, dic in res.items():
        liste_cats = sorted(dic.keys())
        data_out = []
        for cat in liste_cats:
            data_out.append(dic[cat])
        example = from_memberships(liste_cats, data=data_out)
        plot(example)
        pyplot.savefig(f"{path_figures}/{typ_res}.png")

    print(f"  figures stored in '{path_figures}/'")

    path_upset = f"{path}/data_upset.json"
    # Note: liste_cats/data_out hold the values from the last typ_res iteration.
    write_json_file(path_upset, [liste_cats, data_out])
    print(f"  output file in upset plot format stored in '{path_upset}'")
Example #15
def make_upsetplot(WD, name, data, title):
    """Make an UpSetPlot.
    Requires three other functions: similiraty_count(), get_clusters(), get_sub_clusters().

    ARGS:
        WD (str) -- the working directory to save the result in.
        name (str) -- name of the file to save.
        data (dict) -- the dictionary containing the organisms as keys
        and the genes/reactions/others to treat for the UpSetPlot.
        title (str) -- title of the graph.
    """

    clusters = get_clusters(list(data.keys()))
    for key in data.keys():
        clusters.insert(0, [key])  # prepend each organism as a singleton set
    count = []
    log = ""
    for c in clusters:
        others = list(data.keys())
        listInter = []
        for x in c:
            others.remove(x)
            listInter.append(set(data[x]))
        cluster_data, sim_count = similiraty_count(data, listInter, others)
        count.append(sim_count)
        for i in c:
            log += i + " "
        log += " (" + str(sim_count) + ") :\n"
        for i in cluster_data:
            log += utils.cobra_compatibility(str(i)) + "\n"
        log += "\n------\n\n"
    utils.write_file(WD, name + ".log", log)
    my_upsetplot = from_memberships(clusters, count)
    plot(my_upsetplot, show_counts='%d', totals_plot_elements=3)
    plt.suptitle(title)
    plt.savefig(WD + name + ".pdf")
    plt.show()
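# get_clusters and similiraty_count are external helpers. A hypothetical
# sketch of get_clusters consistent with its use above (every combination of
# two or more organisms); not the original implementation:
from itertools import combinations

def get_clusters(keys):
    # All combinations of at least two organisms, as lists.
    return [list(c)
            for r in range(2, len(keys) + 1)
            for c in combinations(keys, r)]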
Example #16
def generate_upset_plot(results, label):
	"""Cette fonction permet d'afficher le upset plot.
	:param: results: dictionnaire dans lequel les résultats des modèles sont stockés
	:param: label: string, nom de l'étiquette à laquelle on s'intéresse
	"""
	models_names = GetModelsNames(results)  # Get the names of every model in the study
	somme = {}  # Build all combinations of models
	for p in itertools.chain(*(itertools.combinations(models_names, size) for size in range(1, 4))):
		somme[p] = 0

	# For each combination, count how many instances belong to the set
	# described by that combination
	for comb in somme.keys():
		models_to_have = list(comb)
		for instance in results.keys():
			flag_ok = True
			for model in models_names:
				# If the current model is among the expected ones but its label is wrong, reject
				if model in models_to_have and results[instance][model] != label:
					flag_ok = False
				# If the current model is not among the expected ones but its label is right, reject
				if model not in models_to_have and results[instance][model] == label:
					flag_ok = False
			if flag_ok:
				somme[comb] += 1
	c, d = ([], [])
	for comb in somme.keys():  # Clean the model names and build the from_memberships inputs
		comb_net = []
		for cc in list(comb):
			comb_net.append(cc.split('/')[-1].replace('.txt', ''))
		c.append(comb_net)
		d.append(somme[comb])
	# Draw the upset plot
	diagram = from_memberships(c, data=d)
	plot(diagram)
	plt.show()
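# GetModelsNames is an external helper. A hypothetical sketch consistent with
# how results is indexed above (results[instance][model]); not the original:
def GetModelsNames(results):
	# Collect every model name appearing across all instances.
	names = set()
	for instance in results:
		names.update(results[instance].keys())
	return sorted(names)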
Example #17
def printList(R, L, T, k):
    tempR = []
    tempL = []
    upsetR = []
    upsetD = []
    for i in range(len(R[0])):
        if (len(L[0][i]) >= k):
            tempR.append(R[0][i])
            tempL.append(L[0][i])
            upsetR.append([R[0][i]])
            upsetD.append(len(L[0][i]))
    R.append(tempR)
    L.append(tempL)
    upset = from_memberships(upsetR, data=upsetD)
    print("For support threshold k =", k, "\n")
    print("Level 1  -->  Number of itemsets =", len(R[1]))
    #print(R[1])
    print("\n")

    plot(upset)
    pyplot.show()

    getLevels(R, L, k)
Example #18
# This snippet begins mid-script; the surrounding loop and accumulator are
# reconstructed from context (see the second loop below):
summary_peak_df_list = []
for i in list(range(len(peak_df_list))):
    peaks_i = peak_df_list[i]
    peaks_i['sorted_samples'] = ''
    rows_now = peaks_i.shape[0]
    for j in list(range(rows_now)):
        sample_list = peaks_i.at[j,'sample_reps']
        sample_array = np.unique(sample_list.split(','))
        sample_sorted = sorted(sample_array)
        sample_str = ",".join(sample_sorted)
        peaks_i.at[j,'sorted_samples'] = sample_str
    summary_peaks_i = peaks_i[['sorted_samples', 'count']].groupby(['sorted_samples'], as_index = False).sum()
    summary_peak_df_list.append(summary_peaks_i)

# construct data in appropriate format for upsetplot, and plot
for i in list(range(len(summary_peak_df_list))):
    df_i = summary_peak_df_list[i]
    # Get group name
    basename = os.path.basename(peak_file_list[i])
    group_name = basename.rsplit(".", -1)[0]
    file_name = group_name + ".consensus_peaks.pdf"
    categories = df_i.shape[0]
    cat_list = []
    for j in list(range(categories)):
        summary_sample = df_i.at[j,'sorted_samples'].split(',')
        cat_list.append(summary_sample)

    # Plot
    peak_counts = upsetplot.from_memberships(cat_list, data = df_i['count'])
    upsetplot.plot(peak_counts)
    # Save before show(): show() can leave an empty figure behind for savefig.
    plt.savefig(os.path.join(args.outpath, file_name))
    plt.show()
Example #19
def FindERG(data, depth=2, sort_num=20, verbose=False):
    '''
    Find endogenous reference genes.

    Parameters
    ----------
    data: pandas.DataFrame
        DataFrame of data points with each entry in the form ['gene_id', 'sample1', ...]
    depth: int
        Accuracy of the endogenous reference gene search; must be at least 2.
        The larger the number, the fewer genes are screened out and the higher the accuracy.
    sort_num: int
        Size of the endogenous reference gene filter.
        When the sample is large, it is recommended to increase this value.
    verbose: bool
        Make the function noisy, printing times and results.
    Returns
    -------
    result: list
        A list of endogenous reference genes.
    '''
    lp = []
    if verbose:
        import time, datetime
        start = time.time()
    if depth == 1:
        print('depth must be at least 2')
        return
    if len(data.columns) <= 2:
        print('the number of samples must be larger than 2')
        return
    if depth > (len(data.columns) - 1):
        print('depth is larger than the number of samples')
        return
    count = 0
    result = []  #result
    bucket_size = 1000
    for i in itertools.combinations(data.columns[0:depth], 2):
        count = count + 1
        test = data.replace(0, np.nan).dropna()

        last_std = pd.DataFrame()
        for k in range(0, len(data), bucket_size):
            test1 = test[i[0]].iloc[k:k + bucket_size]
            test2 = test[i[1]].iloc[k:k + bucket_size]
            data_len = len(test1.values)
            table1 = np.array(test1.values.tolist() * data_len).reshape(
                data_len, data_len)
            table2 = pd.DataFrame(table1.T / table1)
            table2.index = test1.index

            table4 = np.array(test2.values.tolist() * data_len).reshape(
                data_len, data_len)
            table5 = pd.DataFrame(table4.T / table4)
            table5.index = test1.index

            table6 = (table2 - table5).std()
            table6.index = test1.index
            l_std = table6.sort_values()[0:sort_num]
            if (k == 0):
                last_std = l_std
            else:
                last_std = pd.concat([last_std, l_std])

        last_std = last_std.sort_values()[0:sort_num]

        testlist = list(last_std.index)
        #print(testlist)
        lp.append(testlist)
        if (count == 1):
            result = testlist
        if (count > 1):
            result = list(set(testlist).intersection(set(result)))  #Venn
    example = from_memberships(lp, data=range(len(lp)))
    if verbose:
        end = time.time()
        print("calculate time:%.2fs" % (end - start))
        print(result)
    if depth > 2:
        plot(example)
    return result
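# A hedged usage sketch for FindERG; the expression table below is made up:
import pandas as pd

expr = pd.DataFrame({'sampleA': [10.0, 5.0, 8.0, 2.0],
                     'sampleB': [12.0, 4.0, 9.0, 3.0],
                     'sampleC': [11.0, 6.0, 7.0, 2.5]},
                    index=['g1', 'g2', 'g3', 'g4'])
# With three samples, depth can be at most len(expr.columns) - 1 = 2.
stable_genes = FindERG(expr, depth=2, sort_num=2, verbose=True)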
Example #20
def intersect(data, upset_plot=False):
    """Return all possible distinct intersections and optionally generate an upset plot.

    Parameters
    ----------
    data : pandas DataFrame
    upset_plot : bool

    Returns
    -------
    df_3 : DataFrame with the list of matches and counts for each comparison
    upset : data formatted to generate upset plots
    """

    #convert data column names to strings
    col_names = []
    for i in data.columns:
        col_names.append(str(i))
    data.columns = col_names

    #total groups
    n = len(col_names)

    #generate all possible combinations for intersection analysis
    comb_list = []
    for i in range(2, n + 1):
        comb_list.append(list(combinations(col_names, i)))

    #find all unique elements and drop na
    unique_elem = []
    tot_elements = []
    for i in col_names:
        unique_elem.append(set(data[i].dropna().to_list()))
    for i in range(len(unique_elem)):
        tot_elements.append([col_names[i], len(unique_elem[i])])

    print("Total unique number of items", tot_elements)

    #make dictionary for unique elements
    dict_ = {}
    for i in range(len(col_names)):
        dict_.update({col_names[i]: unique_elem[i]})

    #intersect data, find distinct sets, drop na
    list_intersect = []
    for i in comb_list:
        for j in i:
            if len(j) == 2:
                func_1 = "set(data['{x}'].dropna().to_list()).intersection(data['{y}'].dropna().to_list())".format(
                    x=j[0], y=j[1])
                inter = eval(func_1)
                dict_adj = []
                for i, k in dict_.items():
                    if i != j[0] and i != j[1]:
                        dict_adj.append(k)
                for i in dict_adj:
                    unique = inter - i
                    inter = unique
                list_intersect.append([j, list(inter), len(list(inter))])
            else:
                func_2 = "set(data['{x}'].dropna().to_list()).intersection(data['{y}'].dropna().to_list())".format(
                    x=j[0], y=j[1])
                cond = "i != j[0] and i != j[1]"
                for _ in range(2, len(j)):
                    decor_1 = ".intersection(data['{z}'].dropna().to_list())".format(
                        z=j[_])
                    decor_2 = " and i != j[{x}]".format(x=_)
                    func_2 = func_2 + decor_1
                    cond = cond + decor_2
                inter = eval(func_2)
                dict_adj = []
                for i, k in dict_.items():
                    if eval(cond):
                        dict_adj.append(k)
                for i in dict_adj:
                    unique = inter - i
                    inter = unique
                list_intersect.append([j, list(inter), len(list(inter))])

    #obtain elements found only in individual datasets
    for j in range(len(col_names)):
        for i in list_intersect:
            if col_names[j] in set(i[0]):
                unique_elem[j] = unique_elem[j] - set(i[1])
        unique_elem[j] = list(unique_elem[j])

    #create dataframe for elements found only in individual datasets
    df_1 = pd.DataFrame(col_names)
    df_1[1] = unique_elem
    df_1[2] = [len(i) for i in unique_elem]

    #combine intersect data and unique elements found within individual sets
    df_2 = pd.DataFrame(list_intersect)
    df_3 = pd.concat([df_1, df_2])
    df_3.columns = ["Intersection", "Match", "Counts"]
    df_3 = df_3.reset_index(drop=True)

    #generate data structure for upset plot
    upset = df_3.drop("Match", axis=1)
    lst_1 = df_3["Intersection"].to_list()
    lst_2 = df_3["Intersection"].to_list()
    for i in range(len(col_names)):
        lst_1[i] = [lst_2[i]]
    upset = from_memberships(lst_1, data=upset["Counts"])

    #make upset plot
    if upset_plot:
        plot(upset)

    return df_3, upset
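# The eval-based intersection logic above can be written without eval; a
# hypothetical equivalent sketch (exclusive intersection of one combination,
# where `uniques` maps each column name to its set of elements, as dict_ does):
def exclusive_intersection(data, combo, uniques):
    # Elements shared by every column in `combo` and absent from all others.
    inter = set.intersection(*(set(data[c].dropna()) for c in combo))
    for name, members in uniques.items():
        if name not in combo:
            inter -= members
    return inter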
Example #21
#    print('Filtering reads for those mapping to small transcript intersections...')
#    filt = set([
#        frozenset(memb_set)
#        for memb_set, count in memb_set_count.items()
#        if count < 10
#    ])
#    plot_read_to_transcripts = {
#        read: transcripts
#        for read, transcripts in read_to_transcripts.items()
#        if frozenset(transcripts) not in filt
#    }
#    print('done.')
#else:
plot_read_to_transcripts = read_to_transcripts

memb = from_memberships(plot_read_to_transcripts.values())
upsetplot.plot(memb,
               subset_size='count',
               show_counts=True,
               sort_by='cardinality')
out_f = '{}.upset.png'.format(out_pref)
print('Plotting Upset to ', out_f)
plt.savefig(out_f, format='png')

n_multimap_viral_endo = sum([
    1 for transcripts in read_to_transcripts.values()
    if viral_gene in set(transcripts) and len(transcripts) > 1
])
n_mapped_endo = sum([
    1 for transcripts in read_to_transcripts.values()
    if len(set(transcripts) & set(endo_trans)) > 1
])
Example #22
def test_from_memberships_no_data(typ):
    with pytest.raises(ValueError, match='at least one set'):
        from_memberships([])
    with pytest.raises(ValueError, match='at least one set'):
        from_memberships([[], []])
    with pytest.raises(ValueError, match='strings'):
        from_memberships([[1]])
    with pytest.raises(ValueError, match='strings'):
        from_memberships([[1, 'str']])
    with pytest.raises(TypeError):
        from_memberships([1])

    out = from_memberships([typ([]),
                            typ(['hello']),
                            typ(['world']),
                            typ(['hello', 'world']),
                            ])
    exp = pd.DataFrame([[False, False, 1],
                        [True, False, 1],
                        [False, True, 1],
                        [True, True, 1]],
                       columns=['hello', 'world', 'ones']
                       ).set_index(['hello', 'world'])['ones']
    assert isinstance(exp.index, pd.MultiIndex)
    assert_series_equal(exp, out)

    # test sorting by name
    out = from_memberships([typ(['hello']),
                            typ(['world'])])
    exp = pd.DataFrame([[True, False, 1],
                        [False, True, 1]],
                       columns=['hello', 'world', 'ones']
                       ).set_index(['hello', 'world'])['ones']
    assert_series_equal(exp, out)
    out = from_memberships([typ(['world']),
                            typ(['hello'])])
    exp = pd.DataFrame([[False, True, 1],
                        [True, False, 1]],
                       columns=['hello', 'world', 'ones']
                       ).set_index(['hello', 'world'])['ones']
    assert_series_equal(exp, out)