Example 1
    def _get_html_stats(self):
        from sequana.tools import StatsBAM2Mapped
        from easydev import precision
        data = StatsBAM2Mapped(self.directory + "bwa_mem_stats.json").data
        html = "Reads with Phix: %s %%<br>" % precision(
            data['contamination'], 3)

        # add HTML table
        if "R2_mapped" in data.keys():
            df = pd.DataFrame({
                'R1': [data['R1_mapped'], data['R1_unmapped']],
                'R2': [data['R2_mapped'], data['R2_unmapped']]
            })
        else:
            df = pd.DataFrame({'R1': [data['R1_mapped'], data['R1_unmapped']]})
        df.index = ['mapped', 'unmapped']

        datatable = DataTable(df, "bwa_bam")
        datatable.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 30,
            'scrollCollapse': 'true',
            'dom': 'irtpB',
            'paging': 'false',
            'buttons': ['copy', 'csv']
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')
        # append the table and its JS helper to the returned HTML
        html += "{} {}".format(html_tab, js)

        html += "Unpaired: %s <br>" % data['unpaired']
        html += "duplicated: %s <br>" % data['duplicated']
        return html
Example 2
    def _get_df(self):
        if self._df is None:
            self.reset()
            N = 0
            
            all_results = []
            for read in self.data:
                res = []
                # count reads
                N += 1
                if (N % 10000) == 0:
                    print("Read %d sequences" % N)
                # res[0] = read length
                res.append(read.query_length)
                # res[1] = GC content, as a fraction
                c = collections.Counter(read.query_sequence)
                res.append((c['g'] + c['G'] + c['c'] + c['C']) / float(sum(c.values())))
                # res[2:6] = SNR of the A, C, G, T channels (from the 'sn' tag)
                snr = list([x for x in read.tags if x[0] == 'sn'][0][1])
                res = res + snr
                # res[6] = ZMW name
                res.append(read.qname.split('/')[1])
                
                # aggregate results
                all_results.append(res)

            self._df = pd.DataFrame(
                all_results,
                columns=['read_length', 'GC_content', 'snr_A', 'snr_C',
                         'snr_G', 'snr_T', 'ZMW'])
            self._N = N
            self.reset()     
        return self._df
Example 3
 def _get_df(self, method_name):
     data = getattr(self, method_name)()
     df = pd.DataFrame({
         "name": self.get_projects(),
         "value": data,
         "url": self.get_urls()})
     return df
Example 4
    def get_actg_content(self, max_sample=500000):
        try:
            self.alignments
        except AttributeError:
            self._set_alignments()
        import re

        # the longest sequence defines how many positions to track
        max_length = max(len(a.seq) for a in self.alignments)
        A = np.zeros(max_length)
        C = np.zeros(max_length)
        G = np.zeros(max_length)
        T = np.zeros(max_length)
        N = np.zeros(max_length)

        for a in self.alignments:
            # add one count at each position where the base occurs
            A[[m.start() for m in re.finditer("A", a.seq)]] += 1
            C[[m.start() for m in re.finditer("C", a.seq)]] += 1
            G[[m.start() for m in re.finditer("G", a.seq)]] += 1
            T[[m.start() for m in re.finditer("T", a.seq)]] += 1
            N[[m.start() for m in re.finditer("N", a.seq)]] += 1

        df = pd.DataFrame({'A': A, 'C': C, 'G': G, 'T': T, 'N': N})

        df = df.divide(df.sum(axis=1), axis=0)

        return df
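As an aside, the per-position counts above can also be computed without regular expressions by viewing each sequence as a character array. A minimal self-contained sketch of that alternative (the sequences variable is made up for illustration; in the method above the strings come from self.alignments):

    import numpy as np
    import pandas as pd

    sequences = ["ACGTN", "ACG", "TTACG"]  # hypothetical reads
    max_length = max(len(s) for s in sequences)
    counts = pd.DataFrame(0, index=range(max_length), columns=list("ACGTN"))
    for seq in sequences:
        # view the string as an array of single characters
        arr = np.frombuffer(seq.encode("ascii"), dtype="S1").astype(str)
        for base in "ACGTN":
            counts.loc[np.where(arr == base)[0], base] += 1
    # normalise each position by its coverage, as get_actg_content does
    counts = counts.divide(counts.sum(axis=1), axis=0)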
Example 5
    def plot_unknown_barcodes(self, N=20):
        ub = self.data['UnknownBarcodes']
        df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})
        if "unknown" in df.index and len(df) == 1:
            df.loc['known'] = [0 for i in df.columns]

        # if the data contains undetermined barcodes only, the dataframe is
        # made of N lanes with a single 'unknown' entry
        # keep the N barcodes with the largest total count across lanes
        S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
        data = df.loc[S][::-1]

        data.columns = ["Lane {}".format(x) for x in data.columns]
        from matplotlib import rcParams
        rcParams['axes.axisbelow'] = True
        pylab.figure(figsize=(10, 8))
        ax = pylab.gca()
        data.plot(kind="barh", width=1, ec="k", ax=ax)
        rcParams['axes.axisbelow'] = False
        pylab.xlabel("Number of reads", fontsize=12)
        pylab.ylabel("")
        pylab.grid(True)
        # use the actual lane labels rather than assuming lanes 1..N
        pylab.legend(list(data.columns), loc="lower right")
        try:
            pylab.tight_layout()
        except Exception as err:
            print(err)
        return data
Example 6
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only columns 0, 2 and 3 to save memory, which is
        # required on very large files
        try:
            # each concat in the loop below gets slower as the accumulated
            # frame grows; for 15M reads this has a big cost, so a chunksize
            # of 1M is better than 1000 while remaining reasonable in memory
            reader = pd.read_csv(self.filename,
                                 sep="\t",
                                 header=None,
                                 usecols=[0, 2, 3],
                                 chunksize=1000000)
        except pd.errors.ParserError:
            # happens with only_classified_output when no classified read
            # was found; not handled yet
            raise NotImplementedError

        for chunk in reader:
            try:
                self._df  # raises AttributeError on the first chunk only
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their counts
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except KeyError:
            pass  # taxon 0 may not be present
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
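Note that a reader created with chunksize is itself an iterable of dataframes, so the accumulation loop above can be collapsed into a single concatenation. A minimal sketch, assuming a tab-separated file named kraken.out (a made-up name):

    import pandas as pd

    reader = pd.read_csv("kraken.out", sep="\t", header=None,
                         usecols=[0, 2, 3], chunksize=1000000)
    # one concat over all chunks avoids growing the frame repeatedly
    df = pd.concat(reader, ignore_index=True)
    df.columns = ["status", "taxon", "length"]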
Example 7
    def get_flags_as_df(self):
        """Returns flags as a dataframe

        .. doctest::

            >>> from sequana import BAM, sequana_data
            >>> b = BAM(sequana_data('test.bam'))
            >>> df = b.get_flags_as_df()
            >>> df.sum()
            1       1000
            2        484
            4          2
            8          2
            16       499
            32       500
            64       477
            128      523
            256       64
            512        0
            1024       0
            2048       0
            dtype: int64

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        flags = self.get_flags()
        bits = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
        data = [(this, [flag & this for flag in flags]) for this in bits]
        df = pd.DataFrame(dict(data))
        df = df > 0
        return df
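The decomposition used here is plain bitwise arithmetic and is easy to test outside a BAM file. A small self-contained illustration with made-up flag values:

    import pandas as pd

    flags = [99, 147, 4, 1024]  # arbitrary SAM-like flag values
    bits = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    df = pd.DataFrame({bit: [flag & bit for flag in flags] for bit in bits}) > 0
    # each column tells whether that bit is set; df.sum() counts reads
    # per bit, as in the doctest above
    print(df.sum())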
Example 8
    def _get_df(self):
        if self._df is None:
            self.reset()
            N = 0

            all_results = []
            for read in self.data:
                res = []
                # count reads
                N += 1
                if (N % 10000) == 0:
                    print("Read %d sequences" % N)
                # res[0] = read length
                res.append(read.query_length)
                # res[1] = GC content, in percent
                c = collections.Counter(read.query_sequence)
                res.append(100 * (c['g'] + c['G'] + c['c'] + c['C']) /
                           float(sum(c.values())))

                # aggregate results
                all_results.append(res)

            self._df = pd.DataFrame(all_results,
                                    columns=['read_length', 'GC_content'])
            self._N = N
            self.reset()
        return self._df
Example 9
    def imshow_qualities(self):
        """Qualities

        ::

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.imshow_qualities()
            from pylab import tight_layout; tight_layout()

        """
        tiles = self._get_tile_info()
        d = defaultdict(list)
        for tile, seq in zip(tiles['tiles'], self.qualities):
            d[tile].append(seq)
        self.data_imqual = [pd.DataFrame(d[key]).mean().values
                            for key in sorted(d.keys())]

        from biokit.viz import Imshow
        im = Imshow(self.data_imqual)
        im.plot(xticks_on=False, yticks_on=False, origin='lower')
        pylab.title("Quality per tile", fontsize=self.fontsize)
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("tile number")
Example 10
 def find_motif_fasta(self, filename, motif, window=200,
         local_threshold=None, global_threshold=None):
     from sequana import FastA
     data = FastA(filename)
     N = len(data)
     from easydev import Progress
     pb = Progress(N)
     df = {
         "query_name": [],
         "hit": [],
         "length": [],
         "start": [],
         "end": []
     }
     if global_threshold is None:
         # honor the argument; fall back to the instance-wide default
         global_threshold = self.global_threshold
     for i, item in enumerate(data):
         X1, S = self.find_motif_from_sequence(
             item.sequence, motif, window=window,
             local_threshold=local_threshold)
         if S >= global_threshold:
             df['query_name'].append(item.name)
             df['start'].append(0)
             df['end'].append(len(item.sequence))
             df['length'].append(len(item.sequence))
             df['hit'].append(S)
         pb.animate(i + 1)
     df = pd.DataFrame(df)
     return df
Example 11
    def get_table_dependencies(self):
        """ Return dependencies of Sequana.
        """
        dep_list = easydev.get_dependencies('sequana')
        # if installed with conda, this will be empty
        if len(dep_list) == 0:
            return ""

        project_name = list()
        version = list()
        link = list()
        pypi = 'https://pypi.python.org/pypi/{0}'
        for dep in dep_list:
            version.append(dep.version)
            project_name.append(dep.project_name)
            link.append(pypi.format(dep.project_name))
        df = pd.DataFrame({
            'package': project_name,
            'version': version,
            'link': link
        })
        df['sort'] = df['package'].str.lower()
        df.sort_values(by='sort', axis=0, inplace=True)
        df.drop('sort', axis=1, inplace=True)
        datatable = DataTable(df, 'dep')
        datatable.datatable.datatable_options = {
            'paging': 'false',
            'bFilter': 'false',
            'bInfo': 'false',
            'bSort': 'false'
        }
        datatable.datatable.set_links_to_column('link', 'package')
        js = datatable.create_javascript_function()
        html = datatable.create_datatable()
        return js + '\n' + html
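Since pandas 1.1, the temporary 'sort' column used above can be avoided with the key argument of sort_values. A minimal sketch with made-up package names:

    import pandas as pd

    df = pd.DataFrame({'package': ['Zlib', 'abc', 'Numpy'],
                       'version': ['1.0', '2.0', '3.0']})
    # case-insensitive sort without an auxiliary column
    df = df.sort_values(by='package', key=lambda s: s.str.lower())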
Example 12
def read_blasr_result(file_blasr, blasr_columns):
    res_blasr = []
    with open(file_blasr, 'r') as f:
        for line in f:
            # split on any whitespace; the previous double-space replacement
            # missed runs of three or more spaces
            res_blasr.append(line.split())

    res_blasr = pd.DataFrame(res_blasr)
    res_blasr.columns = blasr_columns

    # convert to numeric
    res_blasr["qLength"] = res_blasr["qLength"].astype(int)
    res_blasr["score"] = res_blasr["score"].astype(float)
    # extract alignment columns
    res_align = res_blasr[[
        "qName", "qLength", "qAlignedSeq", "tAlignedSeq", "matchPattern"
    ]]
    # drop alignment columns (useless and too heavy)
    # res_blasr.drop("qAlignedSeq",axis=1)
    res_blasr.drop(["qAlignedSeq", "tAlignedSeq", "matchPattern"],
                   axis=1,
                   inplace=True)

    return res_blasr, res_align
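Assuming the blasr output is strictly whitespace-delimited with no embedded blanks inside fields, the manual reading loop can be replaced by a single read_csv call. A sketch (read_blasr_table is a hypothetical name, not part of the original code):

    import pandas as pd

    def read_blasr_table(file_blasr, blasr_columns):
        # sep=r"\s+" collapses any run of blanks and strips newlines
        return pd.read_csv(file_blasr, sep=r"\s+", header=None,
                           names=blasr_columns)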
Example 13
    def get_df(self):
        import pandas as pd
        data = {}
        for sample, filename in zip(self.sample_names, self.filenames):
            df = pd.read_csv(filename)
            df = df.groupby("kingdom")['percentage'].sum()
            # if a taxon is obsolete, the kingdom is empty.
            # We will set the kingdom as Unclassified and raise a warning
            # if the count is > 5%
            if " " in df.index:
                percent = df.loc[" "]
                if percent > 5:
                    logger.warning(
                        "Found {}% of taxons in obsolete category".format(
                            percent))
                if "Unclassified" in df.index:
                    df.loc['Unclassified'] += df.loc[' ']
                    df.drop(" ", inplace=True)
                else:
                    df.loc['Unclassified'] = df.loc[' ']
                    df.drop(" ", inplace=True)
            data[sample] = df

        df = pd.DataFrame(data)
        df = df.sort_index(ascending=False)
        return df
Example 14
    def summary(self):
        """ Add a summary table of the DGE filters.
        """
        Sdefault = self.rnadiff.summary()
        self.rnadiff.log2_fc = 1
        S1 = self.rnadiff.summary()

        # set options
        options = {
            'scrollX': 'true',
            'pageLength': 20,
            'scrollCollapse': 'true',
            'dom': '',
            'buttons': []
        }

        S = pd.concat([Sdefault, S1])

        N = len(Sdefault)
        df = pd.DataFrame({
            'comparison_link': [1] * len(S),
            'comparison': S.index.values,
            'Description': ['Number of DGE (any FC)'] * N +
                           ['Number of DGE (|FC| > 1)'] * N,
            'Down': S['down'].values,
            'Up': S['up'].values,
            'Total': S['all'].values
        })
        df = df[['comparison', 'Description', 'Down', 'Up', 'Total',
                 'comparison_link']]

        df['comparison_link'] = [f"#{name}_table_all" for name in Sdefault.index] + \
                                [f"#{name}_table_sign" for name in Sdefault.index]

        dt = DataTable(df, 'dge')
        dt.datatable.set_links_to_column('comparison_link',
                                         'comparison',
                                         new_page=False)
        dt.datatable.datatable_options = options
        js_all = dt.create_javascript_function()
        html = dt.create_datatable(float_format='%d')
        self.sections.append({
            'name': "Summary",
            'anchor': 'filters_option',
            'content': f"""<p>Here below is a summary of the final Differential
Gene Expression (DGE) analysis. You can find two entries per comparison. The
first one has no filter except for an adjusted p-value of 0.05. The second
shows the expressed genes with a filter on the log2 fold change of 1 (a factor
2 on a linear scale). Clicking on any of the links will lead you to the section
of the corresponding comparison. {js_all} {html} </p>"""
        })
Example 15
    def get_data_reads(self):

        lanes = []
        names = []
        reads = []

        for i, lane in enumerate(self.data["ConversionResults"]):
            total = 0
            for this in lane['DemuxResults']:
                lanes.append(lane['LaneNumber'])
                names.append(this['SampleId'])
                reads.append(this['NumberReads'])
                total += this['NumberReads']

            # if only undetermined reads are present (no sample sheet), no
            # sample data is found: there are 0 determined reads and no
            # associated names, so we store a "Determined" entry with 0 reads
            if total == 0:
                names.append("Determined")
                reads.append(0)
                lanes.append(lane["LaneNumber"])

            if "Undetermined" in lane:
                names.append("Undetermined")
                lanes.append(i + 1)
                reads.append(lane['Undetermined']['NumberReads'])
            else:
                names.append("Undetermined")
                lanes.append(i + 1)
                reads.append(0)

        df = pd.DataFrame({"lane": lanes, "name": names, "count": reads})
        return df
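The long-format frame returned here aggregates naturally with groupby or pivot_table. A short self-contained sketch with a toy stand-in for the real data:

    import pandas as pd

    df = pd.DataFrame({"lane": [1, 1, 2], "name": ["A", "Undetermined", "A"],
                       "count": [100, 10, 80]})  # toy get_data_reads() output
    per_lane = df.groupby("lane")["count"].sum()  # total reads per lane
    # lane x sample table; samples absent from a lane become NaN
    matrix = df.pivot_table(index="lane", columns="name", values="count")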
Example 16
    def merge(self, overlap=0.2):
        # note: the 'overlap' fraction argument is currently unused; any
        # overlap of at least one base triggers a merge
        df = pd.concat([self.df1, self.df2]).sort_values(['chr', 'start'])
        # if peaks overlap by at least one base, we merge them and label them
        # with common information; otherwise we report the original peak

        merged = []
        prev = None
        overlaps = 0
        N1 = 0
        N2 = 0
        N12 = 0
        skip_next = True
        for k, current in df.iterrows():
            if skip_next:
                prev = current
                skip_next = False
                continue

            # current and prev overlap if current straddles prev's start or
            # end, or if one interval is included in the other
            if current['start'] <= prev['start'] and current['end'] >= prev['start']:
                overlapping = True
                N12 += 1
            elif current['start'] <= prev['end'] and current['end'] >= prev['end']:
                overlapping = True
                N12 += 1
            elif current['start'] >= prev['start'] and current['end'] <= prev['end']:
                overlapping = True
                N12 += 1
            else:
                overlapping = False
                if prev['name'].startswith('1_vs_6_7'):
                    N1 += 1
                elif prev['name'].startswith('2_vs_6_7'):
                    N2 += 1

            if overlapping:
                m = min(current['start'], prev['start'])
                M = max(current['end'], prev['end'])
                data = current.copy()
                data['start'] = m
                data['end'] = M
                data['stop'] = M  # FIXME: same as 'end'; settle on one column
                data['category'] = 'both'
                merged.append(data)
                skip_next = True
            else:
                merged.append(prev)
                skip_next = False

            prev = current
        df = pd.DataFrame(merged)
        df = df.reset_index(drop=True)
        return df
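For reference, because the concatenated frame is sorted by start position, the three overlap branches above are equivalent to the standard interval-intersection test. A tiny self-contained sketch (intervals_overlap is a hypothetical helper, not part of the original class):

    def intervals_overlap(prev, current):
        # single test equivalent to the three branches, assuming rows are
        # sorted by 'start' so that prev['start'] <= current['start']
        return current['start'] <= prev['end'] and current['end'] >= prev['start']

    assert intervals_overlap({'start': 1, 'end': 5}, {'start': 4, 'end': 9})
    assert not intervals_overlap({'start': 1, 'end': 5}, {'start': 6, 'end': 9})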
Example 17
    def get_data(self, ontologies, include_negative_enrichment=True, fdr=0.05):

        if isinstance(ontologies, str):
            ontologies = [ontologies]
        else:
            assert isinstance(ontologies, list)
        # First, we select the required ontologies and build a common data set
        all_data = []
        for ontology in ontologies:
            data = self.enrichment[ontology]['result']
            if isinstance(data, dict):
                # there was only one hit; wrap it in a list
                data = [data]
            all_data.extend(data)
        data = all_data

        # remove unclassified GO terms
        unclassified = [
            x for x in data if x['term']['label'] == "UNCLASSIFIED"
        ]
        logger.info("Found {} unclassified".format(len(unclassified)))
        data = [x for x in data if x['term']['label'] != "UNCLASSIFIED"]

        df = pd.DataFrame(data)
        if len(df) == 0:
            return df
        else:
            logger.info("Found {} GO terms".format(len(df)))

        df = df.query("number_in_list!=0").copy()
        logger.info(
            "Found {} GO terms with at least 1 gene in reference".format(
                len(df)))

        # extract the ID and label
        df['id'] = [x['id'] for x in df['term']]
        df['label'] = [x['label'] for x in df['term']]

        # some extra information for convenience
        df["pct_diff_expr"] = df['number_in_list'] * 100 / df[
            'number_in_reference']
        df["log2_fold_enrichment"] = pylab.log2(df['fold_enrichment'])
        df["abs_log2_fold_enrichment"] = abs(pylab.log2(df['fold_enrichment']))

        # Some users may want to include GO terms with fold enrichment
        # significantly below 1, others may not.
        if include_negative_enrichment is False:
            df = df.query("fold_enrichment>=1").copy()
            logger.info(
                "Found {} GO terms after keeping only positive enrichment".
                format(len(df)))

        # filter out terms whose FDR exceeds the threshold
        df = df.query("fdr<=@fdr").copy()
        logger.info("Found {} GO terms after keeping only FDR<{}".format(
            len(df), fdr))

        return df
Example 18
    def to_html(self):
        data = self.data

        html = "Reads with Phix: %s %%<br>" % precision(data['contamination'], 3)

        # add HTML table
        if "R2_mapped" in data.keys():
            df = pd.DataFrame({
              'R1': [data['R1_mapped'], data['R1_unmapped']],
              'R2': [data['R2_mapped'], data['R2_unmapped']]})
        else:
            df = pd.DataFrame({
              'R1': [data['R1_mapped'], data['R1_unmapped']]})
        df.index = ['mapped', 'unmapped']

        html += "Unpaired: %s <br>" % data['unpaired']
        html += "duplicated: %s <br>" % data['duplicated']
        return html
Example 19
 def _dict_to_df(self, region_list, annotation):
     """ Convert a list of region dictionaries into a dataframe.
     """
     colnames = ["chr", "start", "end", "size", "mean_cov", "max_cov",
                 "mean_rm", "mean_zscore", "max_zscore", "gene_start",
                 "gene_end", "type", "gene", "strand", "product"]
     if not annotation:
         # without annotation, only the first nine columns are available
         colnames = colnames[:9]
     merge_df = pd.DataFrame(region_list, columns=colnames)
     int_column = ["start", "end", "size"]
     merge_df[int_column] = merge_df[int_column].astype(int)
     if annotation:
         merge_df.rename(columns={"gene": "gene_name"}, inplace=True)
         # maybe let the user set what he wants
         return merge_df.loc[~merge_df["type"].isin(
             FilteredGenomeCov._feature_not_wanted)]
     return merge_df
Example 20
    def get_taxonomy_biokit(self, ids):
        """Retrieve taxonomy information given a list of taxon identifiers

        :param list ids: list of taxons as strings or integers. Could also
            be a single string or a single integer
        :return: a dataframe

        .. note:: the first call loads all taxons in memory, which takes a
            few seconds, but subsequent calls are much faster
        """
        # filter the lineage to keep only information from the main ranks,
        # that is superkingdom, kingdom, phylum, class, order, family,
        # genus and species
        ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                 'species')

        # a single integer or string is also accepted
        if isinstance(ids, (int, str)):
            ids = [ids]

        if len(ids) == 0:
            return pd.DataFrame()

        logger.info('Retrieving taxon using biokit.Taxonomy')

        lineage = [self.tax.get_lineage_and_rank(x) for x in ids]
        # Now, we filter each lineage to keep only relevant ranks
        # We drop the 'no rank' and create a dictionary
        # Not nice but works for now
        results = []
        for i, this in enumerate(lineage):
            default = dict.fromkeys(ranks, ' ')
            for entry in this:
                if entry[1] in ranks:
                    default[entry[1]] = entry[0]
                elif entry[1] == "superkingdom":
                    default["kingdom"] = entry[0]
            # The scientific name is the last entry tagged as no_rank,
            # following species (TODO: check this assumption).
            # e.g. 351680 and 151529 share the same 7 ranks, so the
            # scientific name is needed to differentiate them.
            # By default, we take the last one. If species or genus, we
            # repeat the term.
            try:
                default['name'] = this[-1][0]
            except IndexError:
                default['name'] = "root (ambiguous kingdom)"
            results.append(default)

        df = pd.DataFrame.from_records(results)
        df.index = ids
        df = df[list(ranks) + ['name']]
        df.index = df.index.astype(int)

        return df
Example 21
    def __init__(self, data):

        # try to convert the input to a dataframe; if pandas cannot handle
        # it, assume it is already in a usable form and keep it as-is
        try:
            self.df = pd.DataFrame(data)
        except (ValueError, TypeError):
            self.df = data

        self.xmax = self.df.shape[1]
        self.X = None
Example 22
def get_heatmap_df():
    """a simple example to play with and perform test"""
    import pandas as pd

    df = pd.DataFrame({
        "A": [1, 0, 1, 1],
        "B": [0.9, 0.1, 0.6, 1],
        "C": [0.5, 0.2, 0, 1],
        "D": [0.5, 0.2, 0, 1],
    })
    return df
Example 23
    def summary(self):
        """ Get a summary DataFrame from a RNADiff analysis.
        """
        summary = pd.DataFrame(
            {
                k: {x: len(self.dr_gene_lists[k][x]) for x in self.dr_gene_lists[k]}
                for k in self.dr_gene_lists
            }
        )

        return summary
Example 24
    def __init__(self, x, y=None, verbose=False):
        self.verbose = verbose

        self.xy_names = ['x', 'y']
        if isinstance(x, pd.DataFrame):
            self.df = x.copy()
            columns = list(self.df.columns)
            columns[0] = 'x'
            columns[1] = 'y'
            self.xy_names = self.df.columns[0:2]
            self.df.columns = columns
        elif y is None:
            # could be a list of lists, a pandas-compatible dictionary
            self.df = pd.DataFrame(x)
            if self.df.shape[1] != 2:
                if self.df.shape[0] == 2:
                    print("warning transposing data")
                    self.df = self.df.transpose()
        elif x is not None and y is not None:
            self.df = pd.DataFrame({'x':x, 'y':y})
Example 25
    def plot_dendogram(
        self,
        max_features=5000,
        transform_method="log",
        method="ward",
        metric="euclidean",
    ):
        # for info about metric and methods: https://tinyurl.com/yyhk9cl8

        assert transform_method in ["log", "anscombe", None]
        # first we take the normalised data
        from sequana.viz import clusterisation
        from sequana.viz import dendogram

        cluster = clusterisation.Cluster(self.counts_norm)
        # cluster = clusterisation.Cluster(self.df[self.sample_names])
        if transform_method is not None:
            data = cluster.scale_data(transform_method=transform_method,
                                      max_features=max_features)
            df = pd.DataFrame(data[0])
            df.index = data[1]
            df.columns = self.counts_norm.columns
        else:
            df = pd.DataFrame(self.counts_norm)
            # df.index = data[1]
            df.columns = self.counts_norm.columns

        d = dendogram.Dendogram(
            df.T,
            metric=metric,
            method=method,
            side_colors=list(self.design_df.group_color.unique()),
        )

        # Convert groups into numbers for Dendrogram category
        group_conv = {
            group: i
            for i, group in enumerate(self.design_df[self.condition].unique())
        }
        d.category = self.design_df[self.condition].map(group_conv).to_dict()
        d.plot()
Example 26
    def summary(self):
        """ Get a summary DataFrame from an RNADiff analysis.
        """
        # build from a list of tuples: a set literal here would be rejected
        # by pandas ("Set type is unordered") and would not preserve order
        summary = pd.DataFrame(
            [(x, len(self.gene_lists[x])) for x in self.gene_lists.keys()]
        )

        df = summary.set_index(0)
        df.columns = ["_vs_".join(self.condition_names)]
        return df
Example 27
    def plot_feature_most_present(self):
        """Plot the most expressed feature of each sample as a percentage
        of the sample's total counts."""

        df = []

        # idxmax gives, for each sample, the identifier of its most
        # expressed gene; items() replaces the deprecated iteritems()
        for x, y in self.counts_raw.idxmax().items():

            most_exp_gene_count = self.counts_raw.stack().loc[y, x]
            total_sample_count = self.counts_raw.sum().loc[x]

            df.append({
                "label": x,
                "gene_id": y,
                "count": most_exp_gene_count,
                "total_sample_count": total_sample_count,
                "most_exp_percent": most_exp_gene_count / total_sample_count * 100,
            })

        df = pd.DataFrame(df).set_index("label")
        df = pd.concat([self.design_df, df], axis=1)
        df = pd.concat([self.design_df, df], axis=1)

        pylab.clf()
        p = pylab.barh(
            df.index,
            df.most_exp_percent,
            color=df.group_color,
            zorder=10,
            lw=1,
            ec="k",
            height=0.9,
        )

        for idx, rect in enumerate(p):
            pylab.text(
                2,  # * rect.get_height(),
                idx,  # rect.get_x() + rect.get_width() / 2.0,
                df.gene_id.iloc[idx],
                ha="center",
                va="center",
                rotation=0,
                zorder=20,
            )

        self._format_plot(
            # title="Counts monopolized by the most expressed gene",
            # xlabel="Sample",
            xlabel="Percent of total reads", )
        pylab.tight_layout()
Example 28
 def boxplot_mapq_concordance(self, method):
     # method can only be bwa for now
     assert method == "bwa"
     data = self._get_data(method)
     df = pd.DataFrame(data, columns=["mapq", "length", "concordance"])
     pylab.clf()
     pylab.boxplot([df[df.mapq == i]['concordance'] for i in range(1, 61)])
     pylab.xlabel("mapq")
     pylab.ylabel("concordance")
     pylab.grid()
     tt = [10, 20, 30, 40, 50, 60]
     pylab.xticks(tt, tt)
Example 29
    def summary(self):

        return pd.DataFrame(
            {
                "log2_fc": self._log2_fc,
                "alpha": self._alpha,
                "up": len(self.gene_lists["up"]),
                "down": len(self.gene_lists["down"]),
                "all": len(self.gene_lists["all"]),
            },
            index=[self.name],
        )
Example 30
    def __init__(self, data, na=0):
        """.. rubric:: Constructor

        Plots the content of square matrix that contains correlation values.

        :param data: input can be a dataframe (Pandas), or list of lists (python) or
            a numpy matrix. Note, however, that values must be between -1 and 1. If not,
            or if the matrix (or list of lists) is not squared, then the correlation is
            computed automatically and stored in the :attr:`df` attribute instead of
            the raw data.
        :param na: replace NA values with this value (default 0)

        The :attr:`params` contains some tunable parameters for the colorbar in the
        :meth:`plot` method.

        ::

            # can be a list of lists, the correlation matrix is then a 2x2 matrix
            c = corrplot.Corrplot([[1,1], [2,4], [3,3], [4,4]])

        """
        super(Corrplot, self).__init__()
        #: The input data is stored in a dataframe and must therefore be
        #: compatible (list of lists, dictionary, matrices...)
        self.df = pd.DataFrame(data, copy=True)

        compute_correlation = False

        w, h = self.df.shape
        if self.df.max().max() > 1 or self.df.min().min() < -1:
            compute_correlation = True
        if w != h:
            compute_correlation = True
        if list(self.df.index) != list(self.df.columns):
            compute_correlation = True

        if compute_correlation:
            print("Computing correlation")
            cor = self.df.corr()
            self.df = cor

        # replace NA with zero
        self.df.fillna(na, inplace=True)

        #: tunable parameters for the :meth:`plot` method.
        self.params = {
            'colorbar.N': 100,
            'colorbar.shrink': .8,
            'colorbar.orientation': 'vertical'
        }
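The fallback logic above is easy to exercise on its own: any non-square frame, or one with values outside [-1, 1], ends up replaced by its correlation matrix. A condensed sketch of part of that decision, using plain pandas:

    import pandas as pd

    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [2, 4, 5, 8]})
    w, h = df.shape
    if w != h or df.max().max() > 1 or df.min().min() < -1:
        df = df.corr()  # square matrix with values in [-1, 1]
    df = df.fillna(0)  # mirrors the NA handling in the constructor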