Example #1
0
 def names(self, names):
     """Rename the elements of the targets/samples.

     NOTE(review): presumably the setter of the `names` property; the
     decorator is outside of the visible code.

     Args:
         names (list, dict): Either a sequence of new names with the same
             length as the current element names (matched by position), or
             a mapping of current names to new names. The keys of a mapping
             must all be current element names.

     Note:
         Invalid input is logged as an error and ends the program through
         sys.exit(1).
     """
     spacer.info('\n\n')
     prv_names = self.names
     # isinstance also accepts dict subclasses (e.g. OrderedDict), which an
     # exact type comparison would wrongly treat as a plain sequence
     if not isinstance(names, dict):
         if len(names) != len(prv_names):
             spacer.error('\n')
             logger.error('The passed list of element names ({}) must '
                          'have the same length as the current one ({}).'
                          .format(len(names), len(prv_names)))
             sys.exit(1)
         df = pd.DataFrame({'current names: ': prv_names,
                            'passed names: ': names})
         logger.info('Ensure previous and passed names align:'
                     '\n{}\n'.format(df.to_string()))
         names = dict(zip(prv_names, names))
     else:
         inv = [k for k in names if k not in prv_names]
         if inv:
             spacer.error('\n')
             logger.error('Keys of the passed mapping are not containd in '
                          'current element names: {}'.format(inv))
             sys.exit(1)
     self._update_data_columns(names, func_name='rename')
     # a mapping may cover only some elements: carry the colors of all
     # elements over and only swap the key where a new name was passed
     # NOTE(review): assumes elements missing from the mapping keep their
     # name in `_update_data_columns` -- confirm
     self._colors = {names.get(pn, pn): color
                     for pn, color in self._colors.items()}
     if self._ctrl and (self._ctrl in names):
         self._ctrl = names[self._ctrl]
     logger.info('`{}` renamed. New names:\n{}'.format(self.name,
                                                       self.names))
Example #2
0
def get_ensgs(names, species):
    """Return the ensg keys for a list of gene names.

    DPre references the ensembl gene annotation v.96 located at
    DPre/gene_ann. If a gene name has multiple ensg keys, the additional
    entries of this gene appear last in the DataFrame regardless of the
    input order.

    Args:
        names (list, pandas.Index): The collection of names to return ensg
            keys for
        species (str): The origin species of the genes, 'mouse' or 'human'.

    Returns:
        pandas.DataFrame of the annotation entries matching the passed
        names, with the gene names in the column `name`.

    Note:
        Any exception raised while matching is logged as an error and ends
        the program through sys.exit(1).
    """
    ref = _get_gene_ann(species)
    try:
        ann = ref.reindex(ref.index[ref.name.isin(names)]).reset_index()
        if ann.name.duplicated().any():
            # split into first occurrences (ordered like the input) and
            # the remaining duplicates, which are attached at the end
            dupl = pd.Index(ann.name).duplicated()
            ann_dr = ann[~dupl]
            ann_du = ann[dupl]
            ann_dr = ann_dr.set_index('name').reindex(names).reset_index()
            ann_dr.rename({'index': 'name'}, axis=1, inplace=True)
            # DataFrame.append was removed in pandas 2.0; concat is the
            # equivalent that works across versions
            ann = pd.concat([ann_dr, ann_du], sort=False)
            ann.index = np.arange(ann.shape[0])
        else:
            ann = ann.set_index('name').reindex(names).reset_index()
            ann.rename({'index': 'name'}, axis=1, inplace=True)
        return ann
    except Exception as e:
        logger.error(
            '{}\nDPre references the ensembl gene annotaiton v.96. '
            'Differently annotated datasets may cause problems.'.format(e))
        sys.exit(1)
Example #3
0
def plot_color_legend(labels, colors, ncolumns=1, filename='color_legend'):
    """Draw and save a standalone legend linking labels to colors.

    Useful for marking sub-groups in samples/targets elements.

    Args:
        labels (list): the labels shown in the legend
        colors (list): the colors belonging to the labels, in the same
            order. Colors must be interpretable by matplotlib: for example,
            'w', #ffffff, (1,1,1) all refer to white.
        ncolumns (int, optional): the number of columns in the legend.
            Defaults to 1.
        filename (str, optional): the filename to save the legend. Defaults
            to 'color_legend'.

    Note:
        Colors not recognized by matplotlib are logged as an error and end
        the program through sys.exit(1).
    """
    spacer.info('\n\n')
    assert len(colors) == len(labels), 'colors and labels differ in length'
    unrecognized = list(filter(lambda col: not is_color_like(col), colors))
    if unrecognized:
        logger.error('The following colors are not recognized as colors by '
                     'matplotlib: {}'.format(unrecognized))
        sys.exit(1)

    filename, pp = _open_file(filename)
    fig, ax = plt.subplots(1, 1, figsize=(4, 4))
    _clean_axes(np.array([ax]))

    # one legend patch per color/label pair
    handles = []
    for idx, color in enumerate(colors):
        handles.append(Patch(color=color, label=labels[idx]))
    ax.legend(handles=handles, loc='center', ncol=ncolumns)
    _save_file(fig, filename=filename, pp=pp, close_pp=True)

    save_dir = os.path.abspath(os.curdir)
    logger.info('Color legend generated and saved at {}/{}'.format(
        save_dir, filename))
Example #4
0
    def reorder(self, order):
        """Bring the elements of the targets/samples into a new order, inplace.

            Args:
                order (list): the element names in the desired order

            Note:
                The passed order must hold every current element name and
                nothing else. Otherwise an error is logged and the program
                ends through sys.exit(1).
        """
        spacer.info('\n\n')
        unknown = [o for o in order if o not in self.names]
        missing = [n for n in self.names if n not in order]
        # missing names are reported before unknown ones
        if missing:
            spacer.error('')
            logger.error('The passed order misses current element names: {}'
                         .format(missing))
            sys.exit(1)
        if unknown:
            spacer.error('')
            logger.error('Invalid element names. Passed elements not contained '
                         'in current element names:\n{}'.format(unknown))
            sys.exit(1)

        self._update_data_columns(order)
        logger.info('`{}` reordered. New order of elements:\n{}'
                    .format(self.name, self.names))
Example #5
0
 def check_path(direc):
     """Make sure `direc` exists and holds at least one *.tsv file.

     Both failures are logged as an error and end the program through
     sys.exit(1).
     """
     def abort(template):
         # shared exit route: spacer line, error message, exit code 1
         spacer.info('')
         logger.error(template.format(os.path.abspath(direc)))
         sys.exit(1)

     if not os.path.exists(direc):
         abort('Could not change directory to {}\nCheck the path.')
     if not glob.glob(direc + '/*.tsv'):
         abort('No *.tsv files found in {}\nCheck the path.')
Example #6
0
def _get_gene_ann(species):
    """Load and return the pickled gene annotation of `species`.

    Valid species are 'mouse' and 'human'. Any other input is logged as an
    error and ends the program through sys.exit(1).
    """
    path = os.path.dirname(__file__)
    if species not in ('mouse', 'human'):
        logger.info('')
        logger.error('Invalid input for species: `{}`. Valid are `mouse` and '
                     '`human`'.format(species))
        sys.exit(1)

    # the reference files are located relative to this module
    if species == 'mouse':
        ann_file = '/../gene_ann/mg_ensembl96_GRCm38.p6.gzip'
    else:
        ann_file = '/../gene_ann/hg_GRCh38.p12.gzip'
    return pd.read_pickle(path + ann_file)
Example #7
0
    def slice_elements(self, elements, name=None, inplace=False, log=True):
        """Restrict the targets/samples to a specific list of elements.

        Args:
            elements (list): the element names to keep. Must not be empty
                and must all be contained in the current element names.
            name (str, optional): the name given to the sliced instance.
                Defaults to None, which keeps the current name.
            inplace (bool, optional): slice this instance instead of a
                shallow copy of it. Defaults to False.
            log (bool, optional): log the result of the slicing. Defaults
                to True.

        Note:
            Empty input and names not found in the current element names
            are logged as an error and end the program through sys.exit(1).

        Returns:
            sliced: the sliced targets/samples copy if `inplace` is False,
                otherwise nothing is returned
        """
        if log:
            spacer.info('\n\n')
        if elements is None or not len(elements):
            spacer.error('')
            logger.error('The list of elements cannot be empty.')
            sys.exit(1)
        unknown = [e for e in elements if e not in self.names]
        if unknown:
            spacer.error('')
            logger.error('Invalid element names. Passed elements not contained '
                         'in current element names:\n{}'.format(unknown))
            sys.exit(1)

        sliced = self if inplace else copy.copy(self)
        if sliced._type_name == 'samples':
            # a control that is sliced away can no longer be referenced
            if sliced._ctrl and (sliced._ctrl not in elements):
                sliced._ctrl = None
        elif self._type_name == 'targets':
            # NOTE(review): this clears the containers of `self`, not of
            # `sliced`; after a shallow copy both presumably share the same
            # objects -- confirm this is intended
            self._trg_sims.clear()
            self._gene_sims.clear()
        sliced._update_data_columns(elements)
        for key in sliced.names:
            if key not in elements:
                sliced._colors.pop(key, None)
        sliced.name = name or sliced.name

        if log:
            logger.info('`{}` sliced:'.format(self.name))
            spacer.info('')
            sliced._log_init(log)
        if not inplace:
            return sliced
Example #8
0
def _open_file(filename):
    """Resolve the output filename and open a PDF container if needed.

    If the filename does not end with a file extension supported by
    matplotlib, config.SAVE_FORMAT is appended as the extension.

    Args:
        filename (str): the filename to save a plot at, with or without a
            file extension

    Returns:
        tuple: the (completed) filename and a PdfPages object for '.pdf'
            files, otherwise the filename and None

    Note:
        A config.SAVE_FORMAT not supported by matplotlib is logged as an
        error and ends the program through sys.exit(1).
    """
    # the figure only serves to query the canvas; close it again so it does
    # not stay registered in pyplot on every call
    probe = plt.figure()
    valid = probe.canvas.get_supported_filetypes()
    plt.close(probe)

    # match '.<format>' rather than the bare format, otherwise a name that
    # merely ends with the letters of a format (e.g. 'maps' -> 'ps') is
    # mistaken for a name that already carries an extension
    extensions = tuple('.' + val_format for val_format in valid)
    if not filename.endswith(extensions):
        if config.SAVE_FORMAT in valid:
            filename += '.' + config.SAVE_FORMAT
        else:
            logger.error(
                'The value for config.SAVE_FORMAT `{}` is not '
                'supported by matplotlib. Valid formats are:\n{}'.format(
                    config.SAVE_FORMAT, ', '.join(list(valid.keys()))))
            sys.exit(1)

    if filename.endswith('.pdf'):
        return filename, PdfPages(filename)
    else:
        return filename, None
Example #9
0
    def _log_init(self, log):
        """Validate that data was passed, then log the initiated instance.

        Args:
            log: when falsy, only the input is checked and nothing is
                logged. The value 'from_cmd' adds the targets/samples
                prefix to the argument name in the error message.

        Note:
            If neither expression nor differential data is present, an
            error is logged and the program ends through sys.exit(1).
        """
        is_targets = self._type_name == 'targets'
        diff_n = 'markergenes' if is_targets else 'diff. genes'
        if not (self._has_diff or self._has_expr):
            spacer.error('')
            if log == 'from_cmd':
                cmd_pref = self._type_name.lower()+'_'
            else:
                cmd_pref = ''
            logger.error('Failed to init {}:\nAt least one of `{}expression` '
                         'and `{}` must be passed.'
                         .format(self._type_name, cmd_pref, diff_n))
            sys.exit(1)
        if not log:
            return

        # number of genes that are differential in at least one element
        if self._has_diff:
            n_diff = self._diff.any(axis=1).sum()
            n_diff_descr = ', of which {} {}'.format(n_diff, diff_n)
        elif self._has_expr:
            n_diff_descr = ''

        # wording that depends on the kind of instance
        if is_targets:
            diffmgs = 'Markergenes'
            trg_smp_arg = 'Down markergenes loaded: {}'.format(self._down_mgs)
        elif self._type_name == 'samples':
            diffmgs = 'Differential genes'
            ctrl = self._ctrl if self._ctrl else False
            trg_smp_arg = 'Control passed: {}'.format(ctrl)

        element_listing = ',\n\t\t'.join(self.names)
        template = ('{}\nNew {}-instance created: `{}`\n\t{} ({}):\n\t\t{}\n\t'
                    'Detected genes: {}{}\n\t{} loaded: {}\n\t'
                    'Expression data loaded: {}\n\t{}\n')
        logger.info(template.format(
            log_init, self._type_name, self.name, self._type_name, len(self),
            element_listing, len(self._detec_genes), n_diff_descr, diffmgs,
            self._has_diff, self._has_expr, trg_smp_arg))
Example #10
0
def annotate(ensgs, species):
    """Translate mouse or human ensg keys into gene names.

    DPre references the ensembl gene annotation v.96 located at
    DPre/gene_ann.

    Args:
        ensgs (list, pandas.Index): The collection of ensg keys to annotate
        species (str): The origin species of the genes, 'mouse' or 'human'.

    Returns:
        pandas.Index of the gene names, in the order of the passed keys

    Note:
        Any exception raised during the lookup is logged as an error and
        ends the program through sys.exit(1).
    """
    ref = _get_gene_ann(species)
    try:
        matched = ref.reindex(ensgs)
        gene_names = matched.name.values
        return pd.Index(gene_names)
    except Exception as err:
        logger.error(
            '{}\nDPre references the ensembl gene annotaiton v.96. '
            'Differently annotated datasets may cause problems.'.format(err))
        sys.exit(1)
Example #11
0
 def check_up_down_genelists():
     """Validate the gene list directory input of the enclosing scope.

     Returns:
         tuple: the directory to load the gene lists from and whether down
             gene lists were passed along with it

     Note:
         Differing file counts or file names between the up and the down
         directory are logged as an error and end the program through
         sys.exit(1).
     """
     # a single path: no down gene lists
     if not isinstance(diff_genes_dir, (list, tuple)):
         check_path(diff_genes_dir)
         return diff_genes_dir, False
     # a sequence of any length other than 2 is reduced to its first element
     if len(diff_genes_dir) != 2:
         check_path(diff_genes_dir[0])
         return diff_genes_dir[0], False

     # exactly 2 elements were passed, i.e. up + down gene list input
     up_path, down_path = diff_genes_dir[0], diff_genes_dir[1]
     check_path(up_path)
     check_path(down_path)
     up_files = glob.glob(up_path + '/*.tsv')
     down_files = glob.glob(down_path + '/*.tsv')

     # up and down must have the same number of elements
     if len(up_files) != len(down_files):
         msg = (
             'Number of up- and down genelist files differ. Found {} '
             '*.tsv files in up directory\n{}\n{} *tsv files in down '
             'directory:\n{}\n'.format(
                 len(up_files), os.path.abspath(up_path),
                 len(down_files), os.path.abspath(down_path)))
         logger.error(msg)
         sys.exit(1)

     # to pair up and down safely, the bare file names must be identical
     f_up = [f[f.rfind(os.sep) + 1:] for f in up_files]
     f_down = [f[f.rfind(os.sep) + 1:] for f in down_files]
     singles = [n for n in set((*f_up, *f_down))
                if (n not in f_up) or (n not in f_down)]
     if singles:
         logger.error('Names of up- and down genelist files differ. '
                      'Names only found in one of the two '
                      'directories ({}):\n{}'.format(
                          len(singles), singles))
         sys.exit(1)
     # hand back the up directory and flag that down lists were passed
     return up_path, True
Example #12
0
 def check_input_type():
     """Inspect the first input file for its index column and input type.

     Returns:
         tuple: the input type, 'deseq2' or 'genelist (<mgtype>)', and the
             name of the column holding the gene keys

     Note:
         If there is no `ensg` column and the first value of the first
         column does not start with 'ENS', an error is logged and the
         program ends through sys.exit(1).
     """
     test_df = pd.read_csv(files[0], sep='\t')
     if 'ensg' in test_df.columns:
         index_col = 'ensg'
     else:
         # fall back on the first column if it looks like ensg keys
         index_col = test_df.columns[0]
         first_key = str(test_df[index_col][0])
         if not first_key.startswith('ENS'):
             spacer.error('')
             logger.error(
                 'The *.tsv files holding the gene keys do not '
                 'have  a column `ENS*` nor do they seem to have '
                 'an ensg index in the first column: {}, {}, ...'.format(
                     *test_df[index_col][:2].tolist()))
             sys.exit(1)

     # deseq2 output is recognized by its exact list of column names
     deseq2_cols = [
         'Unnamed: 0', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat',
         'pvalue', 'padj'
     ]
     if test_df.columns.tolist() == deseq2_cols:
         inp_type = 'deseq2'
     else:
         inp_type = 'genelist ({})'.format(genelists_mgtype)
     return inp_type, index_col
Example #13
0
def add_diff_genes_from_z(samples, diff_z_threshold=2):
    """Derive up/down differential genes from z-scores relative to the control.

    The control z-scores are subtracted from the z-scores of every element.
    Genes with a difference above `diff_z_threshold` are marked as up, genes
    with a difference below `-diff_z_threshold` as down. The result is
    stored in `samples._diff` and `samples._has_diff` is set to True.

    Args:
        samples: the samples instance to add differential genes to. Must
            have a control.
        diff_z_threshold (optional): the absolute z-score difference a gene
            must exceed to count as differential. Defaults to 2.

    Note:
        Samples without a control are logged as an error and end the
        program through sys.exit(1).
    """
    if not samples._ctrl:
        logger.error('The samples `{}` were not initialized with a control.'
                     'To generate a list of differential genes, a control is '
                     'required.'.format(samples.name))
        sys.exit(1)

    z_expr = samples._expr.xs('z', axis=1, level=1, drop_level=False)

    def subtract_ctrl(smp):
        return smp - z_expr.loc(1)[(samples._ctrl, 'z')]

    z_change = z_expr.apply(subtract_ctrl)

    is_up = z_change > diff_z_threshold
    up = z_change.mask(~is_up, False).astype(bool)
    up.columns = pd.MultiIndex.from_product([['up'], up.columns.unique(0)])
    is_down = z_change < -diff_z_threshold
    down = z_change.mask(~is_down, False).astype(bool)
    down.columns = pd.MultiIndex.from_product([['down'],
                                               down.columns.unique(0)])

    samples._diff = pd.concat((up, down), axis=1)
    samples._has_diff = True
    spacer.info('\n')
    counts = samples._diff.sum().unstack(0).reindex(samples.names)
    logger.info(
        'Differential genes were added to the sample. Number of marker '
        'genes:\n{}\n'.format(counts.to_string()))
Example #14
0
    def get_colors(self, order=None):
        """Return the colors of the targets/samples elements in `order`.

        Args:
            order (list, optional): The element names to return the colors
                for, in the desired order. Defaults to the element order
                of the instance.

        Note:
            Element names that are not contained are logged as an error and
            end the program through sys.exit(1). If no color has been set
            for a requested element, its color is set to white.

        Return:
            list of colors
        """
        if not self._colors:
            self.set_colors(['#ffffff'], log=False)
            logger.warning('Colors have not been set for `{}`. All have been '
                           'set to default: white'.format(self.name))

        if order is None:
            order = self.names
        else:
            unknown = [o for o in order if o not in self.names]
            if unknown:
                logger.error('Failed to get colors of `{}`. The passed order '
                             'contains elements other than the element names: '
                             '{}'.format(self.name, unknown))
                sys.exit(1)
            uncolored = [o for o in order if o not in self._colors]
            if uncolored:
                # fall back on white for elements without a set color
                self.set_colors(dict.fromkeys(uncolored, '#ffffff'),
                                log=False)
                logger.warning('Failed to get some colors of `{}`. The passed '
                               'order contains elements without a set color: {}'
                               '. Colors for these were set to default: white.'
                               .format(self.name, uncolored))

        colors = []
        for element in order:
            colors.append(self._colors[element])
        return colors
Example #15
0
    def _is_expr_diff_compatible(self, override_namematcher=False):
        """Check if expression and differential input share element labels

        The element names of the instance are compared by position with the
        names at the last column level of `self._diff`. On success, the
        columns of `self._diff` are relabelled with the expression names.

        Args:
            override_namematcher: Whether a misalignment between expression- and
                gene list names should be ignored. Useful when names refer to
                the same data but labels differ partially.

        Note:
            `self._diff` is modified inplace. A differing number of elements
            or mismatching names (without override) are logged and end the
            program through sys.exit(1).
        """
        expr_ns = self.names
        diff_ns = self._diff.columns.unique(-1).tolist()
        diff_n = 'markergenes' if self._type_name == 'targets' else 'diff. genes'
        spacer.info('\n\n')
        if len(expr_ns) != len(diff_ns):
            # the only tolerated mismatch: samples whose differential input
            # has exactly one element less than the expression input
            if (len(expr_ns) == len(diff_ns)+1) and self._type_name == 'samples':
                msg = ('{} ({}) has one element less than expression ({}). '
                       .format(diff_n, len(diff_ns), len(expr_ns)))
                if self._ctrl:
                    # assume the missing element is the control
                    msg += ('An empty element `{}` (control) will be added to '
                            'diff. genes.'.format(self._ctrl))
                    logger.info(msg)

                    # add all-False up and down columns for the control so the
                    # differential elements match the expression element names
                    self._diff[('up', self._ctrl)] = False
                    self._diff[('down', self._ctrl)] = False
                    self._diff.sort_index(axis=1, inplace=True)
                    diff_ns = self._diff.columns.unique(-1).tolist()
                else:
                    # without a control name the gap cannot be filled; note
                    # that this message is logged at info level before exiting
                    msg += ('If the expression data has a control that is '
                            'missing in diff. genes, you can resolve this by '
                            'passing the control name for samples initiation.')
                    logger.info(msg)
                    sys.exit(1)

            else:
                logger.error('Passed expression ({}) and {} ({}) do not have '
                             'the same number of elements. Check input.'
                             .format(len(expr_ns), diff_n, len(diff_ns)))
                sys.exit(1)

        # compare the names position by position; df is only used for logging
        align = [e_n == d_n for e_n, d_n in zip(expr_ns, diff_ns)]
        df = pd.DataFrame({'expression names': expr_ns,
                           diff_n + ' names': diff_ns,
                           'match': align})
        if all(align) or override_namematcher:
            spacer.info('')
            msg = diff_n + ' names have been overriden by expression names. '
            # relabel the differential columns: every value of column level 0
            # combined with the expression names
            lvls = self._diff.columns.unique(0), expr_ns
            self._diff.columns = pd.MultiIndex.from_product(lvls)
            # the warning is emitted whenever the override is set, also when
            # all names matched
            if override_namematcher:
                logger.warning('{}CAUTION: {}- and expression names '
                               'do not match for all elements:\n{}\nMake sure '
                               'data aligns to avaid mislabeling!'
                               .format(msg, diff_n, df.to_string()))
        else:
            spacer.error('')
            logger.error(('{0}- and expression element names do '
                          'not match:\n{1}\nRename elements in expression '
                          '/{0} input files or override the '
                          '{0} names by setting `override_namematcher` '
                          'to True.'.format(diff_n, df.to_string())))
            sys.exit(1)
Example #16
0
def preset_targets(get,
                   sort=False,
                   preset_colors=True,
                   color_legend_filename=True,
                   color_legend_ncols=1):
    """Generate one of the predefined targets instances and return it.

        Pick a reference dataset for comparison. Mouse (Hutchins et al. 2017,
        NAR) and Human (Abugessaisa et al. 2017, FANTOM5 project) are included.
        Specific domains can be picked for both species references. If the
        targets are initiated with 'preset_colors', a color legend is generated
        and saved in the current working directory. Custom presets can be
        created by adding a folder (with an 'm ' or 'h ' prefix) to
        DPre/preset_targets.

    Args:
        get (str): the desired preset. Valid options are 'mouse', 'human',
            'm embryonic', 'm germ cells', 'm neural crest',
            'm surface ectoderm', 'm neuroectoderm', 'm mesoderm', 'm endoderm',
            'm blood mesoderm', 'h surface ectoderm', 'h neuroectoderm',
            'h mesoderm', 'h endoderm', 'h blood mesoderm'; m = mouse, h = human
        sort (bool, optional): Sort the loaded element names alphabetically.
            Defaults to False.
        preset_colors (bool, optional): Tries to initiate the targets with preset
            colors either from colors.tsv in the respective preset directory or
            when not found from config.preset_targets_colors. Defaults to True.
        color_legend_filename (bool, str, optional): The filename when a preset
            color legend is drawn from config.preset_col_legend. When True, a
            filename is inferred from the preset name, a str is set as the
            filename. Defaults to True. When None, the color legend is not
            drawn.
        color_legend_ncols (int, optional): Number of columns in the color
            legend. Defaults to 1.

    Returns:
        t: the preset targets instance

    Note:
        An invalid preset name is logged as an error and ends the program
        through sys.exit(1). Data read from *.tsv files (or from the split
        expression pickles) is written back as a *.gzip pickle into the
        preset directory.
    """
    path = os.path.dirname(__file__)
    # any folder in DPre/preset_targets is potentially valid
    valid = os.listdir(os.path.join(path, '..', 'preset_targets'))
    if get not in valid:
        spacer.info('')
        logger.error(
            '`{}` is not a valid preset target. Valid ones are {}'.format(
                get, valid))
        sys.exit(1)

    # try to get .gzip markergene and expression input, if not found try .tsv
    get_dir = '{}/../preset_targets/{}'.format(path, get)
    expr = mgs = None
    if os.path.exists('{}/markergenes.gzip'.format(get_dir)):
        mgs = pd.read_pickle('{}/markergenes.gzip'.format(get_dir))
    elif os.path.exists('{}/markergenes.tsv'.format(get_dir)):
        mgs = pd.read_csv('{}/markergenes.tsv'.format(get_dir),
                          sep='\t',
                          index_col=0,
                          header=[0, 1])
        mgs.to_pickle('{}/markergenes.gzip'.format(get_dir))

    if os.path.exists('{}/expression.gzip'.format(get_dir)):
        expr = pd.read_pickle('{}/expression.gzip'.format(get_dir))
    elif os.path.exists('{}/expression.tsv'.format(get_dir)):
        expr = pd.read_csv('{}/expression.tsv'.format(get_dir),
                           sep='\t',
                           index_col=0,
                           header=[0, 1])
        expr.to_pickle('{}/expression.gzip'.format(get_dir))
    elif os.path.exists('{}/expression_h1.gzip'.format(get_dir)):
        # expression split over two pickles: merge them column-wise
        expr1 = pd.read_pickle('{}/expression_h1.gzip'.format(get_dir))
        expr2 = pd.read_pickle('{}/expression_h2.gzip'.format(get_dir))
        expr = pd.concat([expr1, expr2], axis=1)
        expr.to_pickle('{}/expression.gzip'.format(get_dir))

    if sort:
        # a preset may come with only one of the two inputs, the other one
        # is still None and must be skipped
        if mgs is not None:
            mgs.sort_index(axis=1, inplace=True)
        if expr is not None:
            expr.sort_index(axis=1, inplace=True)

    # explicit part of the script that might need adjustment for custom presets
    if get == 'human':
        args = {'name': 'human FANTOM5 library', 'species': 'human'}
    elif get == 'mouse':
        args = {'name': 'mouse lineages', 'species': 'mouse'}
    elif get.startswith('h '):
        args = {'name': get[2:] + ' lineage', 'species': 'human'}
    elif get.startswith('m '):
        args = {'name': get[2:] + ' lineage', 'species': 'mouse'}
    else:
        # without this branch `args` would be unbound further down
        spacer.info('')
        logger.error(
            'Cannot infer the species of preset `{}`. Custom preset folders '
            'must be named with an `m ` (mouse) or `h ` (human) '
            'prefix.'.format(get))
        sys.exit(1)
    # init targets
    from DPre.main.targets import targets
    t = targets(markergenes=mgs, expression=expr, log=False, **args)
    logger.info(
        'Default targets `{}` created, name: `{}`, elements: {}'.format(
            get, t.name, len(t)))

    # try to get colors first through a file, then through config
    if preset_colors:
        try:
            df_colors = pd.read_csv('{}/colors.tsv'.format(get_dir),
                                    sep='\t',
                                    index_col=0)
            t.set_colors(dict(zip(df_colors.index, df_colors.color)),
                         log=False)
        except FileNotFoundError:
            if get in config.preset_targets_colors:
                t.set_colors([config.preset_targets_colors[get]], log=False)
            else:
                logger.warning(
                    'No colors found for preset targets {}'.format(get))
        # draw a colorlegend if defined in config
        if get in config.preset_col_legend and color_legend_filename:
            # equality (not identity) is kept on purpose so that the accepted
            # inputs stay the same as before
            if color_legend_filename == True:  # noqa: E712
                filename = get+'_color_legend'
            else:
                filename = color_legend_filename
            util.plot_color_legend(*config.preset_col_legend[get],
                                   ncolumns=color_legend_ncols,
                                   filename=filename)
    return t
Example #17
0
    def check_metric(metric, trg, smp, diff):
        """Validate the similarity metric against the data of trg and smp.

        Args:
            metric: 'euclid', 'cosine', 'pearson', 'intersect' or None. For
                None, 'cosine' is picked if both have expression data,
                otherwise 'intersect' if both have differential data.
            trg: the targets instance
            smp: the samples instance
            diff: whether the differential (change in) similarity is
                requested, which requires a control in the samples for the
                expression based metrics

        Returns:
            metric: the validated (or inferred) metric

        Note:
            Every failed check is logged as an error and ends the program
            through sys.exit(1).
        """
        # check if the samples and targets have equivalent data to compare
        if smp._type_name != 'samples':
            logger.error('The passed `samples` are not of type DPre.smaples.')
            sys.exit(1)
        if metric is None:
            if trg._has_expr and smp._has_expr:
                metric = 'cosine'
            elif trg._has_diff and smp._has_diff:
                metric = 'intersect'
            else:
                logger.error('Either initiate targets and samples with '
                             'expression or with markergenes and diff genes.')
                sys.exit(1)
        msg = 'The {} were initiated without {} data. Cannot use `{}` similarity.'
        if metric not in ('euclid', 'cosine', 'pearson', 'intersect'):
            logger.error('Invalid `metric` input: `{}`. Valid are `euclid`, '
                         '`cosine`, `pearson`, and `intersect`'.format(metric))
            sys.exit(1)
        if metric in ['euclid', 'cosine', 'pearson']:
            if not trg._has_expr:
                logger.error(msg.format('targets', 'expression', metric))
                sys.exit(1)
            elif not smp._has_expr:
                logger.error(msg.format('samples', 'expression', metric))
                sys.exit(1)
            # this check used to be nested behind the sys.exit(1) above and
            # could never run
            if diff and not smp._ctrl:
                logger.error(
                    'To plot the changes in transcriptional similarity '
                    'with metric = `{}`, the samples must be initiated '
                    'with a control. For absolute, pass differential = '
                    'False.'.format(metric))
                sys.exit(1)
        elif metric == 'intersect':
            if not trg._has_diff:
                logger.error(msg.format('targets', 'merker gene', metric))
                sys.exit(1)
            elif not smp._has_diff:
                logger.error(msg.format('samples', 'diff genes', metric))
                sys.exit(1)

        return metric
Example #18
0
def _check_args(trg,
                smp,
                metric,
                differential,
                hide_distance_bar=None,
                reorder_to_distance_bar=None,
                distance_bar_range=None,
                cluster_hmx=None,
                display_markergenes=False):
    """General purpose plot argument checker; returns (modified) input values"""
    def check_metric(metric, trg, smp, diff):
        # check if the samples and targets have equivalent data to compare
        if smp._type_name != 'samples':
            logger.error('The passed `samples` are not of type DPre.smaples.')
            sys.exit(1)
        if metric is None:
            if trg._has_expr and smp._has_expr:
                metric = 'cosine'
            elif trg._has_diff and smp._has_diff:
                metric = 'intersect'
            else:
                logger.error('Either initiate targets and samples with '
                             'expression or with markergenes and diff genes.')
                sys.exit(1)
        msg = 'The {} were initiated without {} data. Cannot use `{}` similarity.'
        if metric not in ('euclid', 'cosine', 'pearson', 'intersect'):
            logger.error('Invalid `metric` input: `{}`. Valid are `euclid`, '
                         '`cosine`, `pearson`, and `intersect`'.format(metric))
            sys.exit(1)
        if metric in ['euclid', 'cosine', 'pearson']:
            if not trg._has_expr:
                logger.error(msg.format('targets', 'expression', metric))
                sys.exit(1)
            elif not smp._has_expr:
                logger.error(msg.format('samples', 'expression', metric))
                sys.exit(1)
                if diff and not smp._ctrl:
                    logger.error(
                        'To plot the changes in transcriptional similarity '
                        'with metric = `{}`, the samples must be initiated '
                        'with a control. For absolute, pass differential = '
                        'False.'.format(metric))
                    sys.exit(1)
        elif metric == 'intersect':
            if not trg._has_diff:
                logger.error(msg.format('targets', 'merker gene', metric))
                sys.exit(1)
            elif not smp._has_diff:
                logger.error(msg.format('samples', 'diff genes', metric))
                sys.exit(1)

        return metric

    # checks for all plots
    metric = check_metric(metric, trg, smp, differential)
    if metric == 'intersect' and not differential:
        differential = True
        logger.warning('For the `intersect` similarity metric, '
                       'differential cannot be False. Was set to True.')

    # checks for 2 heatmaps
    if metric != 'intersect' and not hide_distance_bar and not smp._ctrl:
        hide_distance_bar = True
        logger.warning(
            '`hide_distance_bar` must be True '
            'for metric = `{}` if the samples data is '
            'initialized without a control. Set to True.'.format(metric))
    if reorder_to_distance_bar and hide_distance_bar:
        reorder_to_distance_bar = False
        logger.warning('When `reorder_to_distance_bar` is True, '
                       '`hide_distance_bar` cannot be True. Set '
                       'to False.')
    if reorder_to_distance_bar and cluster_hmx:
        cluster_hmx = False
        logger.warning('Both `reorder_to_distance_bar` and '
                       '`cluster_genes` were set as True. '
                       '`cluster_genes` will be ignored.')
    if not differential and distance_bar_range is not None:
        distance_bar_range = None
        logger.warning('The argument `distance_bar_range` is invalid '
                       'and ignored when differential = False. To apply'
                       ' a custom range, please use "heatmap_range".')

    if display_markergenes is not False:
        # checks for target_sim and ranked_sim plots
        val = ['mean', 'up', 'down']
        if display_markergenes not in val:
            logger.warning('Invalid input for display_markergenes: `{}`. '
                           'Valid are {}. Set to default `{}`'.format(
                               display_markergenes, val, val[0]))
            display_markergenes = val[0]

        if display_markergenes == val[2] and not trg._down_mgs:
            logger.error('Cannot display down markergene similarity because'
                         ' the targets were not initiated with down '
                         'markergenes.')
            sys.exit(1)

    return metric, differential, hide_distance_bar, reorder_to_distance_bar, \
           distance_bar_range, cluster_hmx, display_markergenes
Example #19
0
def _format_expr(expr, type_name, ctrl=None):
    """Validate and format the user expression input.

    If a filename is passed, the TSV file is read into a DataFrame that is
    indexed by the `ensg` column when present and by the first column
    otherwise. Annotation columns (`loc`, `name`, `tss_loc`, `strand`) and
    columns of datatype `object` are removed. The data must not contain NaN
    values, and if a control is passed it must be found in the column names.
    Data that already has multi-level columns is checked to hold exactly the
    (`name`, `log2`) and (`name`, `z`) columns and returned; single-level data
    is handed to `util._add_log2_z` to add the transformed data.

    Args:
        expr: Filename or DataFrame. The data to check. A passed DataFrame is
            not modified; column removal is done on a new DataFrame.
        type_name: 'targets' or 'samples', depending on caller. Currently not
            used inside this function.
        ctrl: Control name, only passed when called from samples

    Returns:
        expr: Expression DataFrame with log2- and z-transformed data at column
            level 1

    Raises:
        SystemExit: If the path does not exist, the data contains NaN values,
            the control is missing or the multi-level columns are invalid.
    """
    if not isinstance(expr, pd.DataFrame):
        if not os.path.exists(expr):
            spacer.info('')
            logger.error('Invalid path: {}\n'.format(os.path.abspath(expr)))
            sys.exit(1)

        expr = pd.read_csv(expr, sep='\t')
        index_col = 'ensg' if 'ensg' in expr.columns else expr.columns[0]
        expr = expr.set_index(index_col)

    # remove annotation columns; never drop in place because `expr` may be
    # the very DataFrame object the caller passed in
    met = [
        c for c in ('loc', 'name', 'tss_loc', 'strand') if c in expr.columns
    ]
    if met:
        expr = expr.drop(met, axis=1)

    # remove remaining non-numeric (object) columns with a warning
    inv = expr.columns[(expr.dtypes == object).values].tolist()
    if inv:
        spacer.warning('\n')
        logger.warning('Invalid columns of datatype `object` (often text) '
                       'in expression data: {}\nThese columns will be '
                       'removed.'.format(inv))
        expr = expr.drop(inv, axis=1)

    if expr.isna().any().any():
        spacer.error('\n')
        logger.error('Invalid expression data: data contains NaN values.')
        sys.exit(1)
    if ctrl and (ctrl not in expr.columns.unique(0)):
        spacer.error('\n')
        logger.error('The control name of the samples `{}` was not found in '
                     'the passed expression data.'.format(ctrl))
        sys.exit(1)

    if expr.columns.nlevels == 1:
        return util._add_log2_z(expr)

    # multi-level columns: the data is expected to already hold the log2- and
    # z-data, so every column must be one of the expected (name, type) pairs
    exp_idx = [(name, dt) for name in expr.columns.unique(0)
               for dt in ['log2', 'z']]
    idx = expr.columns.values.tolist()
    misma = [i for i in idx if i not in exp_idx]
    if misma:
        spacer.error('')
        msg = (
            '\tInvalid expresion data. When passing data with log2- and '
            'z-data, the columns must be a MultiIndex in which level 0 '
            'holds the names: [`name1`, ...] and level 1 the data types:'
            ' [`log2`, `z`]. Expected column indices ({}):\n\t\t{}\n\t '
            'Passed, unexpected column indices ({}):\n\t\t{}'.format(
                len(exp_idx), exp_idx, len(misma), misma))
        logger.error(msg)
        sys.exit(1)
    return expr
Example #20
0
def _format_diff_genes(diff_genes_dir, genelists_mgtype='up', type_name=None):
    """Validate and format the user differential gene input.

    A single directory with deseq2 output files, a single dir. with up-genelist
    files or 2 dirs. with up- and down- genelists are formatted here. A bool
    DataFrame that holds the up- (and optionally down) differential genes is
    returned.

    Args:
        diff_genes_dir: deseq2 directory, up-genelists dir. or list of up- and
            down genelist dirs.
        genelists_mgtype: Which genelist type to handle. Internally used for
            recursion
        type_name: 'targets' or 'samples', depending on caller. Samples that
            are initiated from genelists require both an up- and a down
            directory.

    Returns:
        formatted _diff DataFrame whose columns are named (`up`/`down`, name),
        where name is the *.tsv filename without directory and extension.
        Inside the internal recursive call for the down genelists, the plain
        list of columns (pd.Series) is returned instead.

    Raises:
        SystemExit: If a path is invalid or holds no *.tsv files, if the up-
            and down directories do not match, if no ensg keys are found or if
            samples genelists are passed without down genelists.
    """

    # check if path exists and contains TSV files
    def check_path(direc):
        if not os.path.exists(direc):
            spacer.info('')
            logger.error('Could not change directory to {}\nCheck the '
                         'path.'.format(os.path.abspath(direc)))
            sys.exit(1)
        if not glob.glob(direc + '/*.tsv'):
            spacer.info('')
            logger.error('No *.tsv files found in {}\nCheck the path.'.format(
                os.path.abspath(direc)))
            sys.exit(1)

    # check if up and down genelist directories are compatible; returns the
    # directory to read from and whether down genelists were passed
    def check_up_down_genelists():
        if not isinstance(diff_genes_dir, (list, tuple)):
            check_path(diff_genes_dir)
            return diff_genes_dir, False
        # a list of len != 2 is treated as one element without down mgs
        if len(diff_genes_dir) != 2:
            check_path(diff_genes_dir[0])
            return diff_genes_dir[0], False

        # 2 elements were passed, i.e. up+down genelist input
        up_path, down_path = diff_genes_dir
        check_path(up_path)
        check_path(down_path)
        # get the single TSV filenames
        up_files = glob.glob(up_path + '/*.tsv')
        down_files = glob.glob(down_path + '/*.tsv')
        # up and down must have the same number of elements
        if len(up_files) != len(down_files):
            msg = (
                'Number of up- and down genelist files differ. Found {} '
                '*.tsv files in up directory\n{}\n{} *tsv files in down '
                'directory:\n{}\n'.format(
                    len(up_files), os.path.abspath(up_path),
                    len(down_files), os.path.abspath(down_path)))
            logger.error(msg)
            sys.exit(1)
        # to match up and down together safely, filenames must be the same
        f_up = {os.path.basename(f) for f in up_files}
        f_down = {os.path.basename(f) for f in down_files}
        singles = sorted(f_up ^ f_down)
        if singles:
            logger.error('Names of up- and down genelist files differ. '
                         'Names only found in one of the two '
                         'directories ({}):\n{}'.format(
                             len(singles), singles))
            sys.exit(1)
        # return the up directory and that down mgs were passed
        return up_path, True

    # check whether deseq2 files or genelist files were passed, based on the
    # first file found
    def check_input_type(files):
        test_df = pd.read_csv(files[0], sep='\t')
        # check test dataframe for proper ensg index
        index_col = 'ensg'
        if 'ensg' not in test_df.columns:
            index_col = test_df.columns[0]
            if not str(test_df[index_col][0]).startswith('ENS'):
                spacer.error('')
                logger.error(
                    'The *.tsv files holding the gene keys do not '
                    'have  a column `ENS*` nor do they seem to have '
                    'an ensg index in the first column: {}, {}, ...'.format(
                        *test_df[index_col][:2].tolist()))
                sys.exit(1)
        # deseq2 output is identified based on the column names
        deseq2_cols = [
            'Unnamed: 0', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat',
            'pvalue', 'padj'
        ]
        if test_df.columns.tolist() == deseq2_cols:
            inp_type = 'deseq2'
        else:
            inp_type = 'genelist ({})'.format(genelists_mgtype)
        return inp_type, index_col

    # glue files together; return a list of pd.Series/ DataFrames (columns)
    def merge_files(files, inp_t, index_col):
        is_deseq2 = inp_t == 'deseq2'
        # for deseq2 only the key, log2FoldChange and padj columns are needed
        usecols = (0, 2, 6) if is_deseq2 else None
        diffs = []
        for file in files:
            # element name = filename without directory and extension; done
            # with os.path so that it works with any path separator
            name = os.path.splitext(os.path.basename(file))[0]
            d = pd.read_csv(file,
                            sep='\t',
                            index_col=index_col,
                            usecols=usecols)
            nan_rows = d.isna().any(axis=1)
            if nan_rows.any():
                spacer.info('')
                logger.warning(
                    '{} NaN values found and deleted in {}.tsv'.format(
                        nan_rows.sum(), name))
            if is_deseq2:
                # comparisons with NaN are False, NaN rows are never marked
                sig = d.padj < config.DESEQ2_P
                up_reg = (sig & (d.log2FoldChange > 0)).rename(('up', name))
                down_reg = (sig & (d.log2FoldChange < 0)).rename(
                    ('down', name))
                diffs.append(pd.concat((up_reg, down_reg), axis=1))
            else:
                # genelists_mgtype is `down` in recursive function call
                diffs.append(pd.Series(True,
                                       index=d.index,
                                       name=(genelists_mgtype, name)))
        return diffs

    direc, get_down_genelists = check_up_down_genelists()
    files = glob.glob(direc + '/*.tsv')
    inp_t, index_col = check_input_type(files)
    # if the samples are initiated from genelists, down mgs are required
    if type_name == 'samples' and inp_t != 'deseq2' and not get_down_genelists:
        spacer.error('')
        logger.error('When initiateing the samples diff. genes from genelist '
                     'input, both an up- and down directory with respective '
                     'genelists must be passed.')
        sys.exit(1)

    spacer.info('')
    st_st = [os.path.basename(f) for f in (files[0], files[-1])]
    f_type = inp_t if inp_t == 'deseq2' else genelists_mgtype
    logger.info('Formatting differential genes from {} files. {} *.tsv files '
                'in {}:\n{} ... {}\n'.format(f_type, len(files), direc,
                                             *st_st))

    diffs = merge_files(files, inp_t, index_col)
    if inp_t == 'genelist (up)' and get_down_genelists:
        # for genelist data with down mgs, run function recursively with a
        # single directory input, the down directory
        diffs.extend(_format_diff_genes(diff_genes_dir[1], 'down'))
    elif inp_t == 'genelist (down)':
        # inside recursion: exit
        return diffs
    return pd.concat(diffs, axis=1, sort=True).fillna(False).sort_index(axis=1)