Beispiel #1
0
 def merge_files():
     """Read every input file and build one boolean gene selection per file.

     Relies on names from the enclosing scope: `files` (paths to read),
     `inp_t` (input type, `'deseq2'` triggers the DESeq2 handling),
     `index_col` (forwarded to `pd.read_csv`) and `genelists_mgtype` (level 0
     column label used for plain gene lists).

     Returns:
         list: One entry per file. For DESeq2 input a 2-column boolean
             DataFrame labelled `('up', name)` and `('down', name)`; otherwise
             an all-True Series named `(genelists_mgtype, name)` indexed like
             the file's index.
     """
     # ntpath splits on both `\` and `/`, so the element name is the bare file
     # name no matter which separator style the path uses
     import ntpath

     diffs = []
     for file in files:
         name = ntpath.splitext(ntpath.basename(file))[0]
         usecols = (0, 2, 6) if inp_t == 'deseq2' else None
         d = pd.read_csv(file,
                         sep='\t',
                         index_col=index_col,
                         usecols=usecols)
         # `axis` must be passed by keyword; positional use fails on pandas 2
         na_rows = d.isna().any(axis=1)
         if na_rows.any():
             spacer.info('')
             logger.warning(
                 '{} NaN values found and deleted in {}.tsv'.format(
                     na_rows.sum(), name))
         if inp_t == 'deseq2':
             # comparisons against NaN evaluate to False, so rows with a
             # missing padj/log2FoldChange are never selected
             sig = d.padj < config.DESEQ2_P
             up_reg = (sig & (d.log2FoldChange > 0)).rename(('up', name))
             down_reg = (sig & (d.log2FoldChange < 0)).rename(
                 ('down', name))
             diffs.append(pd.concat((up_reg, down_reg), axis=1))
         else:
             # genelists_mgtype is `down` in recursive function call
             s = pd.Series(True,
                           index=d.index,
                           name=(genelists_mgtype, name))
             diffs.append(s)
     return diffs
Beispiel #2
0
    def set_colors(self, colors, log=True):
        """Set the colors for elements in targets/samples

        Args:
            colors (dict:list): A mapping of element name to color or a list
                of colors that is assigned to the elements (in this order)
                until its end. If the list has only one element, all elements
                are set to this color. The color value must be a valid input
                for matplotlib, i.e. 'w', '#ffffff', (1, 1, 1) all refer to
                white. A passed mapping is never modified.
            log (bool, optional): Log the set colors. Defaults to True.

        Note:
            Not contained element names are ignored. If the color is invalid,
            the element color is set to white.

        """
        if log:
            spacer.info('\n\n')
        if isinstance(colors, dict):
            inv = [col_key for col_key in colors if col_key not in self.names]
            if inv:
                logger.warning('Passed color mapping for `{}` contained keys '
                               'other than the element names: {}.\n'
                               .format(self.name, inv))
            # always continue with a filtered copy so that substituting
            # invalid colors below cannot leak into the caller's dict
            colors = {k: colors[k] for k in colors if k in self.names}
        else:
            if len(colors) == 1:
                colors = dict.fromkeys(self.names, colors[0])
            else:
                colors = dict(zip(self.names, colors))

        # decide per element instead of building a set of invalid values:
        # colors may be unhashable (lists) and invalid ones may be falsy
        # (None, '', 0), neither of which a set/any() check handles
        inv_keys = [k for k, c in colors.items()
                    if not mpl_colors.is_color_like(c)]
        if inv_keys:
            # de-duplicate the reported values by their repr
            inv_cs = list({repr(colors[k]): colors[k]
                           for k in inv_keys}.values())
            for k in inv_keys:
                colors[k] = 'w'
            logger.warning('Invalid color values found: {}. Substituted with '
                           'default color: white.\n'.format(inv_cs))
        self._colors.update(colors)
        if log:
            str_cols = ',\n\t'.join([key+': '+str(col)
                                    for key, col in self._colors.items()])
            logger.info('Colors set for `{}`:\n\t{}'.format(self.name,
                                                            str_cols))
Beispiel #3
0
    def get_colors(self, order=None):
        """Return the element colors, arranged according to `order`

        Args:
            order (list, optional): Element names defining the sequence of
                the returned colors. When omitted, the instance's own element
                order (`self.names`) is used.

        Note:
            If no colors were set at all, every element is set to white
            first. Names in `order` that are not element names are logged as
            an error and terminate the program. Element names in `order`
            without a set color are set to white and a warning is logged.

        Return:
            list of colors, one per entry of `order`
        """
        # nothing set so far: fall back to white for all elements
        if not self._colors:
            self.set_colors(['#ffffff'], log=False)
            logger.warning('Colors have not been set for `{}`. All have been '
                           'set to default: white'.format(self.name))

        if order is None:
            order = self.names
        else:
            unknown = [o for o in order if o not in self.names]
            if unknown:
                logger.error('Failed to get colors of `{}`. The passed order '
                             'contains elements other than the element names: '
                             '{}'.format(self.name, unknown))
                sys.exit(1)

            uncolored = [o for o in order if o not in self._colors]
            if uncolored:
                self.set_colors(dict.fromkeys(uncolored, '#ffffff'),
                                log=False)
                logger.warning('Failed to get some colors of `{}`. The passed '
                               'order contains elements without a set color: {}'
                               '. Colors for these were set to default: white.'
                               .format(self.name, uncolored))

        return [self._colors[k] for k in order]
Beispiel #4
0
def _check_args(trg,
                smp,
                metric,
                differential,
                hide_distance_bar=None,
                reorder_to_distance_bar=None,
                distance_bar_range=None,
                cluster_hmx=None,
                display_markergenes=False):
    """General purpose plot argument checker; returns (modified) input values"""
    def check_metric(metric, trg, smp, diff):
        # check if the samples and targets have equivalent data to compare
        if smp._type_name != 'samples':
            logger.error('The passed `samples` are not of type DPre.smaples.')
            sys.exit(1)
        if metric is None:
            if trg._has_expr and smp._has_expr:
                metric = 'cosine'
            elif trg._has_diff and smp._has_diff:
                metric = 'intersect'
            else:
                logger.error('Either initiate targets and samples with '
                             'expression or with markergenes and diff genes.')
                sys.exit(1)
        msg = 'The {} were initiated without {} data. Cannot use `{}` similarity.'
        if metric not in ('euclid', 'cosine', 'pearson', 'intersect'):
            logger.error('Invalid `metric` input: `{}`. Valid are `euclid`, '
                         '`cosine`, `pearson`, and `intersect`'.format(metric))
            sys.exit(1)
        if metric in ['euclid', 'cosine', 'pearson']:
            if not trg._has_expr:
                logger.error(msg.format('targets', 'expression', metric))
                sys.exit(1)
            elif not smp._has_expr:
                logger.error(msg.format('samples', 'expression', metric))
                sys.exit(1)
                if diff and not smp._ctrl:
                    logger.error(
                        'To plot the changes in transcriptional similarity '
                        'with metric = `{}`, the samples must be initiated '
                        'with a control. For absolute, pass differential = '
                        'False.'.format(metric))
                    sys.exit(1)
        elif metric == 'intersect':
            if not trg._has_diff:
                logger.error(msg.format('targets', 'merker gene', metric))
                sys.exit(1)
            elif not smp._has_diff:
                logger.error(msg.format('samples', 'diff genes', metric))
                sys.exit(1)

        return metric

    # checks for all plots
    metric = check_metric(metric, trg, smp, differential)
    if metric == 'intersect' and not differential:
        differential = True
        logger.warning('For the `intersect` similarity metric, '
                       'differential cannot be False. Was set to True.')

    # checks for 2 heatmaps
    if metric != 'intersect' and not hide_distance_bar and not smp._ctrl:
        hide_distance_bar = True
        logger.warning(
            '`hide_distance_bar` must be True '
            'for metric = `{}` if the samples data is '
            'initialized without a control. Set to True.'.format(metric))
    if reorder_to_distance_bar and hide_distance_bar:
        reorder_to_distance_bar = False
        logger.warning('When `reorder_to_distance_bar` is True, '
                       '`hide_distance_bar` cannot be True. Set '
                       'to False.')
    if reorder_to_distance_bar and cluster_hmx:
        cluster_hmx = False
        logger.warning('Both `reorder_to_distance_bar` and '
                       '`cluster_genes` were set as True. '
                       '`cluster_genes` will be ignored.')
    if not differential and distance_bar_range is not None:
        distance_bar_range = None
        logger.warning('The argument `distance_bar_range` is invalid '
                       'and ignored when differential = False. To apply'
                       ' a custom range, please use "heatmap_range".')

    if display_markergenes is not False:
        # checks for target_sim and ranked_sim plots
        val = ['mean', 'up', 'down']
        if display_markergenes not in val:
            logger.warning('Invalid input for display_markergenes: `{}`. '
                           'Valid are {}. Set to default `{}`'.format(
                               display_markergenes, val, val[0]))
            display_markergenes = val[0]

        if display_markergenes == val[2] and not trg._down_mgs:
            logger.error('Cannot display down markergene similarity because'
                         ' the targets were not initiated with down '
                         'markergenes.')
            sys.exit(1)

    return metric, differential, hide_distance_bar, reorder_to_distance_bar, \
           distance_bar_range, cluster_hmx, display_markergenes
Beispiel #5
0
def _format_expr(expr, type_name, ctrl=None):
    """ Take user expression input validate and format

    If a TSV file is passed, read the expresion file as a DataFrame. Check
    if the DataFrame has a valid format. If the control is passed, check if it's
    found in expression. Finally, generate and add the log2- and z-transformed 
    data.

    Args:
        expr: Filename or Dataframe. The data to check.
        type_name: 'targets' or 'samples', depending on caller
        ctrl: Control name, only passed when called from samples
    
    Returns:
        expr: Expression DataFrame with log2- and z-transformed data at column 
            level 1

    """
    if not isinstance(expr, pd.DataFrame):
        if not os.path.exists(expr):
            spacer.info('')
            logger.error('Invalid path: {}\n'.format(os.path.abspath(expr)))
            sys.exit(1)

        expr = pd.read_csv(expr, sep='\t')
        if 'ensg' in expr.columns:
            expr.set_index('ensg', inplace=True)
        else:
            expr.set_index(expr.columns[0], inplace=True)

    met = [
        c for c in ('loc', 'name', 'tss_loc', 'strand') if c in expr.columns
    ]
    if met:
        expr.drop(met, axis=1, inplace=True)
    inv = expr.columns[expr.dtypes == object].tolist()
    if inv:
        spacer.warning('\n')
        logger.warning('Invalid columns of datatype `object` (often text) '
                       'in expression data: {}\nThese columns will be '
                       'removed.'.format(inv))
        expr.drop(inv, axis=1, inplace=True)
    isna = expr.isna()
    if isna.any().any():
        spacer.error('\n')
        logger.error('Invalid expression data: data contains NaN values.')
        sys.exit(1)
    elif ctrl and (ctrl not in expr.columns.unique(0)):
        spacer.error('\n')
        logger.error('The control name of the samples `{}` was not found in '
                     'the passed expression data.'.format(ctrl))
        sys.exit(1)

    if expr.columns.nlevels > 1:
        exp_idx = [(name, dt) for name in expr.columns.unique(0)
                   for dt in ['log2', 'z']]
        idx = expr.columns.values.tolist()
        misma = list(filter(lambda i: i not in exp_idx, idx))
        if any(misma):
            spacer.error('')
            msg = (
                '\tInvalid expresion data. When passing data with log2- and '
                'z-data, the columns must be a MultiIndex in which level 0 '
                'holds the names: [`name1`, ...] and level 1 the data types:'
                ' [`log2`, `z`]. Expected column indices ({}):\n\t\t{}\n\t '
                'Passed, unexpected column indices ({}):\n\t\t{}'.format(
                    len(exp_idx), exp_idx, len(misma), misma))
            logger.error(msg)
            sys.exit(1)
        else:
            return expr
    else:
        return util._add_log2_z(expr)
Beispiel #6
0
def preset_targets(get,
                   sort=False,
                   preset_colors=True,
                   color_legend_filename=True,
                   color_legend_ncols=1):
    """Generate one of the predefined targets instances and return it.

        Pick a reference dataset for comparison. Mouse (Hutchins et al. 2017,
        NAR) and Human (Abugessaisa et al. 2017, FANTOM5 project) are included.
        Specific domains can be picked for both species references. If the
        targets are initiated with 'preset_colors', a color legend is generated
        and saved in the current working directory. Custom presets can be
        created by adding a folder (with an 'm ' or 'h ' prefix) to
        DPre/preset_targets.

    Args:
        get (str): the desired preset. Valid options are 'mouse', 'human',
            'm embryonic', 'm germ cells', 'm neural crest',
            'm surface ectoderm', 'm neuroectoderm', 'm mesoderm', 'm endoderm',
            'm blood mesoderm', 'h surface ectoderm', 'h neuroectoderm',
            'h mesoderm', 'h endoderm', 'h blood mesoderm'; m = mouse, h = human
        sort (bool, optional): Sort the loaded element names alphabetically.
            Defaults to False.
        preset_colors (bool, optional): Tries to initiate the targets with
            preset colors either from colors.tsv in the respective preset
            directory or when not found from config.preset_targets_colors.
            Defaults to True.
        color_legend_filename (bool, str, optional): The filename when a preset
            color legend is drawn from config.preset_col_legend. When True, a
            filename is inferred from the preset name, a str is set as the
            filename. Defaults to True. When None, the color legend is not
            drawn.
        color_legend_ncols (int, optional): Number of columns in the color
            legend. Defaults to 1.

    Returns:
        t: the preset targets instance

    Note:
        An unknown preset, or a preset folder whose species cannot be
        inferred from its name, is logged as an error and ends the program
        via `sys.exit(1)`.
    """
    path = os.path.dirname(__file__)
    # any folder in DPre/preset_targets is potentially valid
    preset_root = os.path.join(path, '..', 'preset_targets')
    valid = os.listdir(preset_root)
    if get not in valid:
        spacer.info('')
        logger.error(
            '`{}` is not a valid preset target. Valid ones are {}'.format(
                get, valid))
        sys.exit(1)

    # os.path.join keeps the path relative when `path` is an empty string;
    # formatting '{}/..' would turn that into a path below the root directory
    get_dir = os.path.join(preset_root, get)

    def in_preset(filename):
        # absolute or relative location of `filename` in the preset folder
        return os.path.join(get_dir, filename)

    # try to get .gzip markergene and expression input, if not found try .tsv
    # NOTE: pickles are only read from the package's own preset_targets
    # directory; never point this at files from an untrusted source
    expr = mgs = None
    if os.path.exists(in_preset('markergenes.gzip')):
        mgs = pd.read_pickle(in_preset('markergenes.gzip'))
    elif os.path.exists(in_preset('markergenes.tsv')):
        mgs = pd.read_csv(in_preset('markergenes.tsv'),
                          sep='\t',
                          index_col=0,
                          header=[0, 1])
        # cache as pickle for the next call
        mgs.to_pickle(in_preset('markergenes.gzip'))

    if os.path.exists(in_preset('expression.gzip')):
        expr = pd.read_pickle(in_preset('expression.gzip'))
    elif os.path.exists(in_preset('expression.tsv')):
        expr = pd.read_csv(in_preset('expression.tsv'),
                           sep='\t',
                           index_col=0,
                           header=[0, 1])
        expr.to_pickle(in_preset('expression.gzip'))
    elif os.path.exists(in_preset('expression_h1.gzip')):
        # expression stored in two halves: join them and cache the result
        expr1 = pd.read_pickle(in_preset('expression_h1.gzip'))
        expr2 = pd.read_pickle(in_preset('expression_h2.gzip'))
        expr = pd.concat([expr1, expr2], axis=1)
        expr.to_pickle(in_preset('expression.gzip'))

    if sort:
        # either input may be absent for a preset; only sort what was loaded
        if mgs is not None:
            mgs.sort_index(axis=1, inplace=True)
        if expr is not None:
            expr.sort_index(axis=1, inplace=True)

    # explicit part of the script that might need adjustment for custom presets
    if get == 'human':
        args = {'name': 'human FANTOM5 library', 'species': 'human'}
    elif get == 'mouse':
        args = {'name': 'mouse lineages', 'species': 'mouse'}
    elif get.startswith('h '):
        args = {'name': get[2:] + ' lineage', 'species': 'human'}
    elif get.startswith('m '):
        args = {'name': get[2:] + ' lineage', 'species': 'mouse'}
    else:
        # without this branch `args` would be unbound below
        spacer.info('')
        logger.error(
            'Cannot infer the species of preset `{}`. Custom preset folders '
            'must be named with an `m ` or `h ` prefix.'.format(get))
        sys.exit(1)
    # init targets
    from DPre.main.targets import targets
    t = targets(markergenes=mgs, expression=expr, log=False, **args)
    logger.info(
        'Default targets `{}` created, name: `{}`, elements: {}'.format(
            get, t.name, len(t)))

    # try to get colors first through a file, then through config
    if preset_colors:
        try:
            df_colors = pd.read_csv(in_preset('colors.tsv'),
                                    sep='\t',
                                    index_col=0)
        except FileNotFoundError:
            if get in config.preset_targets_colors:
                t.set_colors([config.preset_targets_colors[get]], log=False)
            else:
                logger.warning(
                    'No colors found for preset targets {}'.format(get))
        else:
            t.set_colors(dict(zip(df_colors.index, df_colors.color)),
                         log=False)
        # draw a colorlegend if defined in config
        if get in config.preset_col_legend and color_legend_filename:
            if color_legend_filename is True:
                filename = get + '_color_legend'
            else:
                filename = color_legend_filename
            util.plot_color_legend(*config.preset_col_legend[get],
                                   ncolumns=color_legend_ncols,
                                   filename=filename)
    return t
Beispiel #7
0
    def _is_expr_diff_compatible(self, override_namematcher=False):
        """Check if expression and differential input share element labels

        Compares the element names (`self.names`) with the last column level
        of `self._diff`. For samples with a control that have exactly one
        differential element less than expression elements, empty `up` and
        `down` columns for the control are added to `self._diff` first. When
        the names agree, or `override_namematcher` is set, the columns of
        `self._diff` are relabelled with the expression names. Every other
        mismatch is logged as an error and ends the program via
        `sys.exit(1)`.

        Args:
            override_namematcher: Whether a misalignment between expression-
                and gene list names should be ignored. Useful when names
                refer to the same data but labels differ partially.
        """
        expr_ns = self.names
        diff_ns = self._diff.columns.unique(-1).tolist()
        diff_n = 'markergenes' if self._type_name == 'targets' else 'diff. genes'
        spacer.info('\n\n')
        if len(expr_ns) != len(diff_ns):
            one_short = len(expr_ns) == len(diff_ns) + 1
            if not (one_short and self._type_name == 'samples'):
                logger.error('Passed expression ({}) and {} ({}) do not have '
                             'the same number of elements. Check input.'
                             .format(len(expr_ns), diff_n, len(diff_ns)))
                sys.exit(1)

            msg = ('{} ({}) has one element less than expression ({}). '
                   .format(diff_n, len(diff_ns), len(expr_ns)))
            if not self._ctrl:
                msg += ('If the expression data has a control that is '
                        'missing in diff. genes, you can resolve this by '
                        'passing the control name for samples initiation.')
                # the program stops here, so report at error level
                logger.error(msg)
                sys.exit(1)

            # assume it is the control missing
            msg += ('An empty element `{}` (control) will be added to '
                    'diff. genes.'.format(self._ctrl))
            logger.info(msg)

            # add control to match with expression element names
            self._diff[('up', self._ctrl)] = False
            self._diff[('down', self._ctrl)] = False
            self._diff.sort_index(axis=1, inplace=True)
            diff_ns = self._diff.columns.unique(-1).tolist()

        align = [e_n == d_n for e_n, d_n in zip(expr_ns, diff_ns)]
        df = pd.DataFrame({'expression names': expr_ns,
                           diff_n + ' names': diff_ns,
                           'match': align})
        names_match = all(align)
        if not (names_match or override_namematcher):
            spacer.error('')
            logger.error(('{0}- and expression element names do '
                          'not match:\n{1}\nRename elements in expression '
                          '/{0} input files or override the '
                          '{0} names by setting `override_namematcher` '
                          'to True.'.format(diff_n, df.to_string())))
            sys.exit(1)

        spacer.info('')
        lvls = self._diff.columns.unique(0), expr_ns
        self._diff.columns = pd.MultiIndex.from_product(lvls)
        # warn only when labels really were replaced by differing names; with
        # fully matching names there is nothing to caution about
        if not names_match:
            msg = diff_n + ' names have been overriden by expression names. '
            logger.warning('{}CAUTION: {}- and expression names '
                           'do not match for all elements:\n{}\nMake sure '
                           'data aligns to avaid mislabeling!'
                           .format(msg, diff_n, df.to_string()))