def _nat_order_labels(self, mat, labels):
    """Reorder ``mat`` and ``labels`` by the natural sort order of ``labels``.

    The permutation is computed once from ``labels`` with
    ``index_natsorted`` and applied to both inputs.

    Returns:
        A tuple ``(ordered_mat, ordered_labels)``, both as numpy arrays.
    """
    order = index_natsorted(labels)
    return (
        np.array(order_by_index(mat, order)),
        np.array(order_by_index(labels, order)),
    )
Beispiel #2
0
def test_order_by_index():
    """Check ``order_by_index`` in both its list and iterator modes."""
    items = ['num3', 'num5', 'num2']
    positions = [2, 0, 1]
    expected = [items[i] for i in positions]

    # Default mode returns a list with the items picked in index order.
    assert order_by_index(items, positions) == ['num2', 'num3', 'num5']
    assert order_by_index(items, positions) == expected

    # With the third argument set to True the result does not compare equal
    # to a list until it has been materialised.
    assert order_by_index(items, positions, True) != expected
    assert list(order_by_index(items, positions, True)) == expected
Beispiel #3
0
    def natsort_contacts(self):
        """
        Reorder the data matrix and channel labels into natural order.

        Example of the intended label ordering:

            A1,A2, ..., B1, B2, ..., Z1, Z2, ..., A'1, A'2, ...

        A copy of the current labels is kept in ``self.buffchanlabels``
        before anything is reordered. The ordering indices come from
        ``self.contacts.natsort_contacts()``.
        """
        # Keep the pre-sort labels around.
        self.buffchanlabels = self.chanlabels.copy()

        order = self.contacts.natsort_contacts()
        self.mat = np.array(order_by_index(self.mat, order))

        sorted_labels = order_by_index(self.chanlabels, order)
        self.metadata["chanlabels"] = np.array(sorted_labels)
Beispiel #4
0
 def prepare(df):
     """Return ``df`` with columns sorted by label and rows in natural order.

     Each row of ``df.to_numpy()`` is wrapped in a 1-tuple by ``zip`` and
     used as that row's natural-sort key. The original index labels are
     kept; they are only reordered.
     """
     by_column = df.sort_index(axis=1)
     row_order = index_natsorted(zip(by_column.to_numpy()))
     return by_column.reindex(
         index=order_by_index(by_column.index, row_order))
Beispiel #5
0
def get_data(label, max_items):
    """Build a random demo table plus its bokeh data source and widget.

    The frame has an ID column named ``'<label>_pk'`` holding
    ``'<label>_1' .. '<label>_<max_items>'`` and three columns ``x1``,
    ``x2``, ``x3`` of random integers. One extra row made of
    ``NA_ID`` / ``NA_TEXT`` is inserted first. Rows are naturally sorted by
    the ID column before the bokeh objects are created.

    Returns:
        A tuple ``(df, ds, dt)``: the DataFrame, the ``ColumnDataSource``
        built from it and the ``DataTable`` showing it.
    """
    value_names = ('x1', 'x2', 'x3')

    # IDs first, then one random column per value name (x1, x2, x3 order).
    ids = ['%s_%d' % (label, n) for n in range(1, max_items + 1)]
    values = {}
    for name in value_names:
        values[name] = np.random.randint(
            low=1, high=100, size=max_items).tolist()

    # The NA placeholder row goes in front of the generated rows.
    ids.insert(0, NA_ID)
    for name in value_names:
        values[name].insert(0, NA_TEXT)

    colname = '%s_pk' % label
    data = {colname: ids}
    data.update(values)
    df = pd.DataFrame(data)

    # https://stackoverflow.com/questions/29580978/naturally-sorting-pandas-dataframe
    natural_order = index_natsorted(df[colname])
    df = df.reindex(index=order_by_index(df.index, natural_order))

    # One table column per frame column; only the ID column gets a custom title.
    columns = [TableColumn(field=colname, title='ID')]
    for name in value_names:
        columns.append(TableColumn(field=name, title=name))

    ds = ColumnDataSource(df)
    dt = DataTable(source=ds, columns=columns, width=300, height=300)
    return df, ds, dt
Beispiel #6
0
    def natsort_contacts(self):
        """
        Reorder the data matrix and channel labels into natural order.

        Intended label ordering:

        A1,A2, ..., B1, B2, ..., Z1, Z2, ..., A'1, A'2, ...

        The pre-sort labels are copied to ``self.buffchanlabels``; the
        ordering indices are obtained from
        ``self.contacts.natsort_contacts()``.

        :return: None
        """
        print("Trying to sort naturally contacts in result object")

        # Keep the pre-sort labels around.
        self.buffchanlabels = self.chanlabels.copy()

        order = self.contacts.natsort_contacts()
        self.mat = np.array(order_by_index(self.mat, order))

        sorted_labels = order_by_index(self.chanlabels, order)
        self.metadata['chanlabels'] = np.array(sorted_labels)
 def prepare(self, df):
     """Normalise ``df`` so that it can be compared with another frame.

     Columns are sorted by label, rows are put in natural order (each row
     of ``df.to_numpy()``, wrapped in a 1-tuple by ``zip``, is the sort
     key), and finally index level 0 is dropped and replaced.
     """
     by_column = df.sort_index(axis=1)
     row_order = index_natsorted(zip(by_column.to_numpy()))
     ordered = by_column.reindex(
         index=order_by_index(by_column.index, row_order))
     # Discard the old labels so only row position matters afterwards.
     return ordered.reset_index(level=0, drop=True)
def natural_sort(df, by='id', index=False):
    '''
    Return ``df`` with its rows in natural sort order.

    When ``index`` is true the index labels themselves are the sort key;
    otherwise the values of column ``by`` are.
    '''
    if index:
        new_index = natsorted(df.index)
    else:
        new_index = order_by_index(df.index, index_natsorted(df[by]))
    return df.reindex(index=new_index)
Beispiel #9
0
def bar_plot_mmgbsa_results(excel_file, sort=True, titles=None):
    """
    Load data from an Excel file with the summary of the MMGBSA results, in a sheet which has to be called "MMGBSA".
    Create a plot for each ligand that is found under the 'Ligand' column in the table.

    The table is naturally sorted by its 'Run' column first. Each plot is a
    bar chart of 'MMGBSA (mean)' per run with 'MMGBSA (Std)' error bars,
    plus a solid line at the mean of the per-run means and dashed lines one
    standard deviation above and below it. The ligand name and those two
    numbers are also printed.

    :param excel_file: str, Name of the Excel file with the data
    :param sort: bool, Whether to sort the plot by increasing MMGBSA values
    :param titles: list, Name for each of the plots (as many as there are ligands in the table)
    :return f_list: list, A list of matplotlib figures
    :raises ValueError: if ``titles`` is given and its length differs from
        the number of ligands
    """
    # `sheet_name` is the keyword pandas accepts; the older `sheetname`
    # spelling is rejected by current pandas releases.
    df = pd.read_excel(excel_file, sheet_name="MMGBSA")
    df = df.reindex(index=order_by_index(df.index, index_natsorted(df.Run)))
    ligands = df.Ligand.unique()
    f_list = []
    if titles is None:
        titles = [None for _ in ligands]
    elif len(titles) != len(ligands):
        raise ValueError('len of ligands and titles is not equal.')
    for lig, title in zip(ligands, titles):
        lig_df = df[df.Ligand == lig]
        if sort:
            # Rebind instead of sorting in place: lig_df is a filtered
            # selection of df, and in-place mutation of it triggers pandas'
            # chained-assignment warning.
            lig_df = lig_df.sort_values(by='MMGBSA (mean)')
        ax = lig_df.plot(x="Run", y="MMGBSA (mean)", yerr='MMGBSA (Std)', kind='bar',
                         legend=False, figsize=figure_dims(1400), title=title)
        overall_mean = lig_df['MMGBSA (mean)'].mean()
        overall_std = lig_df['MMGBSA (mean)'].std()
        print("{} {:02f} {:02f}".format(lig, overall_mean, overall_std))
        xmin, xmax = ax.get_xlim()
        # Mean line
        ax.plot(
            [xmin, xmax], [overall_mean, overall_mean],
            linewidth=1.5,
            color='blue'
        )
        # Upper std bar
        ax.plot(
            [xmin, xmax],
            [overall_mean + overall_std, overall_mean + overall_std],
            linestyle='dashed',
            linewidth=1,
            color='blue'
        )
        # Lower std bar
        ax.plot(
            [xmin, xmax],
            [overall_mean - overall_std, overall_mean - overall_std],
            linestyle='dashed',
            linewidth=1,
            color='blue'
        )
        ax.set_ylim(top=0)
        ax.set_ylabel(ylabel=r'$\Delta$G binding (kcal/mol)', size=14)
        ax.set_xlabel(xlabel='Run', size=14)
        # Collect the current figure (the one the plot above drew into).
        f = pp.gcf()
        f.tight_layout()
        f_list.append(f)
    return f_list
Beispiel #10
0
    def sort(self, column, order):
        """Naturally sort the model's DataFrame by one column.

        Args:
            column: positional index of the column to sort by.
            order: ``0`` sorts in natural order; any other value reverses
                that order.

        ``layoutAboutToBeChanged`` is emitted before and ``layoutChanged``
        after the frame has been replaced. The row index is reset to
        ``0..n-1`` after sorting.
        """
        self.layoutAboutToBeChanged.emit()

        frame = self._dataframe
        # Select the column by position rather than building an attribute
        # access string for eval(): that broke for column names that are not
        # valid identifiers or that shadow DataFrame attributes, and executed
        # data-derived text as code.
        sort_index = index_natsorted(frame.iloc[:, column])
        if order != 0:
            sort_index = reversed(sort_index)

        self._dataframe = frame.reindex(
            index=order_by_index(frame.index, sort_index))
        self._dataframe.reset_index(inplace=True, drop=True)
        self.setDataFrame(self._dataframe)
        self.layoutChanged.emit()
Beispiel #11
0
def avgPrefs(prefsfiles):
    """Average site-specific preferences over several files.

    Args:
        `prefsfiles` (list)
            List of CSV files containing preferences, must all be
            for same sites and characters. Each file needs a `site`
            column.

    Returns:
        A `pandas.DataFrame` holding, per site, the mean of the
        preferences across `prefsfiles`. `site` is a regular column of
        the returned frame and the rows are naturally sorted by it.

    >>> tf1 = tempfile.NamedTemporaryFile
    >>> tf2 = tempfile.NamedTemporaryFile
    >>> with tf1(mode='w') as file1, tf2(mode='w') as file2:
    ...     x = file1.write('site,A,C,G,T\\n'
    ...                 '10,0.2,0.2,0.5,0.1\\n'
    ...                 '2a,0.3,0.3,0.3,0.1')
    ...     file1.flush()
    ...     x = file2.write('site,A,C,G,T\\n'
    ...                 '10,0.4,0.1,0.1,0.4\\n'
    ...                 '2a,0.3,0.4,0.1,0.2')
    ...     file2.flush()
    ...     avg = avgPrefs([file1.name, file2.name])
    >>> (avg['site'] == ['2a', '10']).all()
    True
    >>> numpy.allclose(avg['A'], [0.3, 0.3])
    True
    >>> numpy.allclose(avg['C'], [0.35, 0.15])
    True
    >>> numpy.allclose(avg['G'], [0.2, 0.3])
    True
    >>> numpy.allclose(avg['T'], [0.15, 0.25])
    True
    """
    assert len(prefsfiles) >= 1

    frames = []
    for fname in prefsfiles:
        frames.append(pandas.read_csv(fname, index_col='site').sort_index())

    # Every file must carry the same characters; align them all to the
    # column order of the first file.
    ref_cols = frames[0].columns
    aligned = []
    for frame in frames:
        assert set(ref_cols) == set(frame.columns)
        aligned.append(frame[ref_cols])

    stacked = pandas.concat(aligned)
    avgprefs = stacked.groupby('site').mean().reset_index()

    # natural sort by site: https://stackoverflow.com/a/29582718
    site_order = natsort.index_natsorted(avgprefs['site'], signed=True)
    return avgprefs.reindex(
        index=natsort.order_by_index(avgprefs.index, site_order))
Beispiel #12
0
def main():
    """Main function for pyim-annotate.

    Reads insertions from the tab-separated file given on the command
    line, annotates them with the annotator selected by the parsed
    arguments, and writes the result, naturally sorted by its ``id``
    column, as a tab-separated file without the index.
    """
    args = parse_args()

    insertions = Insertion.from_csv(args.insertions, sep='\t')
    annotator = args.caller.from_args(args)

    # Materialise the annotations before converting them into a frame.
    annotated = list(annotator.annotate(insertions))
    frame = Insertion.to_frame(annotated)

    id_order = index_natsorted(frame.id)
    frame = frame.reindex(index=order_by_index(frame.index, id_order))

    frame.to_csv(str(args.output), sep='\t', index=False)
Beispiel #13
0
    def checkdf(self, p, x):
        """
        Merge a parameter table and its result sheets into one frame.

        One row is built per key of ``x``. For each key (``label``):

        * ``standard_hue`` is the ``standard`` value of the row of ``p``
          whose ``label`` column equals the key,
        * ``ntrial`` is the length of the sheet ``x[label]``,
        * ``all_intensities``, ``all_responses`` and ``reversal value`` are
          the sheet's ``All Intensities``, ``All Responses`` and
          ``Reversal Intensities`` columns as numpy arrays.

        :param p: table with at least ``label`` and ``standard`` columns
        :param x: mapping from label to result sheet
        :return: merged dataframes, rows naturally sorted by label
        """
        labels = x.keys()
        colnames = ['standard_hue', 'ntrial', 'all_intensities', 'all_responses', 'reversal value']
        df = pd.DataFrame(columns=colnames, index=labels)

        for label in labels:
            sheet = x[label]
            df.loc[label, colnames[0]] = float(p[p['label'] == label]['standard'])
            df.loc[label, colnames[1]] = len(sheet)
            df.loc[label, colnames[2]] = np.array((sheet['All Intensities']))
            df.loc[label, colnames[3]] = np.array((sheet['All Responses']))
            df.loc[label, colnames[4]] = np.array((sheet['Reversal Intensities']))

        # Sort once after all rows are filled. Rows are assigned by label, so
        # re-sorting inside the loop gave the same result at a higher cost.
        df = df.reindex(index=order_by_index(df.index, index_natsorted(df.index, reverse=False)))
        return df
Beispiel #14
0
    def sumxrl(self):
        """Combine all result files and summarise them per label.

        ``self.readxrl()`` supplies the parameter tables, the result sheets
        and a count. Each (parameters, sheets) pair is turned into a frame
        by ``self.checkdf`` and the frames are stacked.

        The summary groups the stacked frame by its index and reports the
        unique ``standard_hue`` values, the ``ntrial`` sum divided by the
        count from ``readxrl``, and ``self.meanvalue`` / ``self.stdvalue``
        of ``reversal value`` and ``all_responses``. Its rows are naturally
        sorted by label.

        :return: tuple ``(dfs, summary)``
        """
        params, sheets, n_files = self.readxrl()

        per_file = [self.checkdf(p, x) for p, x in zip(params, sheets)]
        dfs = pd.concat(per_file, axis=0)

        aggregations = {
            'standard_hue': 'unique',
            'ntrial': lambda x: sum(x) / n_files,
            'reversal value': [self.meanvalue, self.stdvalue],
            'all_responses': [self.meanvalue, self.stdvalue],
        }
        summary = dfs.groupby(level=0).agg(aggregations)

        label_order = index_natsorted(summary.index, reverse=False)
        summary = summary.reindex(
            index=order_by_index(summary.index, label_order))

        return dfs, summary
Beispiel #15
0
def _get_stranded_f(self, half_entries, f, sort=False):
    """Collect up to ``half_entries`` rows from one end of stranded data.

    ``f`` names the DataFrame method applied to each piece (the code
    special-cases ``"tail"`` by walking the chromosomes in reverse order).
    For every chromosome the "+" and "-" frames are looked up in
    ``self.dfs`` (an empty frame with ``self.columns`` stands in for a
    missing one), trimmed with ``f(half_entries)`` and concatenated.
    Chromosomes are consumed until at least ``half_entries`` rows have been
    gathered; the combined frame is trimmed once more, re-indexed from zero
    and naturally sorted by chromosome.

    When ``sort`` is true every piece is sorted by ``sort_cols`` before it
    is trimmed, and again after the two strands are joined.
    """
    chroms = self.chromosomes
    if f == "tail":
        chroms = reversed(chroms)

    empty = pd.DataFrame(columns=self.columns)

    collected = []
    n_rows = 0
    for chrom in chroms:
        per_strand = []
        for strand in ("+", "-"):
            part = self.dfs.get((chrom, strand), empty)
            if sort:
                part = part.sort_values(sort_cols)
            per_strand.append(getattr(part, f)(half_entries))

        merged = pd.concat(per_strand)
        if sort:
            merged = merged.sort_values(sort_cols)

        collected.append(merged)
        n_rows += len(merged)
        if n_rows >= half_entries:
            break

    # Both strands were trimmed separately, so up to twice the requested
    # number of rows may be present: trim again.
    result = getattr(pd.concat(collected), f)(half_entries)
    result = result.reset_index(drop=True)

    chrom_order = natsort.index_natsorted(zip(result.Chromosome))
    return result.reindex(
        index=natsort.order_by_index(result.index, chrom_order))
Beispiel #16
0
def main(options):
    """Merge per-chromosome bigWig files into one bigWig file.

    For every path in ``options.inbw`` the file is opened and the first
    chromosome listed in its header is taken as that file's chromosome.
    The files are then processed in natural order of those chromosome
    names: the output header is assembled from the (name, length) pairs and
    the intervals of each file's chromosome are copied to
    ``options.outfname``.

    Returns:
        ``True`` once the output file has been written and closed.
    """
    fnames = options.inbw
    bw = {}
    header = {}
    chroms = {}
    # open all files and read all headers
    for fname in fnames:
        print("opening file for input: " + os.path.split(fname)[1])
        bw[fname] = pyBigWig.open(fname)
        header[fname] = bw[fname].chroms()
        # only the first chromosome of each input file is used
        chroms[fname] = list(header[fname].keys())[0]

    # Define the order based on the chromosome names extracted from the
    # headers. Passing the dict itself would iterate its keys, i.e. sort by
    # file name instead of chromosome name.
    idx = natsort.index_natsorted([chroms[f] for f in fnames])
    fnames = natsort.order_by_index(fnames, idx)

    # open bigwig for output
    print("opening bigwig file for output (%s)" % options.outfname)
    out_bw = pyBigWig.open(options.outfname, "w")
    assert (out_bw is not None)
    # construct sorted header and write to output
    header = [list(header[f].items())[0] for f in fnames]
    print(str(header))
    out_bw.addHeader(header)

    # loop over sorted chromosome-names/file-names:
    # import data from the input bw and add it to the output bw
    for fname in fnames:
        print("exporting data from chrom; " + chroms[fname] + " (file: " +
              fname + ")")
        ints = bw[fname].intervals(chroms[fname])
        chrs = [chroms[fname]] * len(ints)
        out_bw.addEntries(chrs, [i[0] for i in ints],
                          ends=[i[1] for i in ints],
                          values=[i[2] for i in ints])

    print("closing bigwig file for output (%s)" % options.outfname)
    out_bw.close()

    # release the input handles as well
    for handle in bw.values():
        handle.close()

    return True
Beispiel #17
0
    def natsort_contacts(self) -> Tuple:
        """
        Naturally sort the contacts.

        On the first call the natural-sort indices of ``self.chanlabels``
        are stored in ``self.naturalinds`` and ``self.chanlabels`` is
        replaced by the reordered labels (as a numpy array). On later calls
        nothing is reordered; a warning is issued instead.

        Returns
        -------
        naturalinds
            The indices stored in ``self.naturalinds``.
        """
        # Identity check: ``== None`` would be evaluated element-wise if the
        # stored indices were ever an array.
        if self.naturalinds is None:
            self.naturalinds = index_natsorted(self.chanlabels)
            self.chanlabels = np.array(
                order_by_index(self.chanlabels, self.naturalinds))
        else:
            warnings.warn(
                "Already naturally sorted contacts! Extract channel labels naturally sorted by calling "
                "chanlabels, and apply ordering to other channel level data with naturalinds."
            )
        return self.naturalinds
def convert_clusters(clusters, cluster_size_min, cluster_size_max, out_path, normalise):
    '''Write the DPM read pairs of size-filtered clusters to a file.

    Every cluster whose 'DPM' size lies within the given bounds is expanded
    with ``cluster2sfws``; the resulting rows are naturally sorted by
    (chr1, chr2, pos1, pos2) and written space-separated, without header or
    index.

    Args:
        clusters(Object): Clusters object
        cluster_size_min(int): minimum size of cluster (more than or equal to int)
        cluster_size_max(int): maximum size of cluster (less than or equal to int)
        out_path(str): Out file path
        normalise: forwarded to ``cluster2sfws``
    '''
    assert cluster_size_min > 1, 'Minimum cluster size needs to be > 1'

    rows = []
    for _, cluster in clusters.get_items():
        n_reads = cluster.size('DPM')
        if cluster_size_min <= n_reads <= cluster_size_max:
            rows.extend(cluster2sfws(cluster, 'DPM', normalise))

    header = ['str1', 'chr1', 'pos1', 'frag1',
              'str2', 'chr2', 'pos2', 'frag2', 'score']
    df = pd.DataFrame(rows, columns=header)

    sort_keys = zip(df.chr1, df.chr2, df.pos1, df.pos2)
    row_order = index_natsorted(sort_keys)
    df_out = df.reindex(index=order_by_index(df.index, row_order))
    df_out.to_csv(out_path, sep=' ', index=False, header=False)
Beispiel #19
0
    def write_click_csv(self, n_frames):
        """Write the recorded mouse clicks to ``self.out_csv``.

        The clicks in ``self.mouse_clicks`` are naturally sorted by their
        ``frame`` value, given a constant ``time`` of 0 and ``visible`` of
        1, and written as ``;``-separated text with the columns ``frame``,
        ``time``, ``visible``, ``x``, ``y``. The file starts with a header
        holding the video width and height, taken from the shape of
        ``self.frames[0]``.

        Args:
            n_frames: not used by this method.
        """
        out = pd.DataFrame(self.mouse_clicks)
        out = out.reindex(index=order_by_index(
            out.index, index_natsorted(out['frame'], reverse=False)))

        out = out.assign(time=[0]*out.shape[0])
        out = out.assign(visible=[1]*out.shape[0])

        # fix the order of the columns
        cols = ['frame', 'time', 'visible', 'x', 'y']
        out = out[cols]

        str_ = out.to_csv(sep=';', index=False)
        h, w = self.frames[0].shape[:2]
        header = 'VideoWidth:{}\nVideoHeight:{}\nDisplayWidth:0\nDisplayHeight:0\n'.format(w, h)
        str_ = header + str_

        # The context manager closes the file even if the write fails.
        with open(self.out_csv, 'w') as text_file:
            text_file.write(str_)
        print('written {}'.format(self.out_csv))
Beispiel #20
0
def list_of_names(fname='SavedData/searchResult.php'):
    """Extract the unique contents of ``<i>`` tags nested in table cells.

    The file ``fname`` is parsed with BeautifulSoup (lxml). For every
    ``<i>`` element, its contents are collected once per ``<td>`` ancestor
    it has. Duplicates are then removed (first occurrence wins) and the
    result is returned as a one-column DataFrame (``name``), naturally
    sorted and with a fresh ``0..n-1`` index.
    """
    # Close the file as soon as it has been parsed.
    with open(fname) as handle:
        bs = BeautifulSoup(handle, features='lxml')

    names = []
    pbar = ProgressBar()
    for species in pbar(bs.findAll('i')):
        for parents in species.parents:
            if parents.name == 'td':
                names.extend(species.contents)

    # Drop duplicates while keeping first-seen order. Equality-based
    # membership is kept on purpose: the collected items are parser objects.
    ulist = []
    for x in names:
        if x not in ulist:
            ulist.append(x)

    df = pd.DataFrame()  # create dataframe
    df['name'] = ulist

    df = df.reindex(index=order_by_index(
        df.index, index_natsorted(df['name'],
                                  reverse=False)))  # natural sort by name
    df = df.reset_index(drop=True)  # fix index
    return df
def cluster2sfws(cluster, read_type, normalise=True):
    '''Convert a cluster class object (a single cluster)
    to a list of rows in the sfws format (Juicer tools Pre format with score)

    Note:
        Juicer short format with score (sfws)
        A whitespace separated file that contains, on each line
            <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> <score>
        https://github.com/aidenlab/juicer/wiki/Pre

        IMPORTANT NOTE pre throws away reads that map to the same restriction fragment.
        If you use dummy numbers for the frag field, be sure they are different for the
        different read ends; that is, <frag1> should be 0 and <frag2> should be 1.

        str = strand (0 for forward, anything else for reverse)

    Args:
        cluster(Cluster): A single Cluster object that holds all the position (reads)
        read_type(str): RPM or DPM
        normalise(bool): if true every pair is scored ``2.0 / n`` where
            ``n`` is the number of reads of ``read_type`` in the cluster,
            otherwise every pair is scored ``1``

    Returns:
        list: one ``[str1, chr1, pos1, 0, str2, chr2, pos2, 1, score]`` row
        per unordered pair of matching reads; empty when the cluster holds
        fewer than two reads of ``read_type``.
    '''

    cluster_pos = [
        ('0' if position._strand == '+' else '1',
         position._chromosome,
         position._start_coordinate)
        for position in cluster
        if position._type == read_type
    ]

    # Fewer than two reads means there is no pair to report. Returning here
    # also avoids dividing by zero when no read matches read_type.
    if len(cluster_pos) < 2:
        return []

    score = 2.0 / len(cluster_pos) if normalise else 1

    out = []
    for a, b in combinations(cluster_pos, 2):
        # put the pair in natural order of their chromosome names
        a, b = order_by_index([a, b], index_natsorted([a[1], b[1]]))
        out.append([*a, 0, *b, 1, score])
    return out
Beispiel #22
0
def tidyToWide(tidy_df, valuecol):
    """Converts tidy `diffsel` data frame to wide form.

    The `diffsel` data frames returned by ``dms2_diffsel`` (and
    also other dataframes, such as the `fracsurvive` ones
    from ``dms_fracsurvive``) are in tidy form. This function
    converts them to wide form.

    The data frame passed in is left unmodified.

    Args:
        `tidy_df` (pandas DataFrame)
            Data frame in tidy form. Should have columns named
            `site`, `wildtype`, `mutation`, and something
            with the name matching `valuecol`.
        `valuecol` (string)
            Name of value column in `df`, such `diffsel` or
            `fracsurvive`.

    Returns:
        Wide form dataframe. Will have columns `site` (as string),
        `wildtype`, and all characters (e.g., amino acids)
        for which values are given. Natural sorted by `site`.
        Missing values are filled with 0.

    >>> tidy_df = pandas.read_csv(io.StringIO(
    ...     '''site wildtype mutation diffsel
    ...           3        A        D    -1.5
    ...           3        A        C    10.1
    ...           2        A        C    10.1
    ...           1        C        D     9.5
    ...           1        C        A     0.2
    ...           2        A        D    -1.5'''),
    ...     delim_whitespace=True, index_col=False)
    >>> wide_df = tidyToWide(tidy_df, valuecol='diffsel')
    >>> print(wide_df.to_string(float_format=lambda x: '{0:.1f}'.format(x)))
      site   A    C    D wildtype
    0    1 0.2  0.0  9.5        C
    1    2 0.0 10.1 -1.5        A
    2    3 0.0 10.1 -1.5        A
    """
    assert isinstance(tidy_df, pandas.DataFrame)
    cols = ['site', 'wildtype', 'mutation', valuecol]
    assert set(cols) == set(tidy_df.columns), ('expected columns '
            '{0}\nactual columns {1}'.format(cols, tidy_df.columns))

    # Make site a string. `assign` returns a new frame, so the caller's
    # data frame keeps its original `site` column.
    tidy_df = tidy_df.assign(site=tidy_df['site'].astype(str))

    # sort on site as here: https://stackoverflow.com/a/29582718
    tidy_df = tidy_df.reindex(index=natsort.order_by_index(tidy_df.index,
            natsort.index_natsorted(tidy_df.site, signed=True)))

    # convert to wide form, keeping wildtype identities
    tidy_df = tidy_df.set_index('site', drop=True)
    wt = tidy_df['wildtype']
    wide_df = (tidy_df.pivot(columns='mutation', values=valuecol)
                      .fillna(0.0)
                      .join(wt)
                      .reset_index()
                      )
    # joining the per-row wildtype repeats each site once per mutation row
    wide_df = wide_df.drop_duplicates().reset_index(drop=True)

    return wide_df
Beispiel #23
0
    plt.ylabel('')

    plt.legend(fontsize=50)
    plt.savefig('../outputs/{}'.format(outputname), 
            bbox_inches="tight", 
            dpi=300,
            format='png')

# Count rows per (label, source) pair; only the count of the 'key' column is kept.
counts = df.groupby(['label','source']).count()[['key']].reset_index()
# Replace the source codes by display names: 'kg' -> 'Kaggle', anything else -> 'StackOverflow'.
counts.source = counts.source.apply(lambda x: 'Kaggle' if x=='kg'
                                    else 'StackOverflow')
# NOTE(review): this unpacking assumes exactly two source groups, with the
# Kaggle one first in groupby order -- confirm against the data.
total_kg, total_so = df.groupby('source').count().topic.tolist()
# Keep the raw counts, then overwrite 'key' with the share of each source's total.
counts['counts'] = counts.key
counts['key'] = counts.apply(lambda x: x.key/total_kg if x.source=='Kaggle' 
              else x.key/total_so, axis=1)
# Put the rows in natural order of their label before plotting.
counts = counts.reindex(index=order_by_index(counts.index, index_natsorted(counts.label)))
hist(counts)

#%%#######################################################################
#                           Topic Relationships                          #
##########################################################################
# Not referenced in the lines visible here; presumably used further down.
str_contains = 'kg'

# One row per (key, topics entry): explode the 'topics' column.
gephi = df[['key','topics']]
gephi = gephi.explode('topics')
# Each exploded entry is indexed as [0] -> topic and [1] -> value
# (assumes entries are pairs -- TODO confirm).
gephi['topic'] = gephi.topics.apply(lambda x: x[0])
gephi['value'] = gephi.topics.apply(lambda x: x[1])
gephi = gephi.drop(columns=['topics'])
# Keep only entries whose value exceeds 0.15.
gephi = gephi[gephi.value>0.15]
# Node table: every distinct topic or key becomes one 'Id'; 'Color' starts as a copy of it.
nodes = pd.DataFrame(set(gephi.topic.tolist() + gephi.key.tolist()), columns=['Id'])
nodes['Color'] = nodes.Id
Beispiel #24
0
def test_order_by_index_returns_generator_with_iter_True():
    """With the third argument True the result only equals the list once materialised."""
    items = ['num3', 'num5', 'num2']
    positions = [2, 0, 1]
    expected = [items[i] for i in positions]
    assert order_by_index(items, positions, True) != expected
    assert list(order_by_index(items, positions, True)) == expected
Beispiel #25
0
def test_order_by_index_sorts_list_according_to_order_of_integer_list():
    """``order_by_index`` picks the items in the order given by the index list."""
    items = ["num3", "num5", "num2"]
    positions = [2, 0, 1]
    expected = [items[i] for i in positions]
    assert order_by_index(items, positions) == ["num2", "num3", "num5"]
    assert order_by_index(items, positions) == expected
Beispiel #26
0
    def __init__(self, input_path, organism, args):
        """Load the input region sets and prepare the report containers.

        ``input_path`` may be:

        * a directory: every ``*.bed`` file directly inside it is read and
          the region sets are ordered naturally by file name;
        * a single ``.bed`` file;
        * any other file, which is read as an experimental matrix.

        If the path is neither a file nor a directory a message is printed
        and the program exits with status 1.

        Args:
            input_path: path to a BED file, a directory of BED files or an
                experimental matrix file.
            organism: organism name used to load genome data.
            args: parsed options; the attributes read here are ``test``,
                ``biotype``, ``repeats``, ``genposi``, ``labels`` and
                ``coverage``.
        """
        self.testing = args.test
        if os.path.isdir(input_path):
            self.beds = []
            self.bednames = []
            for dirpath, dnames, fnames in walklevel(input_path, level=0):
                for f in fnames:
                    if f.endswith(".bed"):
                        name = os.path.basename(f).replace(".bed", "")
                        bed = GenomicRegionSet(name)
                        bed.read(os.path.join(dirpath, f))
                        if args.test:
                            # test mode: keep only the first ten regions
                            bed.sequences = bed.sequences[0:10]
                        bed.sort()
                        self.beds.append(bed)
                        self.bednames.append(name)

            # keep region sets and their names aligned, in natural name order
            index = natsort.index_natsorted(self.bednames)
            self.beds = natsort.order_by_index(self.beds, index)
            self.bednames = natsort.order_by_index(self.bednames, index)

        elif os.path.isfile(input_path):
            if input_path.endswith(".bed"):
                name = os.path.basename(input_path).replace(".bed", "")
                bed = GenomicRegionSet(name)
                bed.read(input_path)
                if args.test:
                    bed.sequences = bed.sequences[0:10]
                bed.sort()
                self.beds = [bed]
                self.bednames = [name]
            else:
                self.EM = ExperimentalMatrix()
                # Read the file that was passed in; the builtin ``input``
                # function was being handed over here by mistake.
                self.EM.read(input_path)
                self.beds = self.EM.get_regionsets()
                self.bednames = self.EM.get_regionsnames()
        else:
            print("***Please make sure that there are BED files in " + input_path)
            sys.exit(1)

        self.organism = organism
        self.chromosomes = GenomicRegionSet(organism)
        self.chromosomes.get_genome_data(organism=organism, chrom_X=True)
        genome = GenomeData(organism=organism)
        self.fasta_dir = genome.get_genome()
        self.stats = OrderedDict()

        # Map each optional panel to its figure column; two columns are
        # always present.
        self.ind_col = {}
        size_panel = 6
        rows = len(self.beds)
        cols = 2
        if args.biotype:
            self.ind_col["Biotype"] = cols
            cols += 1
        if args.repeats:
            self.ind_col["Repeats"] = cols
            cols += 1
        if args.genposi:
            self.ind_col["Genetic position"] = cols
            cols += 1
        if args.labels:
            for label in args.labels:
                self.ind_col[label] = cols
                cols += 1
        self.fig_f, self.fig_axs = plt.subplots(rows + 1, cols, dpi=300, figsize=(cols * size_panel, rows * size_panel))

        # Per-region-set tables: a header list and a list of columns, seeded
        # with the region strings and their strand ("." when there is none).
        self.table_h = {}
        self.tables = {}
        self.count_table = {}
        self.count_tableh = []
        for i, bed in enumerate(self.beds):
            self.table_h[self.bednames[i]] = [self.bednames[i]]
            self.tables[self.bednames[i]] = []
            self.tables[self.bednames[i]].append([r.toString() for r in bed])
            self.table_h[self.bednames[i]].append("strand")
            self.tables[self.bednames[i]].append([r.orientation if r.orientation else "." for r in bed])
            self.count_table[bed.name] = {}
        self.coverage = bool(args.coverage)
        self.background = []
Beispiel #27
0
import os
import numpy as np
import natsort
import vtk

import sys

# Usage: <script> <dicom directory> <output base name>
dataPath = sys.argv[1]
outputfile = sys.argv[2]

# Exhaust the walk: fileList ends up holding the file names of the last
# directory visited.
for dirName, subDir, fileList in os.walk(dataPath):
    pass

# Put the file names in natural order.
fileList = natsort.order_by_index(fileList, natsort.index_natsorted(fileList))

# Hand the ordered names to VTK as a string array.
stringArray = vtk.vtkStringArray()
for fileName in fileList:
    stringArray.InsertNextValue(fileName)

reader = vtk.vtkDICOMImageReader()
reader.SetDirectoryName(dataPath)
reader.SetFileNames(stringArray)

# Write what the reader produces as a MetaImage header file.
writer = vtk.vtkMetaImageWriter()
writer.SetInputConnection(reader.GetOutputPort())
writer.SetFileName(outputfile + '.mhd')
writer.Write()
def test_order_by_index_sorts_list_according_to_order_of_integer_list():
    """``order_by_index`` picks the items in the order given by the index list."""
    items = ["num3", "num5", "num2"]
    positions = [2, 0, 1]
    picked = [items[pos] for pos in positions]
    assert picked == ["num2", "num3", "num5"]
    assert order_by_index(items, positions) == picked
def test_order_by_index_returns_generator_with_iter_true():
    """With the third argument True the result only equals the list once materialised."""
    items = ["num3", "num5", "num2"]
    positions = [2, 0, 1]
    picked = [items[pos] for pos in positions]
    assert order_by_index(items, positions, True) != picked
    assert list(order_by_index(items, positions, True)) == picked
Beispiel #30
0
def bar_tab(df_means, df_stdev, Time, Treatments, number_cmpds_run):
    """Build the 'Bar Charts' panel: grouped bars with error whiskers.

    The compound shown initially is the column of ``df_means`` at position
    ``len(Treatments) + len(Time)``. The rows of ``df_means`` are naturally
    sorted by the column ``Time[0]``; the means and standard deviations of
    the compound are then collected per treatment group, giving one bar
    series (mean) and one whisker series (mean +/- std) per group along the
    naturally sorted time values.

    Two select widgets are attached: one switches the compound, the other
    chooses how many standard deviations the whiskers span.

    Args:
        df_means: frame of mean values; holds the ``Time`` and
            ``Treatments`` columns followed by the compound columns.
        df_stdev: frame of standard deviations; it is grouped by the same
            ``Treatments`` columns and must yield the groups in the same
            order as ``df_means``.
        Time: list of time column names; only ``Time[0]`` is used here.
        Treatments: list of treatment column names used for grouping.
        number_cmpds_run: forwarded to ``cmpd_options_func``.

    Returns:
        The ``Panel`` holding the widgets and the figure.
    """
    colors = [
        "firebrick", "navy", 'green', 'orange', 'violet', 'lawngreen',
        'powderblue', 'lightgreen', 'yellow', 'olive', 'red', 'grey',
        'skyblue', 'indigo', 'slategray', 'hotpink', 'peachpuff', 'powderblue'
    ]
    Cmpd0 = df_means.columns[len(Treatments) + len(Time)]
    cmpd_options = cmpd_options_func(df_means,
                                     len(Treatments) + len(Time),
                                     number_cmpds_run)
    time_vals = df_means[Time[0]].drop_duplicates().tolist()
    time_vals = natsorted(time_vals)
    df_means = df_means.reindex(index=order_by_index(
        df_means.index, index_natsorted(df_means[Time[0]])))

    MEANs = df_means.groupby(Treatments)[Cmpd0].apply(list).to_dict()
    STDs = df_stdev.groupby(Treatments)[Cmpd0].apply(list).to_dict()
    keys = []
    u_keys = []
    l_keys = []
    results = {'time_vals': time_vals}

    # Materialise the std lists once; they are paired with the means by
    # position.
    std_lists = list(STDs.values())
    for h, (group, mu) in enumerate(MEANs.items()):
        # series name: the group's treatment values joined by '_'
        kk = group[0]
        for tot in range(1, len(Treatments)):
            sk = group[tot]
            if not isinstance(sk, str):
                sk = str(sk)
            kk += '_' + sk
        keys.append(kk)
        u_keys.append('upper ' + kk)
        l_keys.append('lower ' + kk)
        sd = std_lists[h]
        upper = [x + e for x, e in zip(mu, sd)]
        lower = [x - e for x, e in zip(mu, sd)]
        results.update({keys[h]: mu})
        results.update({u_keys[h]: upper})
        results.update({l_keys[h]: lower})
    source = ColumnDataSource(data=results)

    p = figure(x_range=time_vals,
               plot_height=1000,
               plot_width=1000,
               title=Cmpd0,
               toolbar_location="right")
    legend_it = []
    n_groups = len(MEANs)
    for hh in range(n_groups):
        # each group gets its own slot inside the 0.8-wide band of a time value
        offset = -0.4 + (.8 * hh / n_groups)
        c = p.vbar(x=dodge('time_vals', offset, range=p.x_range),
                   top=keys[hh],
                   width=(0.8 / n_groups),
                   source=source,
                   # cycle through the palette so that more groups than
                   # colours no longer raises IndexError
                   color=colors[hh % len(colors)])
        p.add_layout(
            Whisker(source=source,
                    base=dodge('time_vals', offset, range=p.x_range),
                    upper=u_keys[hh],
                    lower=l_keys[hh],
                    level="overlay"))
        legend_it.append((keys[hh], [c]))
    legend = Legend(items=legend_it, location=(0, 0))
    legend.click_policy = "mute"
    p.add_layout(legend, 'right')
    p.x_range.range_padding = 0.1
    p.xgrid.grid_line_color = None
    p.legend.orientation = "vertical"

    # This is where the widgets are set up
    select = Select(title='Select your compound:',
                    value=Cmpd0,
                    options=cmpd_options)
    select_sd = Select(title="Standard Deviation:",
                       value='1',
                       options=['0', '1', '2', '3'])

    def update_title(attrname, old, new):
        # keep the plot title in step with the selected compound
        p.title.text = select.value

    select.on_change('value', update_title)

    def update_data(attrname, old, new):
        # rebuild every series for the selected compound and whisker width
        cmpd = select.value
        std = int(select_sd.value)
        new_means = df_means.groupby(Treatments)[cmpd].apply(list).to_dict()
        new_stds = df_stdev.groupby(Treatments)[cmpd].apply(list).to_dict()
        mean_lists = list(new_means.values())
        new_std_lists = list(new_stds.values())

        results1 = {'time_vals': time_vals}
        for y in range(len(new_means)):
            mu = mean_lists[y]
            sd = new_std_lists[y]
            upper = [x + std * e for x, e in zip(mu, sd)]
            lower = [x - std * e for x, e in zip(mu, sd)]
            results1.update({keys[y]: mu})
            results1.update({u_keys[y]: upper})
            results1.update({l_keys[y]: lower})
        source.data = results1

    for w in [select, select_sd]:
        w.on_change('value', update_data)

    # Set up layouts and add to document
    inputs = widgetbox(select, select_sd)
    layout = row(column(inputs), p, width=1000)
    tab = Panel(child=layout, title='Bar Charts')

    return tab
Beispiel #31
0
def control_timetable(timetable, header):
	"""Print a downtime / waiting / local-resource report for a timetable.

	Entries of ``timetable`` are indexed positionally: ``entry[0]`` is
	compared against the machine (GVM) numbers 1-3, ``entry[1]`` is the
	detail identifier and ``entry[2:]`` is what gets passed on to
	``waiting``.  The loop bounds below are fixed at 3 GVMs and 4 details.

	Args:
		timetable: sequence of timetable entries as described above.
		header: text centred inside a 30-character dashed banner.
	"""
	print(Fore.YELLOW + '{:-^30}'.format(header))

	# Detail identifiers as they occur in the timetable, one list per GVM.
	start_order_details = [
		[entry[1] for entry in timetable if entry[0] == gvm]
		for gvm in range(1, 4)
	]

	# Split the intervals by detail (following the detail order of the
	# first GVM); these groups are used for the waiting calculation.
	order_by_details = [
		[entry for entry in timetable if entry[1] == detail]
		for detail in start_order_details[0]
	]

	# For the downtime calculation the timetable is sorted by GVM.
	timetable = ordered_timetable(timetable)

	# Per detail: the first field of every interval (the launch order on
	# the GVMs), and the intervals with their first two fields removed,
	# as needed for the waiting calculation.
	start_order_gvm = [
		[entry[:1][0] for entry in group] for group in order_by_details
	]
	order_by_details_norm = [
		[entry[2:] for entry in group] for group in order_by_details
	]

	# Post-operation downtime of the GVMs, re-ordered per GVM by the
	# natural order of the detail identifiers.
	timetable = downtime(timetable)
	for gvm in range(3):
		detail_rank = natsort.index_natsorted(start_order_details[gvm])
		timetable[gvm] = natsort.order_by_index(timetable[gvm], detail_rank)

	smallest_total = min([sum(row) for row in timetable])
	print(Fore.WHITE + "\nDowntime: " + Fore.CYAN +
		'{}'.format(smallest_total))
	for row in timetable:
		print(Fore.WHITE + '{!s:>18s}'.format(row))

	# Waiting of the details before processing.
	print("\nWaiting:")
	order_by_details = waiting(order_by_details_norm)
	# Put the details into natural order, then order every detail's
	# waiting values by GVM: 1 2 3.
	first_gvm_rank = natsort.index_natsorted(start_order_details[0])
	order_by_details = natsort.order_by_index(order_by_details, first_gvm_rank)
	start_order_gvm = natsort.order_by_index(start_order_gvm, first_gvm_rank)
	for detail in range(4):
		gvm_rank = natsort.index_natsorted(start_order_gvm[detail])
		order_by_details[detail] = natsort.order_by_index(
			order_by_details[detail], gvm_rank)

	for row in order_by_details:
		print(Fore.WHITE + '{!s:>16s}'.format(row))

	# Local reserve: compare each downtime with the waiting value of the
	# following row of the transposed waiting table.
	order_by_details = numpy.transpose(order_by_details)
	print('\nLocal resource:\n')
	line_template = "      L({0}, {1}) = min({2}, {3}) = {4}"
	for gvm in range(3):
		for detail in range(4):
			first = timetable[gvm][detail]
			try:
				second = order_by_details[gvm + 1][detail]
			except IndexError:
				# No following row: nothing limits the reserve.
				second = float('inf')
			print(line_template.format(
				gvm + 1, detail + 1, first, second, min(first, second)))
		print('\n')
	print(Style.RESET_ALL)
Beispiel #32
0
    def plot_ref(self, ref_dir, tag, other=False, strand=False, background=False, bin=False):
        """Count overlaps of the stored BED sets with a group of references and plot them.

        Reference region sets are read from ``ref_dir``, natural-sorted by
        name and compared with every set in ``self.beds``.  Results are
        accumulated in ``self.count_table``, ``self.count_tableh``,
        ``self.background``, ``self.table_h`` and ``self.tables``; the figure
        column ``self.ind_col[tag]`` of ``self.fig_axs`` is then drawn (row 0:
        stacked percentage bars, following rows: one bar chart per BED set).

        Args:
            ref_dir: Directory containing ``*.bed`` files, or a single
                ``.bed`` file.
            tag: Name of this reference group; used as plot title/x label,
                as key into ``self.ind_col`` and to build the ``<tag>_else``
                column name.
            other: If true, add an "Else" category holding what is not
                covered by any reference.
            strand: If true, count "+" and "-" strand regions separately and
                add the two results.
            background: If true, add a "Background" bar with the composition
                of the references themselves.  Switched off when only one
                reference is loaded.
            bin: Unused; kept for interface compatibility.

        The process exits with status 1 when ``ref_dir`` is neither a
        directory nor a ``.bed`` file.
        """
        print("Processing " + tag + " ....")
        refs = []
        refs_names = []
        if os.path.isdir(ref_dir):
            for f in os.listdir(ref_dir):
                if f.endswith(".bed"):
                    name = os.path.basename(f).replace(".bed", "")
                    bed = GenomicRegionSet(name)
                    bed.read(os.path.join(ref_dir, f))
                    if self.testing:
                        # Test mode: keep only the first ten regions.
                        bed.sequences = bed.sequences[0:10]
                    refs.append(bed)
                    refs_names.append(name)
        elif os.path.isfile(ref_dir) and ref_dir.endswith(".bed"):
            name = os.path.basename(ref_dir).replace(".bed", "")
            bed = GenomicRegionSet(name)
            bed.read(ref_dir)
            if self.testing:
                bed.sequences = bed.sequences[0:10]
            refs.append(bed)
            refs_names.append(name)
        else:
            print("*** Error: Not a valid directory: " + ref_dir)
            sys.exit(1)

        if background and len(refs) == 1:
            background = False
            # NOTE(review): the ``else`` branch of ``if background`` below
            # appends another entry for the same reference -- confirm that
            # the double append is intended.
            self.background = self.background + [len(ref) for ref in refs]
        index = natsort.index_natsorted(refs_names)
        refs = natsort.order_by_index(refs, index)
        refs_names = natsort.order_by_index(refs_names, index)
        self.count_tableh = self.count_tableh + refs_names
        if other:
            refs_names.append("Else")
            self.count_tableh = self.count_tableh + [tag + "_else"]
        if strand:
            ref_plus = []
            ref_minus = []
            for ref in refs:
                ref_plus.append(ref.filter_strand(strand="+"))
                ref_minus.append(ref.filter_strand(strand="-"))
        if background:
            if self.coverage:
                background_cov = [ref.total_coverage() for ref in refs]
                background_prop = [float(100) * b / sum(background_cov) for b in background_cov]
                if other:
                    b = background_cov + [0]
                else:
                    b = background_cov
                self.background = self.background + b
            else:
                background_counts = [len(ref) for ref in refs]
                background_prop = [float(100) * b / sum(background_counts) for b in background_counts]
                if other:
                    b = background_counts + [0]
                else:
                    b = background_counts
                self.background = self.background + b
        else:
            self.background = self.background + [0] * len(refs)

        # Counting through all references
        overlapping_counts = []
        for i, bed in enumerate(self.beds):
            c = []
            if strand:
                bed_plus = bed.filter_strand(strand="+")
                bed_minus = bed.filter_strand(strand="-")
                if other:
                    sum_ref_plus = GenomicRegionSet("ref_plus")
                    sum_ref_minus = GenomicRegionSet("ref_minus")
            else:
                if other:
                    sum_ref = GenomicRegionSet("ref")

            for j, ref in enumerate(refs):
                if strand:
                    if self.coverage:
                        cc = bed_plus.intersect(ref_plus[j]).total_coverage() + \
                             bed_minus.intersect(ref_minus[j]).total_coverage()
                    else:
                        cc = bed_plus.count_by_regionset(ref_plus[j]) + bed_minus.count_by_regionset(ref_minus[j])
                    if other:
                        sum_ref_plus.combine(ref_plus[j])
                        sum_ref_minus.combine(ref_minus[j])
                else:
                    if self.coverage:
                        cc = bed.intersect(ref).total_coverage()
                    else:
                        cc = bed.count_by_regionset(ref)
                    if other:
                        sum_ref.combine(ref)
                c.append(cc)
                self.count_table[bed.name][ref.name] = cc

            if other:
                if self.coverage:
                    # Whatever coverage is not explained by a reference.
                    c.append(bed.total_coverage() - sum(c))
                else:
                    if strand:
                        sum_ref_plus.merge()
                        sum_ref_minus.merge()

                        remain_regions_p = bed_plus.subtract(sum_ref_plus, whole_region=True)
                        remain_regions_m = bed_minus.subtract(sum_ref_minus, whole_region=True)
                        remain_regions = remain_regions_p.combine(remain_regions_m, output=True)
                    else:
                        sum_ref.merge()
                        remain_regions = bed.subtract(sum_ref, whole_region=True)
                    c.append(len(remain_regions))
                # The "else" count is a single value per BED set; store it once
                # instead of re-assigning it for every reference.
                self.count_table[bed.name][tag + "_else"] = c[-1]
            overlapping_counts.append(c)

        # Tables
        for i, bed in enumerate(self.beds):
            for j, ref in enumerate(refs):
                names = bed.map_names(ref, strand=strand, convert_nt=True)
                self.table_h[self.bednames[i]].append(refs_names[j])
                self.tables[self.bednames[i]].append(names)

        # Generate Figure
        if other:
            color_list = plt.cm.Set1(numpy.linspace(0, 1, len(refs_names))).tolist()
        else:
            color_list = plt.cm.Set1(numpy.linspace(0, 0.95, len(refs_names))).tolist()

        for i in range(len(self.beds) + 1):
            # ``fig_axs`` may be a 2-D grid, a 1-D array or a single axes
            # object; fall back step by step.  Only indexing failures are
            # caught so that interrupts and real errors still propagate.
            try:
                ax = self.fig_axs[i, self.ind_col[tag]]
            except (IndexError, KeyError, TypeError):
                try:
                    ax = self.fig_axs[i]
                except (IndexError, KeyError, TypeError):
                    ax = self.fig_axs
            if i == 0:
                # Row 0: stacked bars with the percentage per reference.
                proportion = []
                for counts in overlapping_counts:
                    ss = sum(counts)
                    if ss > 0:
                        proportion.append([x / ss * 100 for x in counts])
                    else:
                        proportion.append([0] * len(counts))
                if background:
                    if other:
                        proportion.append(background_prop + [0])
                        len_ref = len(refs) + 1
                    else:
                        proportion.append(background_prop)
                        len_ref = len(refs)
                    bottom = [0] * (len(self.bednames) + 1)
                    xlabels = self.bednames + ["Background"]
                else:
                    len_ref = len(refs)
                    bottom = [0] * len(self.bednames)
                    xlabels = self.bednames
                # Transpose: one list of bar heights per reference category.
                ptable = []
                for j in range(len_ref):
                    ptable.append([x[j] for x in proportion])
                width = 0.6
                for j, heights in enumerate(ptable):
                    ax.bar(range(len(bottom)), heights, width=width, bottom=bottom, color=color_list[j],
                           edgecolor="none", align='center')
                    bottom = [base + height for base, height in zip(bottom, heights)]
                ax.set_title(tag)
                ax.yaxis.tick_left()
                ax.set_xticks(range(len(xlabels)))
                ax.set_xticklabels(xlabels, fontsize=7, rotation=20, ha="right")
                ax.set_ylabel("Percentage %")
                ax.set_ylim([0, 100])
                ax.set_xlim([-0.5, len(xlabels) - 0.5])
                plt.tight_layout()

            elif i > 0:
                # Rows 1..: absolute counts of BED set i-1 per reference.
                positions = list(range(len(overlapping_counts[i - 1])))
                ax.bar(positions, overlapping_counts[i - 1],
                       color=color_list, linewidth=0, edgecolor="none", align='center')
                ax.set_title(self.bednames[i - 1])
                ax.set_xticks(positions)
                ax.set_xticklabels(refs_names, fontsize=7, rotation=20, ha="right")
                ax.set_xlim([-0.5, len(overlapping_counts[i - 1]) - 0.5])
                plt.tight_layout()

        # Only the last (bottom) axes of the column gets the x label.
        ax.set_xlabel(tag)
Beispiel #33
0
def plotSiteDiffSel(names, diffselfiles, plotfile, 
        diffseltype, maxcol=2, white_bg=False):
    """Plot site diffsel or fracsurvive along sequence.

    Despite the function name, this function can be used to
    plot either differential selection or fraction surviving.

    Args:
        `names` (list or series)
            Names of samples for which we plot statistics.
        `diffselfiles` (list or series)
            ``*sitediffsel.csv`` files from ``dms2_diffsel`` or
            ``*sitefracsurvive.csv`` files from ``dms2_fracsurvive``.
        `plotfile` (str)
            Name of created PDF plot file.
        `diffseltype` (str)
            Type of diffsel or fracsurvive to plot:
                - `positive`: positive sitediffsel
                - `total`: positive and negative sitediffsel
                - `max`: maximum mutdiffsel
                - `minmax`: minimum and maximum mutdiffsel
                - `avgfracsurvive`: total site fracsurvive
                - `maxfracsurvive`: max mutfracsurvive at site
        `maxcol` (int)
            Number of columns in faceted plot.
        `white_bg` (bool)
            Plots will have a white background with limited other formatting.

    Raises:
        ValueError: if `diffseltype` is not one of the values listed above.
        AssertionError: if `names` / `diffselfiles` are empty, differ in
            length, `names` has duplicates, `plotfile` does not end in
            ``.pdf``, or the files do not cover the same sites.
    """
    assert len(names) == len(diffselfiles) == len(set(names)) > 0
    assert os.path.splitext(plotfile)[1].lower() == '.pdf'

    diffsels = [pandas.read_csv(f).assign(name=name) for (name, f) 
            in zip(names, diffselfiles)]
    assert all([set(diffsels[0]['site']) == set(df['site']) for df in 
            diffsels]), "diffselfiles not all for same sites"
    diffsel = pandas.concat(diffsels, ignore_index=True)

    # map the requested statistic(s) onto the generic 'above' / 'below'
    # directions used for plotting
    ylabel = 'differential selection'
    if diffseltype == 'positive':
        rename = {'positive_diffsel':'above'}
    elif diffseltype == 'total':
        rename = {'positive_diffsel':'above',
                  'negative_diffsel':'below'}
    elif diffseltype == 'max':
        rename = {'max_diffsel':'above'}
    elif diffseltype == 'minmax':
        rename = {'max_diffsel':'above',
                  'min_diffsel':'below'}
    elif diffseltype in ['avgfracsurvive', 'maxfracsurvive']:
        ylabel = 'fraction surviving'
        rename = {diffseltype:'above'}
    else:
        raise ValueError("invalid diffseltype {0}".format(diffseltype))
    diffsel = (diffsel.rename(columns=rename)
                      .melt(id_vars=['site', 'name'], 
                            value_vars=list(rename.values()),
                            value_name='diffsel',
                            var_name='direction')
                      )

    # natural sort by site: https://stackoverflow.com/a/29582718
    # `alg=ns.SIGNED` is the supported spelling of the removed
    # `signed=True` keyword of natsort
    diffsel = diffsel.reindex(index=natsort.order_by_index(
            diffsel.index, natsort.index_natsorted(diffsel.site,
            alg=natsort.ns.SIGNED)))
    # now some manipulations to make site str while siteindex is int
    diffsel['site'] = diffsel['site'].apply(str)
    diffsel['siteindex'] = pandas.Categorical(diffsel['site'],
            diffsel['site'].unique()).codes
    
    ncol = min(maxcol, len(names))
    nrow = math.ceil(len(names) / float(ncol))

    # make name a category to preserve order; `astype('category',
    # categories=...)` is no longer accepted by pandas, so build the
    # categorical explicitly
    diffsel['name'] = pandas.Categorical(diffsel['name'],
            categories=names)

    (xbreaks, xlabels) = breaksAndLabels(diffsel['siteindex'].unique(), 
            diffsel['site'].unique(), n=6)
    if white_bg:
        p = (ggplot(diffsel, aes(x='siteindex', y='diffsel',
                    color='direction', fill='direction'))
             + geom_step(size=0.3)
             + xlab('site')
             + ylab(ylabel)
             + scale_x_continuous(breaks=xbreaks, labels=xlabels)
             + scale_color_manual(COLOR_BLIND_PALETTE)
             + scale_fill_manual(COLOR_BLIND_PALETTE)
             + guides(color=False)
             + theme(panel_background=element_rect(fill='white'),
                     axis_line_x=element_line(color='black'),
                     axis_line_y=element_line(color='black'),
                     panel_grid=element_blank(),
                     panel_border=element_blank(),
                     strip_background=element_blank()
                     )
             )
    else:
        p = (ggplot(diffsel, aes(x='siteindex', y='diffsel', color='direction'))
             + geom_step(size=0.4)
             + xlab('site')
             + ylab(ylabel)
             + scale_x_continuous(breaks=xbreaks, labels=xlabels)
             + scale_color_manual(COLOR_BLIND_PALETTE)
             + guides(color=False)
             )
    # no facet titles when the only sample name is empty / whitespace
    if not ((len(names) == 1) and ((not names[0]) or names[0].isspace())):
        p += facet_wrap('~name', ncol=ncol)
    p += theme(figure_size=(4.6 * (0.3 + ncol), 1.9 * (0.2 + nrow)))
    p.save(plotfile, verbose=False)
    plt.close()
Beispiel #34
0
def main():
    """Command-line entry point: build pileups from a cooler file.

    Parses the arguments defined by ``parse_args_coolpuppy``, prepares the
    coordinates with ``CoordCreator`` and runs ``PileUpper``.  With
    ``--by_window`` an enrichment table (and optionally a JSON file with the
    individual pileups) is written, otherwise a single pileup array is saved
    with ``save_array_with_header``.  Output goes to ``args.outdir``, which is
    created when missing.

    Raises:
        FileExistsError: if the base coordinate file or the expected file
            does not exist (type kept for backward compatibility).
        NotImplementedError: if ``--by_window`` is combined with ``--rescale``.
        ValueError: for an even ``rescale_size`` or an unsupported
            ``--by_window`` combination.
    """
    parser = parse_args_coolpuppy()
    args = parser.parse_args()

    if args.post_mortem:

        def _excepthook(exc_type, value, tb):
            traceback.print_exception(exc_type, value, tb)
            print()
            pdb.pm()

        sys.excepthook = _excepthook

    logging.basicConfig(format="%(message)s", level=getattr(logging, args.logLevel))

    logging.info(args)

    if args.seed is not None:
        np.random.seed(args.seed)

    # 0 on the command line is passed on as -1
    # (presumably "use all cores" for PileUpper -- verify against its docs)
    nproc = -1 if args.n_proc == 0 else args.n_proc

    c = cooler.Cooler(args.coolfile)

    if not os.path.isfile(args.baselist) and args.baselist != "-":
        raise FileExistsError("Loop(base) coordinate file doesn't exist")

    balance = False if args.unbalanced else args.weight_name

    coolname = os.path.splitext(os.path.basename(c.filename))[0]
    if args.baselist != "-":
        bedname = os.path.splitext(os.path.basename(args.baselist))[0]
    else:
        # "-" means: read the coordinates from standard input
        bedname = "stdin"
        args.baselist = sys.stdin
    if args.bed2 is not None:
        bedname += "_vs_" + os.path.splitext(os.path.basename(args.bed2))[0]

    control = args.nshifts > 0

    if args.expected is not None:
        if args.nshifts > 0:
            logging.warning("With specified expected will not use controls")
            control = False
        if not os.path.isfile(args.expected):
            raise FileExistsError("Expected file doesn't exist")
        expected = pd.read_csv(args.expected, sep="\t", header=0)
    else:
        expected = False

    mindist = "auto" if args.mindist is None else args.mindist
    maxdist = np.inf if args.maxdist is None else args.maxdist
    minsize = 0 if args.minsize is None else args.minsize
    maxsize = np.inf if args.maxsize is None else args.maxsize

    if args.incl_chrs == "all":
        incl_chrs = np.array(c.chromnames).astype(str)
    else:
        incl_chrs = args.incl_chrs.split(",")

    if args.by_window and args.rescale:
        raise NotImplementedError(
            """Rescaling with by-window pileups is not
                                  supported"""
        )

    if args.rescale and args.rescale_size % 2 == 0:
        raise ValueError("Please provide an odd rescale_size")

    # An anchor may carry a display name: "<region>_<name>".  The region part
    # is parsed exactly once here; parsing ``args.anchor`` again later would
    # hand the "_<name>" suffix to the region parser.
    if args.anchor is not None:
        if "_" in args.anchor:
            anchor, anchor_name = args.anchor.split("_")
            anchor = cooler.util.parse_region_string(anchor)
        else:
            anchor = cooler.util.parse_region_string(args.anchor)
            anchor_name = args.anchor
    else:
        anchor = None

    if anchor:
        fchroms = [anchor[0]]
    else:
        excl_chrs = args.excl_chrs.split(",")
        chroms = np.array(c.chromnames).astype(str)
        fchroms = [
            chrom for chrom in chroms
            if chrom not in excl_chrs and chrom in incl_chrs
        ]

    CC = CoordCreator(
        baselist=args.baselist,
        resolution=c.binsize,
        bed2=args.bed2,
        bed2_ordered=args.bed2_ordered,
        anchor=anchor,
        pad=args.pad * 1000,
        chroms=fchroms,
        minshift=args.minshift,
        maxshift=args.maxshift,
        nshifts=args.nshifts,
        minsize=minsize,
        maxsize=maxsize,
        mindist=mindist,
        maxdist=maxdist,
        local=args.local,
        subset=args.subset,
        seed=args.seed,
    )
    CC.process()

    PU = PileUpper(
        clr=c,
        CC=CC,
        balance=balance,
        expected=expected,
        control=control,
        coverage_norm=args.coverage_norm,
        rescale=args.rescale,
        rescale_pad=args.rescale_pad,
        rescale_size=args.rescale_size,
    )

    if args.outdir == ".":
        args.outdir = os.getcwd()

    if args.outname == "auto":
        # Encode the relevant options in the file name.
        outname = f"{coolname}-{c.binsize / 1000}K_over_{bedname}"
        if args.nshifts > 0 and args.expected is None:
            outname += f"_{args.nshifts}-shifts"
        if args.expected is not None:
            outname += "_expected"
        if args.nshifts <= 0 and args.expected is None:
            outname += "_noNorm"
        if anchor:
            outname += f"_from_{anchor_name}"
        if args.local:
            outname += "_local"
            if minsize > 0 or maxsize < np.inf:
                outname += f"_len_{minsize}-{maxsize}"
        elif args.mindist is not None or args.maxdist is not None:
            outname += f"_dist_{mindist}-{maxdist}"
        if args.rescale:
            outname += "_rescaled"
        if args.unbalanced:
            outname += "_unbalanced"
        if args.coverage_norm:
            outname += "_covnorm"
        if args.subset > 0:
            outname += f"_subset-{args.subset}"
        if args.by_window:
            outname = f"Enrichment_{outname}.txt"
        else:
            outname += ".np.txt"
    else:
        outname = args.outname

    out_path = os.path.join(args.outdir, outname)

    if args.by_window:
        if CC.kind != "bed":
            raise ValueError("Can't make by-window pileups without making combinations")
        if args.local:
            raise ValueError("Can't make local by-window pileups")
        if anchor:
            raise ValueError("Can't make by-window combinations with an anchor")
        finloops = PU.pileupsByWindowWithControl(nproc=nproc)

        # NOTE(review): assumes ``Pool`` follows the multiprocessing.Pool API,
        # which rejects a non-positive worker count; ``None`` lets the pool
        # pick its default size instead of failing on nproc == -1.
        p = Pool(nproc if nproc > 0 else None)
        try:
            data = p.map(prepare_single, finloops.items())
        finally:
            p.close()
        data = pd.DataFrame(
            data,
            columns=[
                "chr",
                "start",
                "end",
                "N",
                "Enrichment1",
                "Enrichment3",
                "CV3",
                "CV5",
            ],
        )
        data = data.reindex(
            index=order_by_index(
                data.index, index_natsorted(zip(data["chr"], data["start"]))
            )
        )
        # Create the output directory up front instead of relying on the
        # exception type raised by the writer for a missing directory.
        if args.outdir:
            os.makedirs(args.outdir, exist_ok=True)
        data.to_csv(out_path, sep="\t", index=False)
        # Logged only after the write succeeded.
        logging.info(f"Saved enrichment table to {out_path}")

        if args.save_all:
            outdict = {
                "%s:%s-%s" % key: (val[0], val[1].tolist())
                for key, val in finloops.items()
            }
            import json

            json_path = (
                os.path.join(args.outdir, os.path.splitext(outname)[0]) + ".json"
            )
            with open(json_path, "w") as fp:
                json.dump(outdict, fp)  # , sort_keys=True, indent=4)
                logging.info(f"Saved individual pileups to {json_path}")
    else:
        pup = PU.pileupsWithControl(nproc)
        if args.outdir:
            os.makedirs(args.outdir, exist_ok=True)
        save_array_with_header(pup, vars(args), out_path)
        # Logged only after the write succeeded.
        logging.info(f"Saved output to {out_path}")
Beispiel #35
0
def heatmap_tab(df_means, df_stdev, Time, Treatments, number_cmpds_run):
    """Build the 'Heatmap' tab comparing every pair of treatments.

    For the selected compound each pair of treatments is coloured by whether
    their mean +/- k*stdev intervals overlap (per ``getOverlap``): green for
    k=1, yellow for k=2, orange for k=3, red otherwise.

    Args:
        df_means: DataFrame of means; the first compound column is located at
            position ``len(Treatments) + len(Time)``.
        df_stdev: DataFrame of standard deviations with the same compound
            columns.  A 'Treatment' column is added to it in place.
        Time: list whose first element names the time column (must hold
            strings, since ``.str.cat`` is applied to it).
        Treatments: list of treatment column names; any number of columns is
            supported.
        number_cmpds_run: forwarded to ``cmpd_options_func``.

    Returns:
        A ``Panel`` titled 'Heatmap' containing the compound selector and
        the heatmap figure.
    """
    Cmpd0 = df_means.columns[len(Treatments) + len(Time)]
    cmpd_options = cmpd_options_func(df_means,
                                     len(Treatments) + len(Time),
                                     number_cmpds_run)
    # Natural-sort the rows by the time column.
    df_means = df_means.reindex(index=order_by_index(
        df_means.index, index_natsorted(df_means[Time[0]])))

    # The treatment columns are concatenated as text below, so convert every
    # one of them (not just the first four).
    for treatment_col in Treatments:
        df_means[treatment_col] = df_means[treatment_col].astype('str')

    df_means['Treatment'] = df_means[Time[0]].str.cat(df_means[Treatments],
                                                      sep=' - ')
    # Same labels for the stdev table; assignment aligns on the row index.
    df_stdev['Treatment'] = df_means['Treatment']
    treatments = list(df_means['Treatment'])

    def _overlap_colour(m1, sd1, m2, sd2):
        """Return the colour for the tightest +/- k*sd band that overlaps."""
        for k, colour in ((1, 'green'), (2, 'yellow'), (3, 'orange')):
            band1 = [m1 - k * sd1, m1 + k * sd1]
            band2 = [m2 - k * sd2, m2 + k * sd2]
            if getOverlap(band1, band2) > 0:
                return colour
        return 'red'

    def _heatmap_data(cmpd):
        """Return the ColumnDataSource dict for compound ``cmpd``."""
        means = pd.Series(df_means[cmpd].values,
                          index=df_means['Treatment'].astype(str).values)
        sds = pd.Series(df_stdev[cmpd].values,
                        index=df_stdev['Treatment'].astype(str).values)
        # Look every treatment up once instead of once per grid cell.
        stats = {t: (means.loc[t], sds.loc[t]) for t in treatments}

        n_treatments = len(treatments)
        # Row-major grid: treat1 changes slowly, treat2 cycles.
        colours = [
            _overlap_colour(stats[t_i][0], stats[t_i][1],
                            stats[t_j][0], stats[t_j][1])
            for t_i in treatments
            for t_j in treatments
        ]
        return {
            'treat1': [t for t in treatments for _ in range(n_treatments)],
            'treat2': treatments * n_treatments,
            'colors': colours,
        }

    source = ColumnDataSource(_heatmap_data(Cmpd0))

    p = figure(title="Categorical Heatmap",
               x_range=treatments,
               y_range=treatments,
               plot_height=1000,
               plot_width=1000)
    p.rect(x='treat1',
           y='treat2',
           color='colors',
           width=1,
           height=1,
           line_color='black',
           line_width=2,
           source=source)
    p.xaxis.major_label_orientation = np.pi / 2

    select = Select(title='Select your compound:',
                    value=Cmpd0,
                    options=cmpd_options)

    def update_data(attrname, old, new):
        # Recompute the whole grid for the newly selected compound.
        source.data = _heatmap_data(select.value)

    select.on_change('value', update_data)

    # Create a row layout
    inputs = widgetbox(select)
    layout = row(inputs, p, width=1500)
    tab = Panel(child=layout, title='Heatmap')

    return tab
Beispiel #36
0
    def __init__(self, input_path, organism, args):
        """Load the input region sets and prepare the figure and result tables.

        Args:
            input_path: Directory of ``*.bed`` files (top level only), a
                single ``.bed`` file, or any other file, which is then read
                as an experimental matrix.
            organism: Organism name passed to ``GenomicRegionSet`` and
                ``GenomeData``.
            args: Parsed options; the attributes ``test``, ``biotype``,
                ``repeats``, ``genposi``, ``labels`` and ``coverage`` are
                read.

        The process exits with status 1 when ``input_path`` is neither a
        directory nor a file.
        """
        self.testing = args.test
        if os.path.isdir(input_path):
            self.beds = []
            self.bednames = []
            for dirpath, dnames, fnames in walklevel(input_path, level=0):
                for f in fnames:
                    if f.endswith(".bed"):
                        name = os.path.basename(f).replace(".bed", "")
                        bed = GenomicRegionSet(name)
                        bed.read(os.path.join(dirpath, f))
                        if args.test:
                            # Test mode: keep only the first ten regions.
                            bed.sequences = bed.sequences[0:10]
                        bed.sort()
                        self.beds.append(bed)
                        self.bednames.append(name)

            # Keep region sets and their names in the same natural order.
            index = natsort.index_natsorted(self.bednames)
            self.beds = natsort.order_by_index(self.beds, index)
            self.bednames = natsort.order_by_index(self.bednames, index)

        elif os.path.isfile(input_path):
            if input_path.endswith(".bed"):
                name = os.path.basename(input_path).replace(".bed", "")
                bed = GenomicRegionSet(name)
                bed.read(input_path)
                if args.test:
                    bed.sequences = bed.sequences[0:10]
                bed.sort()
                self.beds = [bed]
                self.bednames = [name]
            else:
                self.EM = ExperimentalMatrix()
                # Read the given file (previously the builtin ``input``
                # function was passed here by mistake).
                self.EM.read(input_path)
                self.beds = self.EM.get_regionsets()
                self.bednames = self.EM.get_regionsnames()
        else:
            print("***Please make sure that there are BED files in " + input_path)
            sys.exit(1)

        self.organism = organism
        self.chromosomes = GenomicRegionSet(organism)
        self.chromosomes.get_genome_data(organism=organism, chrom_X=True)
        genome = GenomeData(organism=organism)
        self.fasta_dir = genome.get_genome()
        self.stats = OrderedDict()

        # Figure layout: one row per region set plus one extra row; two
        # base columns followed by one column per enabled annotation, whose
        # column index is remembered in ``ind_col``.
        self.ind_col = {}
        size_panel = 6
        rows = len(self.beds)
        cols = 2
        if args.biotype:
            self.ind_col["Biotype"] = cols
            cols += 1
        if args.repeats:
            self.ind_col["Repeats"] = cols
            cols += 1
        if args.genposi:
            self.ind_col["Genetic position"] = cols
            cols += 1
        if args.labels:
            for label in args.labels:
                self.ind_col[label] = cols
                cols += 1
        self.fig_f, self.fig_axs = plt.subplots(rows + 1, cols, dpi=300, figsize=(cols * size_panel, rows * size_panel))

        # Per region set: table header, table columns and overlap counts.
        self.table_h = {}
        self.tables = {}
        self.count_table = {}
        self.count_tableh = []
        for name, bed in zip(self.bednames, self.beds):
            self.table_h[name] = [name, "strand"]
            self.tables[name] = [
                [r.toString() for r in bed],
                [r.orientation if r.orientation else "." for r in bed],
            ]
            self.count_table[bed.name] = {}
        self.coverage = bool(args.coverage)
        self.background = []
Beispiel #37
0
    def plot_ref(self, ref_dir, tag, other=False, strand=False, background=False, bin=False):
        """Count the overlap of every input BED set with a group of references and plot it.

        The reference BED files are loaded, natural-sorted by name, and compared
        against each region set in ``self.beds``. The results are stored in
        ``self.count_table``, ``self.count_tableh``, ``self.background``,
        ``self.table_h`` and ``self.tables``, and drawn into ``self.fig_axs``:
        the first row holds a stacked percentage bar chart, each following row
        holds the raw counts of one input BED set.

        Args:
            ref_dir: Path to a directory containing ``.bed`` files, or to a
                single ``.bed`` file.
            tag: Title of this group of references. It is also the key used to
                look up the figure column in ``self.ind_col``.
            other: If true, add an "Else" category for whatever is not covered
                by any reference.
            strand: If true, compare "+" regions only with "+" references and
                "-" regions only with "-" references.
            background: If true, add a "Background" bar showing the relative
                size of each reference. It is switched off when only one
                reference is loaded.
            bin: Unused; kept for backward compatibility.

        Exits the program with status 1 when ``ref_dir`` is neither a directory
        nor a ``.bed`` file.
        """
        print("Processing " + tag + " ....")

        # Collect the paths first so that directory and single-file input share
        # the same loading code below.
        if os.path.isdir(ref_dir):
            bed_paths = [os.path.join(ref_dir, f) for f in os.listdir(ref_dir) if f.endswith(".bed")]
        elif os.path.isfile(ref_dir) and ref_dir.endswith(".bed"):
            bed_paths = [ref_dir]
        else:
            print("*** Error: Not a valid directory: " + ref_dir)
            sys.exit(1)

        refs = []
        refs_names = []
        for path in bed_paths:
            name = os.path.basename(path).replace(".bed", "")
            ref_bed = GenomicRegionSet(name)
            ref_bed.read(path)
            if self.testing:
                # Testing mode only keeps the first ten regions.
                ref_bed.sequences = ref_bed.sequences[0:10]
            refs.append(ref_bed)
            refs_names.append(name)

        if background and len(refs) == 1:
            # A background distribution over a single reference is meaningless.
            # NOTE(review): the non-background branch further down appends
            # zeros to self.background as well -- confirm this is intended.
            background = False
            self.background = self.background + [len(ref) for ref in refs]

        # Natural-sort the references by name (e.g. "chr2" before "chr10").
        index = natsort.index_natsorted(refs_names)
        refs = natsort.order_by_index(refs, index)
        refs_names = natsort.order_by_index(refs_names, index)
        self.count_tableh = self.count_tableh + refs_names
        if other:
            refs_names.append("Else")
            self.count_tableh = self.count_tableh + [tag + "_else"]

        if strand:
            ref_plus = [ref.filter_strand(strand="+") for ref in refs]
            ref_minus = [ref.filter_strand(strand="-") for ref in refs]

        if background:
            if self.coverage:
                background_totals = [ref.total_coverage() for ref in refs]
            else:
                background_totals = [len(ref) for ref in refs]
            background_sum = sum(background_totals)
            if background_sum > 0:
                background_prop = [float(100) * b / background_sum for b in background_totals]
            else:
                # All references are empty: avoid a ZeroDivisionError, in the
                # same way the per-bed proportions are guarded below.
                background_prop = [0 for _ in background_totals]
            if other:
                self.background = self.background + background_totals + [0]
            else:
                self.background = self.background + background_totals
        else:
            self.background = self.background + [0] * len(refs)

        # Counting through all references
        overlapping_counts = []
        for bed in self.beds:
            c = []
            if strand:
                bed_plus = bed.filter_strand(strand="+")
                bed_minus = bed.filter_strand(strand="-")
                if other:
                    sum_ref_plus = GenomicRegionSet("ref_plus")
                    sum_ref_minus = GenomicRegionSet("ref_minus")
            elif other:
                sum_ref = GenomicRegionSet("ref")

            for j, ref in enumerate(refs):
                if strand:
                    if self.coverage:
                        cc = bed_plus.intersect(ref_plus[j]).total_coverage() + \
                             bed_minus.intersect(ref_minus[j]).total_coverage()
                    else:
                        cc = bed_plus.count_by_regionset(ref_plus[j]) + bed_minus.count_by_regionset(ref_minus[j])
                    if other:
                        sum_ref_plus.combine(ref_plus[j])
                        sum_ref_minus.combine(ref_minus[j])
                else:
                    if self.coverage:
                        cc = bed.intersect(ref).total_coverage()
                    else:
                        cc = bed.count_by_regionset(ref)
                    if other:
                        sum_ref.combine(ref)
                c.append(cc)
                self.count_table[bed.name][ref.name] = cc

            if other:
                if self.coverage:
                    # "Else" is whatever coverage no reference accounted for.
                    c.append(bed.total_coverage() - sum(c))
                else:
                    # "Else" is the number of regions left after subtracting
                    # the merged union of all references.
                    if strand:
                        sum_ref_plus.merge()
                        sum_ref_minus.merge()

                        remain_regions_p = bed_plus.subtract(sum_ref_plus, whole_region=True)
                        remain_regions_m = bed_minus.subtract(sum_ref_minus, whole_region=True)
                        remain_regions = remain_regions_p.combine(remain_regions_m, output=True)
                    else:
                        sum_ref.merge()
                        remain_regions = bed.subtract(sum_ref, whole_region=True)
                    c.append(len(remain_regions))
                # Stored once per bed so that it always matches the
                # tag+"_else" header appended to self.count_tableh above.
                self.count_table[bed.name][tag + "_else"] = c[-1]
            overlapping_counts.append(c)

        # Tables
        for i, bed in enumerate(self.beds):
            for j, ref in enumerate(refs):
                names = bed.map_names(ref, strand=strand, convert_nt=True)
                self.table_h[self.bednames[i]].append(refs_names[j])
                self.tables[self.bednames[i]].append(names)

        # Generate Figure
        if other:
            color_list = plt.cm.Set1(numpy.linspace(0, 1, len(refs_names))).tolist()
        else:
            color_list = plt.cm.Set1(numpy.linspace(0, 0.95, len(refs_names))).tolist()

        for i in range(len(self.beds) + 1):
            # self.fig_axs may be a 2-D array, a 1-D array or a single axes
            # object; fall back step by step instead of hiding every error.
            try:
                ax = self.fig_axs[i, self.ind_col[tag]]
            except (KeyError, IndexError, TypeError):
                try:
                    ax = self.fig_axs[i]
                except (IndexError, TypeError):
                    ax = self.fig_axs

            if i == 0:
                # First row: stacked percentages per input BED set.
                proportion = []
                for counts in overlapping_counts:
                    ss = sum(counts)
                    if ss > 0:
                        proportion.append([x / ss * 100 for x in counts])
                    else:
                        proportion.append([0 for x in counts])
                if background:
                    if other:
                        proportion.append(background_prop + [0])
                        len_ref = len(refs) + 1
                    else:
                        proportion.append(background_prop)
                        len_ref = len(refs)
                    bottom = [0] * (len(self.bednames) + 1)
                    xlabels = self.bednames + ["Background"]
                else:
                    len_ref = len(refs)
                    bottom = [0] * len(self.bednames)
                    xlabels = self.bednames

                # Transpose: one row per reference, one column per bar.
                ptable = [[row[j] for row in proportion] for j in range(len_ref)]
                width = 0.6
                for j, heights in enumerate(ptable):
                    ax.bar(range(len(bottom)), heights, width=width, bottom=bottom, color=color_list[j],
                           edgecolor="none", align='center')
                    bottom = [base + h for base, h in zip(bottom, heights)]
                ax.set_title(tag)
                ax.yaxis.tick_left()
                ax.set_xticks(range(len(xlabels)))
                ax.set_xticklabels(xlabels, fontsize=7, rotation=20, ha="right")
                ax.set_ylabel("Percentage %")
                ax.set_ylim([0, 100])
                ax.set_xlim([-0.5, len(xlabels) - 0.5])
                plt.tight_layout()

            else:
                # Following rows: raw counts of one input BED set.
                counts = overlapping_counts[i - 1]
                positions = list(range(len(counts)))
                ax.bar(positions, counts,
                       color=color_list, linewidth=0, edgecolor="none", align='center')
                ax.set_title(self.bednames[i - 1])
                ax.set_xticks(positions)
                ax.set_xticklabels(refs_names, fontsize=7, rotation=20, ha="right")
                ax.set_xlim([-0.5, len(counts) - 0.5])
                plt.tight_layout()

        # Only the last (bottom) panel gets the x-axis label.
        ax.set_xlabel(tag)
Beispiel #38
0
def test_order_by_index_sorts_list_according_to_order_of_integer_list():
    """order_by_index returns the items of a list picked in the order of an index list."""
    values = ['num3', 'num5', 'num2']
    positions = [2, 0, 1]
    picked_by_hand = [values[k] for k in positions]
    # Compare against the literal result and against manual indexing.
    assert order_by_index(values, positions) == ['num2', 'num3', 'num5']
    assert order_by_index(values, positions) == picked_by_hand