Esempio n. 1
0
def compartment_to_bedgraph(cool, bedgraph):
    """
    Write the first cis eigenvector of a cooler file as a bedgraph track.

    Three eigenvectors are computed and phased/ranked against the hg19 gene
    coverage of each bin; only E1 is written out.

    Parameters
    ----------
    cool : str
        Path to the input cooler file, or a URI pointing inside an mcool file
        (for example ``ex.mcool::/resolutions/640000``).
    bedgraph : str
        Path of the bedgraph file to write. Columns are chrom, start, end and
        E1, without a header; missing values are written as ``nan``.
    """
    clr = cooler.Cooler(cool)

    # Annotate the cooler's own bin table with hg19 gene coverage; the
    # resulting "gene_count" column is the phasing track used below.
    annotated_bins = bioframe.tools.frac_gene_coverage(clr.bins()[:], "hg19")

    # Only the eigenvector table is needed here; the eigenvalues are dropped.
    _, eigenvectors = eigdecomp.cooler_cis_eig(
        clr,
        bins=annotated_bins,
        n_eigs=3,
        phasing_track_col="gene_count",
    )

    bedgraph_columns = ["chrom", "start", "end", "E1"]
    eigenvectors.loc[:, bedgraph_columns].to_csv(
        bedgraph,
        sep="\t",
        header=None,
        index=False,
        na_rep="nan",
    )
def get_eigs(df, genecov_dict=None, n=3):
    """
    Compute cis eigenvalues and eigenvectors for every cooler listed in ``df``.

    By default the eigenvectors are taken from a balanced matrix, ignoring the
    first 4 diagonals and clipping values above the 99th percentile. Change
    these settings in the ``cooler_cis_eig`` call below if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of multiple coolers, one row per cell line, with a ``cooler``
        column holding the cooler objects and an ``assembly`` column naming
        the genome assembly. More info in preprocessing.py.
    genecov_dict : mapping, optional
        Gene coverage per bin, keyed by genome assembly. Passing it in is
        recommended because computing it takes a lot of time and memory.
        When omitted it is computed with ``fileprocessing.get_genecov(df)``.
    n : int, optional
        Number of eigenvectors to compute for each cooler.

    Returns
    -------
    vals : list
        Eigenvalue tables, one per row of ``df``, in row order.
    tracks : list
        Eigenvector tables from cooltools, one per row of ``df``, in row
        order.
    """
    # Identity check: `== None` would be evaluated element-wise (and then fail
    # in the `if`) when the mapping is a pandas object.
    if genecov_dict is None:
        genecov_dict = fileprocessing.get_genecov(df)

    vals, tracks = [], []
    for clr, assembly in zip(df.cooler, df.assembly):
        bins = clr.bins()[:]
        # Phasing track: gene coverage of the assembly this cooler was built on.
        bins['gene_count'] = genecov_dict[assembly]
        # Use every whole chromosome as a separate region.
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames]
        cis_vals, cis_eigs = eigdecomp.cooler_cis_eig(
            clr,
            bins,
            regions=regions,
            n_eigs=n,
            balance=True,
            phasing_track_col='gene_count',
            ignore_diags=4,
            clip_percentile=99)
        tracks.append(cis_eigs)
        vals.append(cis_vals)
    return vals, tracks
Esempio n. 3
0
def compartment_to_bedgraph(cool, track_file, out_bedgraph):
    """
    Computes eigen vectors from a cooler file, correlates and ranks them
    according to an input track and writes a bedgraph file.

    Parameters
    ----------
    cool : str
        Path to the input cooler file. Can also be the URI in an mcool file. For
        example: ex.mcool::/resolutions/640000
    track_file : str
        Path to the bedgraph file with a track that should be positively correlated
        with compartment A. 4th column is used as the signal.
    out_bedgraph : str
        Path to the output bedgraph file with compartment info (chrom, start,
        end, E1; no header, missing values written as ``nan``).
    """
    c = cooler.Cooler(cool)

    # The track is read without a header row and is passed to cooltools as the
    # bin table itself.
    # NOTE(review): this presumably requires the track to be binned exactly
    # like the cooler (same bins, same order) - confirm against callers.
    track = pd.read_csv(track_file,
                        sep="\t",
                        names=["chrom", "start", "end", "percentage"])

    # Compute 3 eigenvectors, phased and ranked by their Pearson correlation
    # with the signal column of the input track.
    cis_vals, cis_eigs = eigdecomp.cooler_cis_eig(
        c,
        bins=track,
        n_eigs=3,
        phasing_track_col=track.columns[3],
        sort_metric="pearsonr",
    )

    # Keep only the first eigenvector for the compartment track.
    bg = cis_eigs.loc[:, ["chrom", "start", "end", "E1"]]
    bg.to_csv(out_bedgraph, sep="\t", header=None, index=False, na_rep="nan")
Esempio n. 4
0
def batch_process(filepaths, genome, resolution, balance_col='weight', do_trans=False, savepath=None, bigwig=False):
    """
    Run the cis (and optionally trans) eigendecomposition on a batch of coolers.

    Parameters
    ----------
    filepaths : str or list of str
        Cooler path/URI or list of them. A single string is treated as a
        one-element list.
    genome : str
        Genome name passed to ``DNA_info.get_chromosome_arms`` and
        ``DNA_info.gene_content`` (and to ``save_bigwig`` when ``bigwig`` is
        set).
    resolution : int
        Bin size passed to ``drop_unbalanced`` and ``DNA_info.gene_content``.
    balance_col : str, optional
        Passed as ``balance`` to the cooltools eigendecomposition calls.
    do_trans : bool, optional
        Also run ``cooler_trans_eig`` and merge its eigenvalues/eigenvectors
        into the cis results (columns get ``_cis`` / ``_trans`` suffixes).
    savepath : str, optional
        Output prefix. It is concatenated directly with file names, so it
        should end with a path separator. When given, each result is stored
        as ``<savepath><name>.hdf5`` and one line per file is appended to
        ``<savepath>discrepencies.txt``.
    bigwig : bool, optional
        When saving, also call ``save_bigwig`` with the prefix
        ``<savepath>bigwigs/<name>``.

    Returns
    -------
    lam_list, vector_list, discrep_list : list
        Eigenvalue tables, eigenvector tables and the results of
        ``find_lam_discrepencies``, one entry per processed file.

    Raises
    ------
    AssertionError
        If ``filepaths`` is empty.
    """
    if isinstance(filepaths, str):
        filepaths = [filepaths]
    assert len(filepaths) != 0, 'Empty file list.'

    # NOTE(review): presumably filters out coolers lacking balancing weights
    # at this resolution - confirm in drop_unbalanced.
    filepaths = drop_unbalanced(filepaths, resolution)
    n = len(filepaths)

    cis_regions = DNA_info.get_chromosome_arms(genome)
    # gc=True: the 'frac_gc' column of this table is the phasing track.
    genes = DNA_info.gene_content(genome, resolution, gc=True)

    lam_list = []
    vector_list = []
    discrep_list = []
    for i, filepath in enumerate(filepaths):
        print(str(i+1)+" of "+str(n)+": "+filepath)
        cool = cooler.Cooler(filepath)

        lams, vectors = eigdecomp.cooler_cis_eig(cool, genes, regions=cis_regions, balance=balance_col,
                                                 phasing_track_col='frac_gc')
        print('Decomposition in cis is done')

        if do_trans:
            # Bin offsets of the first 23 chromosomes. Here we ignore chrX,
            # chrY, chrM (presumably the 23 offsets act as boundaries
            # delimiting 22 autosomes - verify against cooltools).
            trans_partition = np.r_[[cool.offset(chrom) for chrom in cool.chromnames[0:23]]]
            trans_lam, trans_vecs = eigdecomp.cooler_trans_eig(cool, genes, partition=trans_partition, balance=balance_col,
                                                               phasing_track_col='frac_gc')
            print('Decomposition in trans is done')

            # Index the trans eigenvalues by a 'region' label so they can be
            # stacked under the cis eigenvalue table.
            trans_lam['region'] = 'trans'
            trans_lam.set_index('region', drop=True, inplace=True)
            lams = pd.concat([lams, trans_lam])

            trans_vecs = trans_vecs[['chrom', 'start', 'end', 'E1', 'E2', 'E3']]
            vectors = vectors.merge(trans_vecs, on=['chrom', 'start', 'end'], how='outer', suffixes=['_cis', '_trans'])

        discrep = find_lam_discrepencies(lams)

        lam_list.append(lams)
        vector_list.append(vectors)
        discrep_list.append(discrep)

        if savepath is not None:
            # For an mcool URI keep only the part before the first ':';
            # plain paths are used as they are. Always derived from the
            # current file, so a plain .cool path neither fails nor reuses
            # the name of the previous file.
            filename = filepath.split(':')[0] if '.mcool' in filepath else filepath
            # Base name with its last two dot-separated components removed
            # (e.g. 'sample.1000.cool' -> 'sample').
            filename = '.'.join(filename.split('/')[-1].split('.')[0:-2])
            save = savepath+filename+'.hdf5'
            create_dir(save)
            # Context manager closes the store even if a put() fails.
            with pd.HDFStore(save) as store:
                store.put('vectors', vectors, format='table', data_columns=True)
                store.put('lams', lams, format='table', data_columns=True)

            # Append mode creates the file when it does not exist yet.
            with open(savepath+'discrepencies.txt', 'a') as f:
                f.write(f"{filename}\t{', '.join(discrep)}\n")

            print('Saved to: '+save, '\n')
            if bigwig:
                wig_save = savepath+'bigwigs/{}'.format(filename)
                save_bigwig(vectors, wig_save, genome)

        print('Eigendecomposition for '+filepath+': DONE!\n')

    return lam_list, vector_list, discrep_list
# Getting parameters for cooler_cis_eig.
# NOTE(review): cool_path, genome, out_prefix and res are not defined in this
# fragment; they are expected to be set earlier in the script.
cool = cooler.Cooler(cool_path)
# Bin size stored in the cooler's metadata.
resolution = cool.info['bin-size']
# gc=False with phasing_track_col='gene_count' below: the phasing track is the
# 'gene_count' column of this table.
genes = gene_content(genome, resolution, gc=False, fasta_path=None)

# Genomic regions used for each contact type:
#   'cis'   - chromosome arms, with chrX, chrY and chrM excluded
#   'trans' - whole chromosomes (start 0 to chromosome size) for the first 22
#             chromosome names of the cooler
supports = {'cis': get_chromosome_arms(genome, exclude=['chrX','chrY','chrM']),
#             lams.dropna().index.map(lambda x: (x[0:x.find(':')], 
#                                                   x[x.find(':')+1:x.find('-')],
#                                                   x[x.find('-')+1:])).unique().values,
            'trans': [(chrom, 0, cool.chromsizes[chrom]) 
                               for chrom in cool.chromnames[0:22]]}
#supports

# Computing eigenvalues and eigenvectors (cis regions only)
lams, vectors = eigdecomp.cooler_cis_eig(cool, genes, regions=supports['cis'], 
                                                   phasing_track_col='gene_count', 
                                                   sort_metric='spearmanr')
# cooler_cis_eig sorts eigenvectors by decreasing Spearman correlation with the phasing track 
# (gene count in this case). In the past, people have used the eigenvector associated with the max 
# eigenvalue. It is worth considering situations where these two sorting processes differ.

# Output: eigenvalues and eigenvectors as tab-separated text, plus E1 as a bigWig.
# File names are built by string concatenation, so `res` must be a string
# (NOTE(review): it is distinct from `resolution` above - confirm where it is set).
contact_type='cis'
lams.to_csv(out_prefix + '.' + contact_type +'.'+  res+ '.lam.txt', sep='\t', index=False)
vectors.to_csv(out_prefix + '.' + contact_type +'.'+ res+'.vecs.tsv', sep='\t', index=False)
bioframe.to_bigwig(vectors,cool.chromsizes,out_prefix +'.' + contact_type + '.'+ res+ '.bw',value_field='E1')


exp = {}
#Cis Expected
with mp.Pool(10) as p:
import bioframe
import cooltools
import cooler
from cooltools.eigdecomp import cooler_cis_eig

# mm10 chromosome sizes. Fetched once; `chromsizes` is an independent copy so
# both names stay available without issuing the same request a second time.
mm10 = bioframe.fetch_chromsizes('mm10')
chromsizes = mm10.copy()
chromosomes = list(chromsizes.index)

# Fixed-size 10 kb bin table covering the mm10 chromosomes, annotated with the
# GC fraction of each bin computed from the local FASTA file. The 'GC' column
# is the phasing track for the eigendecomposition.
binsize = 10000
bins = cooler.binnify(mm10, binsize)
fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa')
bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records)
bins.head()

import fnmatch
import os

# Run the cis eigendecomposition on every 10 kb cooler in the working directory.
for file in os.listdir('.'):
    if not fnmatch.fnmatch(file, '*_10kb.cool'):
        continue

    # Condition label: the file name up to its first dot.
    cond = file.partition('.')[0]
    clr = cooler.Cooler(file)

    # Three eigenvectors, phased with the GC track and sorted with the
    # 'var_explained' metric.
    lam, eigs = cooler_cis_eig(
        clr,
        bins,
        n_eigs=3,
        phasing_track_col='GC',
        sort_metric='var_explained',
    )

    # Save text files: eigenvalues keep their index, the eigenvector table is
    # written without it.
    lam.to_csv(f'./{cond}.eigs.cis.lam.txt', sep='\t')
    eigs.to_csv(f'./{cond}.eigs.cis.vecs.txt', sep='\t', index=False)