コード例 #1
0
def fixGTF(FA_FILE, GTF_FILE, inplace=True, silent=1):
    """Rename the first column of a GTF file to the matching FASTA names.

    A backup ``<GTF_FILE>.bak`` is made with ``cp -n`` and all later reads
    go through that backup.  Every distinct first-column value of the backup
    is paired with the one FASTA name (first whitespace-delimited token of a
    definition line) that shares the longest common substring with it; that
    substring has to be as long as the GTF value itself.

    Args:
        FA_FILE: FASTA file passed to ``fasta__getDefLine``.
        GTF_FILE: GTF file to rewrite.
        inplace: if true, write over ``GTF_FILE`` and return it; otherwise
            write to ``pyext.sys.stdout`` and return ``None``.
        silent: forwarded to ``pyext.shellexec``.

    Raises:
        AssertionError: if a GTF value ties between several FASTA names, or
            if no FASTA name contains the whole GTF value.
    """
    pyext.shellexec('cp -n %s{,.bak}' % GTF_FILE, silent=silent)

    fa_names = [defline.split()[0] for defline in fasta__getDefLine(FA_FILE)]
    first_column = pyext.read__buffer(
        pyext.shellexec("cut -d'\t' -f1 %s.bak" % GTF_FILE, silent=silent),
        ext='it')
    gtf_keys = [key.strip() for key in set(first_column)]

    mapper = pyext.collections.OrderedDict()
    for gtf_key in gtf_keys:
        sizes = [
            pyext.str__longestCommonSubstring(name, string2=gtf_key).size
            for name in fa_names
        ]
        best = np.max(sizes)
        # Comparing the list against the numpy scalar is expected to give an
        # element-wise boolean array, which sum() and argmax() rely on.
        is_best = sizes == best
        assert sum(is_best) == 1,\
        'multiple matches found for GTF key:%s in %s'% (gtf_key , fa_names)
        assert best==len(gtf_key),\
        'no match found for GTF key:%s' %gtf_key
        mapper[gtf_key] = fa_names[np.argmax(is_best)]

    ofname = GTF_FILE if inplace else pyext.sys.stdout

    def rename_first_field(line):
        # A first field absent from the mapping becomes None here.
        fields = line.split(u'\t')
        fields[0] = mapper.get(fields[0])
        return u'\t'.join(fields)

    backup_lines = pyext.readData(GTF_FILE + '.bak', ext='it')
    renamed_lines = (rename_first_field(line) for line in backup_lines)
    pyext.iter__toFile(it=renamed_lines, fname=ofname)

    if inplace:
        return ofname
コード例 #2
0
def BUFFER_RNASEQ():
    """Collect one record per non-NA row of each listed figure table.

    Returns:
        A list of ``(index value, table file name, date, position)`` tuples,
        where ``position`` is the 1-based row number of the entry inside its
        table after ``dropna()``.
    """
    figure_tables = [
        ("/home/feng/envs/0726-polyq/src/BOXPLOT_1_OX-lines.tsv", '20190822'),
        ("/home/feng/envs/0726-polyq/src/BOXPLOT2_mutants.tsv", '20190822'),
        ("/home/feng/envs/0726-polyq/src/BOXPLOT3_Qlines.tsv", '20190822'),
        ("/home/feng/envs/0726-polyq/src/FIG-S5_HEATMAP2_genotypes.tsv",
         "20190906"),
        ("/home/feng/envs/0726-polyq/src/FIG-S6_HEATMAP1_Q-lines.tsv",
         "20190906"),
        ("/home/feng/envs/0726-polyq/src/FIG-S7_HEATMAP3.tsv",
         "20190906"),
    ]
    buf = []
    for FNAME, DATE in figure_tables:
        figure_name = pyext.os.path.basename(FNAME)
        table = pyext.readData(FNAME).dropna()
        # reset_index() yields a fresh 0-based index; +1 makes it 1-based.
        positions = table.reset_index().index + 1
        buf.extend(
            zip(
                table.index,
                itertools.repeat(figure_name),
                itertools.repeat(DATE),
                positions,
            ))
    return buf
コード例 #3
0
def fasta__getDefLine(FA_FILE, baseFile=0):
    """Return the definition lines of a FASTA file.

    Args:
        FA_FILE: file handed to ``pyext.readData`` with ``ext='it'``.
        baseFile: forwarded unchanged to ``pyext.readData``.

    Returns:
        A list with one entry per line starting with ``'>'``; the leading
        ``'>'`` is removed and surrounding whitespace is stripped.
    """
    lines = pyext.readData(FA_FILE, baseFile=baseFile, ext='it')
    return [line[1:].strip() for line in lines if line.startswith('>')]
コード例 #4
0
 def rnaseq(self):
     """Return a transformed copy of the table read from ``self.input_rnaseq``.

     The loaded table is copied and every value of the copy is replaced by
     the result of applying ``pyext.log2p1`` (presumably log2(x + 1);
     confirm against pymisca).
     """
     raw_table = pyext.readData(self.input_rnaseq)
     transformed = raw_table.copy()
     # Assign through .loc[:] so the copy keeps its index and columns.
     transformed.loc[:] = transformed.apply(pyext.log2p1)
     return transformed
コード例 #5
0
def from__file(FNAME, job=job):
    """Return a copy of ``job`` populated with the ChIP metadata table.

    Args:
        FNAME: accepted but not used by the body; the metadata is always
            read from ``/home/feng/meta/meta_chip.tsv``.
        job: mapping to copy; defaults to the module-level ``job`` as bound
            when this function was defined.

    Returns:
        The copied job with ``'SOURCE_DF'`` set to the metadata table, whose
        index is duplicated into a ``'DATA_ACC'`` column.
    """
    new_job = job.copy()
    meta = pyext.readData('/home/feng/meta/meta_chip.tsv')
    meta['DATA_ACC'] = meta.index
    new_job['SOURCE_DF'] = meta
    # Bare lookup kept on purpose: presumably it triggers lazy evaluation of
    # the 'init' entry (confirm against the job's mapping type).
    new_job['init']
    new_job['get__chipseq__bwfile']('192CS17')
    return new_job
コード例 #6
0
def sample_get_depth(sample,
                     OUTPUT_FILE,
                     FORCE=0):
    """Compute (or reuse) read-count statistics for one sample.

    Args:
        sample: mapping providing ``'data_acc'``, ``'rawfile_files_orig'``
            (first element is used) and ``'file_bam_orig'``.
        OUTPUT_FILE: JSON file that caches the statistics.
        FORCE: when false and ``OUTPUT_FILE`` is non-empty, the cached file
            is read back without recomputing.

    Returns:
        Whatever ``pyext.readData(OUTPUT_FILE)`` yields for the JSON file.
    """
    if not FORCE and pyext.file__notEmpty(OUTPUT_FILE):
        return pyext.readData(OUTPUT_FILE)

    depth = pyext._DICT_CLASS()
    depth['DATA_ACC'] = sample['data_acc']

    # Line count of the decompressed raw file divided by 4
    # (presumably FASTQ, four lines per read; confirm).
    raw_cmd = ' '.join(
        ["gzip", "-d<", sample['rawfile_files_orig'][0], '|wc', '-l'])
    depth['READ_COUNT_RAW'] = int(pyext.shellexec(raw_cmd).strip()) // 4

    # Count alignments, filtering on SAM flags 0x4 and 0x100.
    bam_cmd = ' '.join(
        ["cat", sample["file_bam_orig"],
         "|samtools", "view", "-F0x4", "-F0x100", "-c"])
    depth['READ_COUNT_UNIQ_MAPPED'] = int(pyext.shellexec(bam_cmd).strip())

    with open(OUTPUT_FILE, 'w') as f:
        pyext.json.dump(depth, f, indent=4)
    return pyext.readData(OUTPUT_FILE)
コード例 #7
0
ファイル: getCPM4STAR.py プロジェクト: shouldsee/synoBio
def main(fname, ofname=None):
    """Write a raw-count / CPM table for the best column of a count file.

    The input is read with the columns ``gene_id``, ``unstranded``,
    ``forward`` and ``reverse``.  The column whose first four rows have the
    smallest sum is selected (presumably those rows are the STAR ``N_*``
    summary rows; confirm), and its values are written together with counts
    per million.

    Args:
        fname: input count table.
        ofname: output TSV path; defaults to ``<basename of fname>.count``
            in the current directory.

    Returns:
        The path of the TSV that was written.

    Side effects:
        Records the selected column as ``STAR_BESTCOL`` in a ``FILE.json``
        located next to ``ofname`` via ``pyext.util__fileDict.main``.
    """
    if ofname is None:
        ofname = os.path.basename(fname) + '.count'

    dfc = pyext.readData(
        fname, columns=['gene_id', 'unstranded', 'forward', 'reverse'])
    BEST = dfc.head(4).sum(axis=0).sort_values().index[0]

    dfcc = dfc[BEST].to_frame('rawCount')
    dfcc['CPM'] = dfcc['rawCount'] / dfcc['rawCount'].sum() * 10**6
    dfcc.index.name = 'gene_id'
    dfcc.to_csv(ofname, sep='\t')

    # os.path.join keeps the path relative when ofname has no directory part;
    # '%s/FILE.json' % '' would have pointed at '/FILE.json' in the root.
    file_json = os.path.join(os.path.dirname(ofname), 'FILE.json')
    pyext.util__fileDict.main(ofname=file_json,
                              argD=dict(STAR_BESTCOL=BEST))
    return ofname
コード例 #8
0
def BUFFER_CHIPSEQ():
    '''
    Build the list of ChIP-seq records used for the figures.

    The hard-coded entries are ``(accession, figure, date)`` triples; the
    entries read from the figure-meta table carry a fourth element, the
    value of the ``figS4E_0905`` column for that row.

    See: 0726-polyq/src/make_chipseq_pileups.py
    '''
    figure_panels = [
        ("fig-2c", "20190822", ["192CS17", "192CS18"]),
        ("fig-2d", "20190822", ["192CS1", "192CS2", "192CS3", "192CS4"]),
        ("fig-2e", "20190905", ["189CS10", "189CS11", "189CS16", "189CS17"]),
    ]
    buf = [
        (accession, figure, date)
        for figure, date, accessions in figure_panels
        for accession in accessions
    ]

    figureName = 'figS4E_0905'
    FNAME = INPUTDIR() / 'src/0726-figure-meta.tsv'
    # Keep only rows that have a value in the figure's column.
    df = pyext.readData(FNAME)[[figureName]].dropna()
    df.columns = ['POSITION_IN_GRAPH']
    buf.extend(
        zip(
            df.index,
            itertools.repeat(figureName),
            itertools.repeat('20190905'),
            df['POSITION_IN_GRAPH'],
        ))
    return buf
コード例 #9
0
ファイル: loadRNA_Ath.py プロジェクト: shouldsee/thermoPIF7
'''
Loading datasets for PIF7
'''

from path import Path
import synotil.CountMatrix as scount
import pymisca.util as pyutil
import pymisca.ext as pyext
import pandas as pd

with Path(__file__).realpath().dirname().dirname() as d:
    # The relative 'deps/...' paths are read while this context is active
    # (presumably path.Path switches the working directory to the parent of
    # this file's directory; confirm against the `path` package).

    # Metadata: drop rows whose index is NA, then fill remaining gaps.
    meta = pyext.readData('deps/meta_rna.csv')
    meta = meta.loc[~meta.index.isna()].fillna('NA')

    keyDF = pyext.readData('deps/key_ath.csv')
    rnaseq = pyext.readData('deps/rnaseq_log2p1.pk')

    assert 'Age' in meta

    # Earlier revisions read /home/feng/meta/meta_rna.tsv and
    # /home/feng/envs/Fig_PIF/1031__rnaseq__log2p1.pk directly.
コード例 #10
0
                "figsize":[5,5]},

            ),{},),



   plotters.venn_diagram(dict(
    OFNAME = "venn-diagram-1.svg",
    comment = '''
    This plot shows overlap between 
        rna-based pif7-dependent genes
        and 
        rna-based temperature-dependent genes.        
    The selection criteria can be found in "src/0224_venn_pif7.py"
    ''',
    index1=pyext.readData(DIR/'0224_venn_pif7.py.result/Venn-index.csv')['ind1'],
    index2=pyext.readData(DIR/'0224_venn_pif7.py.result/Venn-index.csv')['ind2'],
    axis={
         "xlabel":"tempResponsive",
         "ylabel":"PIF7_Dependent",
    },
    ),{}),


   # plotters.venn_diagram(dict(
   #  OFNAME = "venn-diagram-2.svg",
   #  comment = '''
   #  This plot shows overlap between 
   #      rna-based pif7-responsive genes 
   #      and 
   #      chipseq-based PIF7-bound genes
コード例 #11
0
 def markers_df(self):
     """Return the rows of the marker table whose ``BioName`` is a marker.

     The table is read from ``self.input_markers_df``; the marker list is
     currently the single name ``'LUX'``.
     """
     markers = ['LUX']
     table = pyext.readData(self.input_markers_df)
     return table.query('BioName in %s'%markers)
コード例 #12
0
 def _func(self, key, datasets_meta, rnaseq):
     """Reindex the columns of ``rnaseq`` to the non-NA rows of ``FNAME``.

     ``FNAME`` comes from the enclosing scope; ``key`` and
     ``datasets_meta`` are accepted but not used by the body.
     """
     wanted_columns = pyext.readData(FNAME).dropna().index
     return rnaseq.reindex(columns=wanted_columns)
コード例 #13
0
def _readData(fn,**kw):
    """Read ``fn`` relative to this module's directory with UTF-8 encoding.

    Extra keyword arguments are forwarded to ``pyext.readData``.
    """
    module_dir = Path(__file__).dirname()
    return pyext.readData(module_dir / fn, encoding='utf8', **kw)
コード例 #14
0
ファイル: 0224_export.py プロジェクト: shouldsee/thermoPIF7
    u'185RS10', u'185RS13', u'185RS16', u'185RS19', u'185RS23', u'185RS25',
    u'185RS27', u'185RS28'
]

# Manifest of the exported files.  This bare dict literal is evaluated and
# discarded; it only documents what each file under deps/ contains.
{
    'header_import.py': 'import packages',
    'deps/key_ath.csv': 'information of key genes of interest',
    'deps/meta_rna.csv': 'meta information for the concerning RNA experiemnt',
    'deps/rnaseq_log2p1.pk':
    'RNASEQ count table /home/feng/envs/Fig_PIF/1031__rnaseq__log2p1.pk',
    'deps/Templates.listImages.html': 'HTML template',
}

# Metadata: drop rows with an NA index, keep only the selected datasets.
OF = 'deps/meta_rna.csv'
IF = '/home/feng/meta/meta_rna.tsv'
df = pyext.readData(IF, guess_index=1)
df = df.loc[~df.index.isna()]
df.reindex(datasets_index).to_csv(OF)
print(OF)

# Key-gene table: copied verbatim.  The source is opened first and under
# `with`, so its handle is always closed and a missing source no longer
# leaves a truncated destination behind.
OF = 'deps/key_ath.csv'
IF = '/home/feng/meta/key_ath.csv'
with open(IF, 'r') as src, open(OF, 'w') as f:
    f.write(src.read())
print(OF)

# Count table: binary copy that also preserves file metadata.
OF = 'deps/rnaseq_log2p1.pk'
IF = '/home/feng/envs/Fig_PIF/1031__rnaseq__log2p1.pk'
shutil.copy2(IF, OF)
print(OF)
コード例 #15
0
import pymisca.ext as pyext
import src.rnaseq_figure
import src.util as _util
# Gather the gene lists under their column names, in output order.
gene_lists = pyext.OrderedDict([
    ("signature_targets", src.rnaseq_figure.job['signature_targets']),
    ("chipseq_targets_genes",
     pyext.readData("OUTPUT/chipseq_targets_genes_job.peak_list.csv")
     ['feat_acc'].unique()),
])
# orient='index' builds one row per list; transposing turns each list into
# a column of the exported table.
df = pyext.pd.DataFrame.from_dict(gene_lists, orient='index').T
df.to_csv(_util._get_output_file("OUTPUT/gene_lists_dataframe.csv"), index=0)
コード例 #16
0
 def visualise_datasets(self):
     """Return ``self.rnaseq`` with columns reindexed to the listed datasets.

     The dataset list is the index of the table read from ``self.FNAME``
     after rows containing NA have been dropped.
     """
     selected = pyext.readData(self.FNAME).dropna().index
     return self.rnaseq.reindex(columns=selected)
コード例 #17
0
def _func(job, key, chipseq_targets_genes_job):
    """Return the contents of ``OUT.it`` in the job's ``LAST_DIR`` as a list.

    ``job`` and ``key`` are accepted but not used by the body.
    """
    out_file = chipseq_targets_genes_job['LAST_DIR'] / "OUT.it"
    return list(pyext.readData(out_file))
コード例 #18
0
import pymisca.ext as pyext
# Short aliases for the pandas / numpy modules exposed by pyext.
pd = pyext.pd
np = pyext.np
import os
# Directory that contains this source file.
SRCDIR = os.path.dirname(__file__)

from lazydict import LazyDictionary
# All three names are bound to the same LazyDictionary instance.
rnaseq_figure = template = job = LazyDictionary()

# with pyext.getPathStack([]) as stack:
#     ! ls -lhtr *.pk

from util import _get_file
###############################
#### Data loading #############
# RNA-seq table: the loaded table stays available as `rnaseq_raw`, while
# `rnaseq` is a copy whose values went through pyext.log2p1.
rnaseq_raw = pyext.readData(
    _get_file('/home/feng/envs/Fig_POLYQ/rnaseq.pk'))
rnaseq = rnaseq_raw.copy()
rnaseq.loc[:] = rnaseq.apply(pyext.log2p1)
job['rnaseq'] = rnaseq

# Dataset metadata: upper-case column names plus a display name per row.
mcurr0 = pyext.readData(
    _get_file('/home/feng/static/results/0318-makeRNA-polyQ/mcurr0.csv'))
mcurr0.columns = mcurr0.columns.str.upper()
mcurr0['DISP_NAME'] = pyext.df__format(mcurr0,
                                       '{TEMP}-{ZTIME}-{GTYPE}-{index}')
job['datasets_meta'] = mcurr0

# Key-gene table shipped next to this source file, and the marker names.
keyDF = pyext.readData(pyext.f('{SRCDIR}/key_ath.csv'))
markers = ['LUX']
コード例 #19
0

def WORKDIR():
    """Return the resolved working directory ``$PWD/get_meta_soft.work``.

    The path is expanded and resolved on every call, so it follows the
    value of ``$PWD`` at call time.

    Locations used by earlier revisions and no longer in effect:
    ``$HOME/envs/0830-polyq/WORKDIR`` and
    ``$HOME/envs/0726-polyq/WORKDIR.submit/``.
    """
    return pyext.path.Path('$PWD/get_meta_soft.work').expand().realpath()


#### Overwrite file-reading stream
from util import _get_file
def _readData(x, **kw):
    """Read ``x`` with ``pyext.readData`` after resolving it via ``_get_file``.

    Keyword arguments are forwarded unchanged to ``pyext.readData``.
    """
    return pyext.readData(_get_file(x), **kw)
# def DATA_ACC_RNASEQ():
#     lst =[
#         _readData("/home/feng/envs/0726-polyq/src/BOXPLOT_1_OX-lines.tsv").dropna().index.tolist(),
#         _readData("/home/feng/envs/0726-polyq/src/BOXPLOT2_mutants.tsv").dropna().index.tolist(),
#         _readData("/home/feng/envs/0726-polyq/src/BOXPLOT3_Qlines.tsv").dropna().index.tolist(),
#     ]
#     lst = pyext.stringList__flatten(lst)
#     lst = list(set(lst))
#     return lst

# def ACC2BNAME():
#     return

# Module-level metadata table, loaded when this module is executed;
# guess_index=0 is forwarded unchanged to pyext.readData by _readData.
MCURR0 = _readData('upGeo/0312-meta-copy/meta_rna.tsv', guess_index=0)
コード例 #20
0
def sample_template_find_curated(sample):
    """Load the curated SOFT text of a sample and store it on the sample.

    The text is read from ``<WORKDIR()>/get_soft_text/<data_acc>.soft.txt``,
    concatenated into one string, saved under
    ``sample['template_curated']`` and returned.
    """
    # '/' binds tighter than '+', so the suffix is appended to the full path.
    soft_path = WORKDIR() / 'get_soft_text' / sample['data_acc'] + '.soft.txt'
    text = u''.join(pyext.readData(soft_path, 'it'))
    sample['template_curated'] = text
    return text
コード例 #21
0
    'OUTPUT/chipseq_differential_binding.peak_list.bed')
# Register the '.summit' companion of the BED output and resolve the two
# CSV output paths.
outputs.append(OUTPUT_BED_FILE + '.summit')
OUTPUT_CSV_FILE = _get_output_file(
    "OUTPUT/chipseq_differential_binding.peak_list.csv")
OUTPUT_CSV_GENE_FILE = _get_output_file(
    "OUTPUT/chipseq_targets_genes_job.peak_list.csv")

# '--print-outputs': list the declared outputs and exit successfully.
if '--print-outputs' in sys.argv:
    for x in outputs:
        print(x)
    sys.exit(0)

# Without '--run' nothing is computed and the script exits with status 1.
if '--run' not in sys.argv:
    sys.exit(1)

# Load the peak table with the column names in pyext.columns.bed, plot a
# histogram of log2(FC), and keep peaks with neglogPval above 3.
df = pyext.readData(npkFile, 'tsv', header=None, columns=pyext.columns.bed)
df['FC'].apply(pyext.np.log2).hist(bins=30)
sel = df['neglogPval'] > 3.
print(pyext.np.sum(sel))
# NOTE(review): `df` is rebound to the return value of to_csv, which pandas
# documents as None when a path is given; confirm nothing later expects the
# table in `df`.
df = df.loc[sel].to_csv('temp.bed', sep='\t', header=None)

# NOTE(review): bare expression with no effect beyond requiring that
# `bwFiles` is defined at this point.
[bwFiles]
# Extract bigwig signal around the peaks written to temp.bed.
res = synotil.dio.extract_bigwig_multiple(bedFile='temp.bed',
                                          outIndex=DATA_ACC_LIST,
                                          bwFiles=bwFiles,
                                          radius=300,
                                          stepSize=10)

# Aggregate with colGroupMean, apply pyext.log2p1, then take the difference
# between the 189CS10 and 189CS11 columns.
tab = colGroupMean(res)
tab = tab.apply(pyext.log2p1)
xs = (tab['189CS10']) - (tab['189CS11'])
コード例 #22
0
 def datasets_meta(self):
     """Return the dataset metadata table with a ``DISP_NAME`` column added.

     The table is read from ``self.input_datasets_meta``, its column names
     are upper-cased, and ``DISP_NAME`` is filled by ``pyext.df__format``
     from the template ``'{TEMP}-{ZTIME}-{GTYPE}-{index}'``.
     """
     meta = pyext.readData(self.input_datasets_meta)
     meta.columns = meta.columns.str.upper()
     display_template = '{TEMP}-{ZTIME}-{GTYPE}-{index}'
     meta['DISP_NAME'] = pyext.df__format(meta, display_template)
     return meta