def fixGTF(FA_FILE, GTF_FILE, inplace=True, silent=1):
    """Rename the first column of a GTF file to the matching FASTA names.

    Steps, in order:

    1. ``GTF_FILE`` is copied to ``GTF_FILE.bak`` with ``cp -n`` (the command
       relies on shell brace expansion; an existing backup is not overwritten).
    2. The FASTA names are the first whitespace-delimited token of every
       definition line returned by ``fasta__getDefLine(FA_FILE)``.
    3. Every distinct value in column 1 of the backup is paired with the
       FASTA name that contains it as a substring.
    4. The backup is streamed line by line with its first tab-separated
       field replaced, and written to ``GTF_FILE`` (``inplace=True``) or
       to ``pyext.sys.stdout`` (``inplace=False``).

    :param FA_FILE: FASTA file providing the target sequence names.
    :param GTF_FILE: GTF file to rewrite; ``GTF_FILE + '.bak'`` is the input.
    :param inplace: write back to ``GTF_FILE`` when true, else to stdout.
    :param silent: forwarded to ``pyext.shellexec``.
    :return: ``GTF_FILE`` when ``inplace`` is true, otherwise ``None``.
    :raises AssertionError: if a key does not match exactly one FASTA name
        over its full length.
    """
    pyext.shellexec('cp -n %s{,.bak}' % GTF_FILE, silent=silent)

    seq_names = [defline.split()[0] for defline in fasta__getDefLine(FA_FILE)]

    first_column = pyext.shellexec(
        "cut -d'\t' -f1 %s.bak" % GTF_FILE, silent=silent)
    distinct_lines = set(pyext.read__buffer(first_column, ext='it'))
    gtf_keys = [raw.strip() for raw in distinct_lines]

    mapper = pyext.collections.OrderedDict()
    for gtf_key in gtf_keys:
        find_lcs = pyext.functools.partial(
            pyext.str__longestCommonSubstring, string2=gtf_key)
        sizes = [match.size for match in map(find_lcs, seq_names)]
        best = np.max(sizes)
        # list == numpy scalar falls back to numpy's comparison, which
        # yields an element-wise boolean array.
        is_best = sizes == best
        assert sum(is_best) == 1, \
            'multiple matches found for GTF key:%s in %s' % (gtf_key, seq_names)
        assert best == len(gtf_key), \
            'no match found for GTF key:%s' % gtf_key
        mapper[gtf_key] = seq_names[np.argmax(is_best)]

    ofname = GTF_FILE if inplace else pyext.sys.stdout

    def _rename_first_field(line):
        # Only the first tab-separated field is touched.
        fields = line.split(u'\t')
        fields[0] = mapper.get(fields[0])
        return u'\t'.join(fields)

    backup_lines = pyext.readData(GTF_FILE + '.bak', ext='it')
    renamed = (_rename_first_field(line) for line in backup_lines)
    pyext.iter__toFile(it=renamed, fname=ofname)

    if inplace:
        return ofname
def BUFFER_RNASEQ():
    """Collect ``(index, figure name, date, position)`` tuples for RNA-seq figures.

    Each listed table is read with ``pyext.readData`` and rows containing
    missing values are dropped. One tuple is emitted per remaining row:
    the row's index label, the table's base file name, the date string
    paired with the table, and the 1-based position of the row.

    :return: list of 4-tuples, in table order then row order.
    """
    figure_tables = [
        ("/home/feng/envs/0726-polyq/src/BOXPLOT_1_OX-lines.tsv", '20190822'),
        ("/home/feng/envs/0726-polyq/src/BOXPLOT2_mutants.tsv", '20190822'),
        ("/home/feng/envs/0726-polyq/src/BOXPLOT3_Qlines.tsv", '20190822'),
        ("/home/feng/envs/0726-polyq/src/FIG-S5_HEATMAP2_genotypes.tsv", "20190906"),
        ("/home/feng/envs/0726-polyq/src/FIG-S6_HEATMAP1_Q-lines.tsv", "20190906"),
        ("/home/feng/envs/0726-polyq/src/FIG-S7_HEATMAP3.tsv", "20190906"),
    ]

    buf = []
    for table_path, date in figure_tables:
        figure_name = pyext.os.path.basename(table_path)
        table = pyext.readData(table_path).dropna()
        # Position within the figure: 1-based row number after dropna().
        positions = table.reset_index().index + 1
        buf.extend(
            zip(
                table.index,
                itertools.repeat(figure_name),
                itertools.repeat(date),
                positions,
            ))
    return buf
def fasta__getDefLine(FA_FILE, baseFile=0):
    """Return the definition lines of a FASTA file without the leading ``>``.

    :param FA_FILE: FASTA file, opened as a line iterator by ``pyext.readData``.
    :param baseFile: forwarded unchanged to ``pyext.readData``.
    :return: list of stripped definition lines, in file order.
    """
    lines = pyext.readData(FA_FILE, baseFile=baseFile, ext='it')
    return [line[1:].strip() for line in lines if line.startswith('>')]
def rnaseq(self):
    """Return a transformed copy of the table stored at ``self.input_rnaseq``.

    The table is read with ``pyext.readData`` and copied, then every value
    of the copy is overwritten with the result of applying
    ``pyext.log2p1`` (presumably log2(x + 1) -- confirm in pymisca).
    The object returned by ``readData`` is not modified.
    """
    raw_table = pyext.readData(self.input_rnaseq)
    transformed = raw_table.copy()
    # Assigning through .loc[:] keeps the copy's index, columns and dtypes
    # container and only replaces the values.
    transformed.loc[:] = transformed.apply(pyext.log2p1)
    return transformed
def from__file(FNAME, job=job):
    """Return a copy of ``job`` with the ChIP-seq metadata table attached.

    NOTE(review): ``FNAME`` is accepted but never read; the metadata is
    always loaded from '/home/feng/meta/meta_chip.tsv' -- confirm whether
    that is intended.

    :param FNAME: unused.
    :param job: template job; it is copied, the argument is not mutated here.
    :return: the populated copy.
    """
    job = job.copy()

    chip_meta = pyext.readData('/home/feng/meta/meta_chip.tsv')
    chip_meta['DATA_ACC'] = chip_meta.index
    job['SOURCE_DF'] = chip_meta

    # Bare lookup kept on purpose: presumably it forces evaluation of a
    # lazily computed 'init' entry -- verify against the job's class.
    job['init']
    # Result is discarded; the call is made for whatever it triggers.
    job['get__chipseq__bwfile']('192CS17')
    return job
def sample_get_depth(sample, OUTPUT_FILE, FORCE=0):
    """Compute read counts for ``sample`` and cache them as JSON.

    The counts are recomputed when ``FORCE`` is true or when
    ``pyext.file__notEmpty(OUTPUT_FILE)`` is false; otherwise the existing
    file is reused. Either way the content of ``OUTPUT_FILE`` is returned
    via ``pyext.readData``.

    Written keys:
      * ``DATA_ACC``: ``sample['data_acc']``
      * ``READ_COUNT_RAW``: line count of the first gzip-compressed raw
        file, integer-divided by 4
      * ``READ_COUNT_UNIQ_MAPPED``: output of
        ``samtools view -F0x4 -F0x100 -c`` on ``sample['file_bam_orig']``

    :param sample: mapping with ``data_acc``, ``rawfile_files_orig`` and
        ``file_bam_orig`` entries.
    :param OUTPUT_FILE: path of the JSON cache file.
    :param FORCE: recompute even if the cache file is non-empty.
    """
    if FORCE or not pyext.file__notEmpty(OUTPUT_FILE):
        depth = pyext._DICT_CLASS()
        depth['DATA_ACC'] = sample['data_acc']

        count_lines_cmd = ' '.join(
            ["gzip", "-d<", sample['rawfile_files_orig'][0], '|wc', '-l'])
        line_count = pyext.shellexec(count_lines_cmd)
        depth['READ_COUNT_RAW'] = int(line_count.strip()) // 4

        count_mapped_cmd = ' '.join([
            "cat", sample["file_bam_orig"],
            "|samtools", "view", "-F0x4", "-F0x100", "-c",
        ])
        mapped_count = pyext.shellexec(count_mapped_cmd)
        depth['READ_COUNT_UNIQ_MAPPED'] = int(mapped_count.strip())

        with open(OUTPUT_FILE, 'w') as handle:
            pyext.json.dump(depth, handle, indent=4)

    return pyext.readData(OUTPUT_FILE)
def main(fname, ofname=None):
    """Extract one count column from a per-gene count table and add CPM.

    The table is read with the columns ``gene_id``, ``unstranded``,
    ``forward`` and ``reverse``. The column whose sum over the first four
    rows is smallest is selected (``BEST``), written as ``rawCount``
    together with a counts-per-million column ``CPM``, and recorded as
    ``STAR_BESTCOL`` in a ``FILE.json`` placed next to the output file.

    NOTE(review): the first four rows are used to choose the column but are
    still included in ``rawCount`` and in the CPM denominator -- confirm
    this is intended.

    :param fname: input count table.
    :param ofname: output TSV path; defaults to
        ``basename(fname) + '.count'`` in the current directory.
    :return: ``ofname``.
    """
    if ofname is None:
        ofname = os.path.basename(fname) + '.count'

    dfc = pyext.readData(
        fname, columns=['gene_id', 'unstranded', 'forward', 'reverse'])
    BEST = dfc.head(4).sum(axis=0).sort_values().index[0]

    dfcc = dfc[BEST].to_frame('rawCount')
    dfcc['CPM'] = dfcc['rawCount'] / dfcc['rawCount'].sum() * 10**6
    dfcc.index.name = 'gene_id'
    dfcc.to_csv(ofname, sep='\t')

    # os.path.join keeps the metadata file beside the output even when
    # ofname has no directory part; '%s/FILE.json' % '' would have pointed
    # at '/FILE.json' in the filesystem root.
    meta_file = os.path.join(os.path.dirname(ofname), 'FILE.json')
    pyext.util__fileDict.main(ofname=meta_file,
                              argD=dict(STAR_BESTCOL=BEST))
    return ofname
def BUFFER_CHIPSEQ():
    '''
    Collect ``(data accession, figure name, date[, position])`` tuples
    for the ChIP-seq figures.

    The first entries are a fixed list of 3-tuples. They are followed by
    one 4-tuple per non-missing row of the ``figS4E_0905`` column of
    ``src/0726-figure-meta.tsv`` under ``INPUTDIR()``; the fourth element
    is that column's value.

    See: 0726-polyq/src/make_chipseq_pileups.py
    '''
    buf = [
        ("192CS17", "fig-2c", "20190822"),
        ("192CS18", "fig-2c", "20190822"),
        ("192CS1", "fig-2d", "20190822"),
        ("192CS2", "fig-2d", "20190822"),
        ("192CS3", "fig-2d", "20190822"),
        ("192CS4", "fig-2d", "20190822"),
        ("189CS10", "fig-2e", "20190905"),
        ("189CS11", "fig-2e", "20190905"),
        ("189CS16", "fig-2e", "20190905"),
        ("189CS17", "fig-2e", "20190905"),
    ]

    # The metadata column name doubles as the figure name.
    figure_column = 'figS4E_0905'
    meta_path = INPUTDIR() / 'src/0726-figure-meta.tsv'
    placed = pyext.readData(meta_path)[[figure_column]].dropna()
    placed.columns = ['POSITION_IN_GRAPH']

    buf.extend(zip(
        placed.index,
        itertools.repeat(figure_column),
        itertools.repeat('20190905'),
        placed['POSITION_IN_GRAPH'],
    ))
    return buf
'''
Loading datasets for PIF7
'''
from path import Path
import synotil.CountMatrix as scount
import pymisca.util as pyutil
import pymisca.ext as pyext
import pandas as pd

# The relative 'deps/...' paths below are resolved while the directory two
# levels above this file is the working directory (path.Path used as a
# context manager).
with Path(__file__).realpath().dirname().dirname() as d:
    # Sample metadata: drop rows whose index label is missing, then fill
    # the remaining gaps with the literal string 'NA'.
    meta = pyext.readData('deps/meta_rna.csv')
    meta = meta.loc[~meta.index.isna()].fillna('NA')

    keyDF = pyext.readData('deps/key_ath.csv')
    rnaseq = pyext.readData('deps/rnaseq_log2p1.pk')

    # Sanity check on the metadata schema.
    assert 'Age' in meta

# Original (pre-'deps/') sources, kept for reference:
#   meta   <- /home/feng/meta/meta_rna.tsv
#   rnaseq <- /home/feng/envs/Fig_PIF/1031__rnaseq__log2p1.pk
"figsize":[5,5]}, ),{},), plotters.venn_diagram(dict( OFNAME = "venn-diagram-1.svg", comment = ''' This plot shows overlap between rna-based pif7-dependent genes and rna-based temperature-dependent genes. The selection criteria can be found in "src/0224_venn_pif7.py" ''', index1=pyext.readData(DIR/'0224_venn_pif7.py.result/Venn-index.csv')['ind1'], index2=pyext.readData(DIR/'0224_venn_pif7.py.result/Venn-index.csv')['ind2'], axis={ "xlabel":"tempResponsive", "ylabel":"PIF7_Dependent", }, ),{}), # plotters.venn_diagram(dict( # OFNAME = "venn-diagram-2.svg", # comment = ''' # This plot shows overlap between # rna-based pif7-responsive genes # and # chipseq-based PIF7-bound genes
def markers_df(self):
    """Return the rows of the key-gene table whose ``BioName`` is a marker.

    The table is read from ``self.input_markers_df``; the marker list is
    currently the single name ``'LUX'``.
    """
    key_table = pyext.readData(self.input_markers_df)
    markers = ['LUX']
    # The list is interpolated into the query text, e.g. "BioName in ['LUX']".
    selection = 'BioName in %s' % markers
    return key_table.query(selection)
def _func(self, key, datasets_meta, rnaseq):
    """Return ``rnaseq`` restricted and ordered by the datasets listed in ``FNAME``.

    ``FNAME`` is taken from the enclosing scope. Rows of that table with
    missing values are dropped and the remaining index labels become the
    columns of the result. ``key`` and ``datasets_meta`` are not used here.
    """
    wanted_columns = pyext.readData(FNAME).dropna().index
    return rnaseq.reindex(columns=wanted_columns)
def _readData(fn, **kw):
    """Read ``fn`` relative to this module's directory as UTF-8.

    Extra keyword arguments are forwarded to ``pyext.readData``.
    """
    module_dir = Path(__file__).dirname()
    return pyext.readData(module_dir / fn, encoding='utf8', **kw)
u'185RS10', u'185RS13', u'185RS16', u'185RS19', u'185RS23', u'185RS25', u'185RS27', u'185RS28' ] { 'header_import.py': 'import packages', 'deps/key_ath.csv': 'information of key genes of interest', 'deps/meta_rna.csv': 'meta information for the concerning RNA experiemnt', 'deps/rnaseq_log2p1.pk': 'RNASEQ count table /home/feng/envs/Fig_PIF/1031__rnaseq__log2p1.pk', 'deps/Templates.listImages.html': 'HTML template', } OF = 'deps/meta_rna.csv' IF = '/home/feng/meta/meta_rna.tsv' df = pyext.readData(IF, guess_index=1) df = df.loc[~df.index.isna()] df.reindex(datasets_index).to_csv(OF) print(OF) OF = 'deps/key_ath.csv' IF = '/home/feng/meta/key_ath.csv' with open(OF, 'w') as f: f.write(open(IF, 'r').read()) print(OF) OF = 'deps/rnaseq_log2p1.pk' IF = '/home/feng/envs/Fig_PIF/1031__rnaseq__log2p1.pk' shutil.copy2(IF, OF) print(OF)
import pymisca.ext as pyext
import src.rnaseq_figure
import src.util as _util

# Gather the named gene lists, one entry per list.
df = pyext.OrderedDict()
df["signature_targets"] = src.rnaseq_figure.job['signature_targets']

peak_table = pyext.readData("OUTPUT/chipseq_targets_genes_job.peak_list.csv")
df["chipseq_targets_genes"] = peak_table['feat_acc'].unique()

# orient='index' followed by a transpose puts one list per column and pads
# the shorter lists with missing values.
df = pyext.pd.DataFrame.from_dict(df, orient='index').T

output_csv = _util._get_output_file("OUTPUT/gene_lists_dataframe.csv")
df.to_csv(output_csv, index=0)
def visualise_datasets(self):
    """Return ``self.rnaseq`` with columns taken from the table at ``self.FNAME``.

    Rows of that table containing missing values are dropped; the index
    labels that remain select and order the columns of the result.
    """
    selected = pyext.readData(self.FNAME).dropna().index
    return self.rnaseq.reindex(columns=selected)
def _func(job, key, chipseq_targets_genes_job):
    """Return the content of ``OUT.it`` from the upstream job's ``LAST_DIR`` as a list.

    ``job`` and ``key`` are not used here.
    """
    out_file = chipseq_targets_genes_job['LAST_DIR'] / "OUT.it"
    return list(pyext.readData(out_file))
import os

import pymisca.ext as pyext
from lazydict import LazyDictionary

pd = pyext.pd
np = pyext.np
SRCDIR = os.path.dirname(__file__)

# One shared LazyDictionary, exposed under three names.
rnaseq_figure = template = job = LazyDictionary()

from util import _get_file

###############################
#### Data loading #############

# Expression table: keep the table as read and publish a transformed copy.
rnaseq_raw = pyext.readData(_get_file('/home/feng/envs/Fig_POLYQ/rnaseq.pk'))
rnaseq = rnaseq_raw.copy()
rnaseq.loc[:] = rnaseq.apply(pyext.log2p1)
job['rnaseq'] = rnaseq

# Dataset metadata: upper-case the column names and add a display name.
mcurr0 = pyext.readData(
    _get_file('/home/feng/static/results/0318-makeRNA-polyQ/mcurr0.csv'))
mcurr0.columns = mcurr0.columns.str.upper()
mcurr0['DISP_NAME'] = pyext.df__format(mcurr0,
                                       '{TEMP}-{ZTIME}-{GTYPE}-{index}')
job['datasets_meta'] = mcurr0

keyDF = pyext.readData(pyext.f('{SRCDIR}/key_ath.csv'))
markers = ['LUX']
def WORKDIR():
    """Return the resolved path of ``$PWD/get_meta_soft.work``.

    This is the only definition that was ever in effect: an earlier
    ``WORKDIR`` pointing at '$HOME/envs/0830-polyq/WORKDIR' was shadowed
    immediately, and a trailing return of
    '$HOME/envs/0726-polyq/WORKDIR.submit/' was unreachable.
    """
    return pyext.path.Path('$PWD/get_meta_soft.work').expand().realpath()


#### Overwrite file-reading stream
from util import _get_file


def _readData(x, **kw):
    """Read ``x`` with ``pyext.readData`` after mapping it through ``_get_file``.

    Keyword arguments are forwarded to ``pyext.readData`` unchanged.
    """
    return pyext.readData(_get_file(x), **kw)


MCURR0 = _readData('upGeo/0312-meta-copy/meta_rna.tsv', guess_index=0)
def sample_template_find_curated(sample):
    """Load the curated SOFT text for ``sample`` and store it on the sample.

    The text is read from
    ``WORKDIR()/get_soft_text/<data_acc>.soft.txt`` as an iterator of
    pieces, concatenated without a separator, saved under
    ``sample['template_curated']`` and returned.
    """
    # '/' binds tighter than '+', so the suffix is appended to the full path.
    soft_path = WORKDIR() / 'get_soft_text' / sample['data_acc'] + '.soft.txt'
    curated_text = u''.join(pyext.readData(soft_path, 'it'))
    sample['template_curated'] = curated_text
    return curated_text
'OUTPUT/chipseq_differential_binding.peak_list.bed') outputs.append(OUTPUT_BED_FILE + '.summit') OUTPUT_CSV_FILE = _get_output_file( "OUTPUT/chipseq_differential_binding.peak_list.csv") OUTPUT_CSV_GENE_FILE = _get_output_file( "OUTPUT/chipseq_targets_genes_job.peak_list.csv") if '--print-outputs' in sys.argv: for x in outputs: print(x) sys.exit(0) if '--run' not in sys.argv: sys.exit(1) df = pyext.readData(npkFile, 'tsv', header=None, columns=pyext.columns.bed) df['FC'].apply(pyext.np.log2).hist(bins=30) sel = df['neglogPval'] > 3. print(pyext.np.sum(sel)) df = df.loc[sel].to_csv('temp.bed', sep='\t', header=None) [bwFiles] res = synotil.dio.extract_bigwig_multiple(bedFile='temp.bed', outIndex=DATA_ACC_LIST, bwFiles=bwFiles, radius=300, stepSize=10) tab = colGroupMean(res) tab = tab.apply(pyext.log2p1) xs = (tab['189CS10']) - (tab['189CS11'])
def datasets_meta(self):
    """Return the dataset metadata table read from ``self.input_datasets_meta``.

    Column names are upper-cased and a ``DISP_NAME`` column is added,
    produced by ``pyext.df__format`` from the template
    '{TEMP}-{ZTIME}-{GTYPE}-{index}'.
    """
    meta_table = pyext.readData(self.input_datasets_meta)
    meta_table.columns = meta_table.columns.str.upper()
    display_names = pyext.df__format(meta_table,
                                     '{TEMP}-{ZTIME}-{GTYPE}-{index}')
    meta_table['DISP_NAME'] = display_names
    return meta_table