def sample_get_rawfile_detail(sample):
    """Validate a sample's raw fastq files and render their SOFT attributes.

    Loads ``validate_fastq.py`` as a module, runs its validation for
    ``sample['data_acc']`` and records on ``sample``:

    * ``rawfile_nodes``        -- the ``OUTPUT_NODES`` of the combined validation
    * ``rawfile_files_orig``   -- each node's ``OUTPUT_FILE``
    * ``rawfile_files``        -- the re-linked files, relative to ``WORKDIR()/ftp``
    * ``rawfile_checksums``    -- each node's ``FILE_MD5 -> MD5_HEX``
    * ``rawfile_readlengths``  -- hard-coded ``'75'`` per node
    * ``rawfile_is_paired``    -- ``'paired-end'`` when more than one node,
      otherwise ``'single'``

    Returns the rendered ``!Sample_raw_file_*`` block.
    """
    from pymisca.events import LinkEvent
    from pymisca.ext import f as _f

    node = pyext.file__asModule('/home/feng/envs/0726-polyq/src/validate_fastq.py')
    node.rawMeta = rawMeta()
    node.DATA_ACC = sample['data_acc']
    node.WORKDIR = WORKDIR()
    node.valid_fastq()

    nodes = node.combined_valid_fastq()['OUTPUT_NODES']
    sample.rawfile_nodes = nodes
    sample.rawfile_files_orig = [entry['OUTPUT_FILE'] for entry in nodes]

    # Relinking because GEO needs a flat directory tree.
    # NOTE(review): `_f` is given no arguments, so it appears to resolve
    # `sample` and `x` from this function's scope -- keep both names and keep
    # the call directly in this function body.
    relinked = []
    for x in nodes:
        link = LinkEvent(
            x['OUTPUT_FILE'],
            WORKDIR() / "ftp" / _f('{sample.data_acc}.{x["OUTPUT_FILE"].basename()}'),
            1,
        )
        relinked.append(link.dest.relpath(WORKDIR() / "ftp"))
    sample.rawfile_files = relinked

    sample.rawfile_checksums = [entry['FILE_MD5']['MD5_HEX'] for entry in nodes]
    sample.rawfile_readlengths = ['75'] * len(nodes)
    if len(nodes) > 1:
        sample.rawfile_is_paired = 'paired-end'
    else:
        sample.rawfile_is_paired = 'single'

    # The template reads its values from `sample` (see the {{...}} fields).
    template = u'''
!Sample_raw_file_name = {{','.join(sample.rawfile_files)}}
!Sample_raw_file_type = fastq
!Sample_raw_file_checksum = {{','.join(sample.rawfile_checksums)}}
!Sample_raw_file_read_length = {{','.join(sample.rawfile_readlengths)}}
!Sample_raw_file_single_or_paired-end = {{sample.rawfile_is_paired}}
!Sample_raw_file_instrument_model = NextSeq 500
'''
    return pyext.jf2(template)
def _worker(sample):
    """Render the common SOFT template for one sample and write it to disk.

    ``sample`` is wrapped in an ``AttrDict``; its ``title`` is built by joining
    the ``data_acc, age, tissue, genotype, ztime, temperature`` fields with
    ``_``.  The rendered text (every line stripped of surrounding whitespace)
    is stored as ``sample.soft_text`` and written to
    ``OUTDIR/<data_acc>.soft.txt``.  Nothing is returned.
    """
    sample = attrdict.AttrDict(sample)
    sample.title = "_".join(
        [sample[k] for k in "data_acc,age,tissue,genotype,ztime,temperature".split(",")]
    )

    # `jf2` is given only the template, so it presumably picks `sample` up
    # from this scope -- keep the call directly in this function.
    rendered = pyext.jf2(template_common())
    stripped_lines = [line.strip() for line in rendered.splitlines()]
    sample.soft_text = '\n'.join(stripped_lines)

    pyext.printlines(
        [sample.soft_text],
        OUTDIR / pyext.f("{sample.data_acc}.soft.txt"),
    )
def sample_template_finalise(sample):
    """Fill in a sample's derived fields and render its curated template.

    Steps, in order:

    1. ``sample_template_find_curated(sample)`` (presumably provides
       ``sample['template_curated']`` -- confirm against that helper).
    2. ``sample.expt_type`` is set from ``sample_get_expt_type``.
    3. ``sample['processing_protocol']`` is set for ``'CHIPSEQ'`` and
       ``'RNASEQ'`` samples; any other experiment type leaves it untouched.
    4. ``sample['rawfile_detail']`` is set from ``sample_get_rawfile_detail``.
    5. ``sample['template_curated']`` is rendered, stored as
       ``sample['template_final']`` and returned.
    """
    sample_template_find_curated(sample)
    sample.expt_type = sample_get_expt_type(sample)

    if sample['expt_type'] == 'CHIPSEQ':
        sample['processing_protocol'] = sample_chipseq_processing_protocol(sample)
    elif sample['expt_type'] == 'RNASEQ':
        sample['processing_protocol'] = sample_rnaseq_processing_protocol(sample)

    sample['rawfile_detail'] = sample_get_rawfile_detail(sample)

    # Rendered last so the template can use every field assigned above.
    rendered = pyext.jf2(sample['template_curated'])
    sample['template_final'] = rendered
    return rendered
def sample_rnaseq_processing_protocol(sample):
    """Stage a sample's RNA-Seq supplementary files and render the protocol text.

    For the ``txt`` and ``bam`` columns of the sample's row in
    ``df_mappedData_rnaseq()``, the original value is stored as
    ``sample['file_count_orig']`` / ``sample['file_bam_orig']``.  A null value
    yields ``'NA'``; otherwise the file goes through ``CopyEvent`` into
    ``WORKDIR()/<data_acc>/supp`` and ``LinkEvent`` to
    ``WORKDIR()/ftp/<data_acc>.supp.<basename>``, and the link destination
    relative to ``WORKDIR()/ftp`` is stored as ``sample['file_count']`` /
    ``sample['file_bam']``.

    Returns the rendered ``!Sample_data_processing`` /
    ``!Sample_supplementary_file_*`` block.
    """
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os

    supp_dir = WORKDIR() / sample['data_acc'] / 'supp'
    row = df_mappedData_rnaseq().loc[sample['data_acc']]

    for attr, column in (('file_count', 'txt'), ('file_bam', 'bam')):
        source_path = row[column]
        sample[attr + '_orig'] = source_path
        if pyext.pd.isnull(source_path):
            sample[attr] = 'NA'
            continue

        # NOTE(review): `_f` appears to resolve `sample` and `basename` from
        # this scope, so those two names must not be renamed.
        basename = os.path.basename(source_path)
        staged = CopyEvent(source_path, supp_dir / basename).dest
        link = LinkEvent(
            staged,
            WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"),
            1,
        )
        sample[attr] = link.dest.relpath(WORKDIR() / 'ftp')

    # Backslash-newline keeps the long first attribute on one output line.
    template = u'''
!Sample_data_processing = Raw fastq were uploaded to Bluebee Genomics Platform and analysed with \
Quantseq FWD analysis pipeline (https://www.lexogen.com/quantseq-data-analysis/). Briefly, the raw reads \
were trimmed with BBDuk and aligned to GTF-annotated genome with STAR.
!Sample_data_processing = Supplementary_files_format_and_content: *.txt: TSV table containing abundance of transcripts by STAR.
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Genomic alignment by STAR.
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_count}}
'''
    return pyext.jf2(template)
print(str(e)) template = u''' ^SERIES = 0829-polyq !Series_title = RNA-Seq and ChIP-Seq profiling of ELF3, an prion-like domain-containig in ELF3 that functions as a thermosensor in Arabidopsis. !Series_summary = Temperature is a major environmental variable governing plant growth and development. ELF3 contains a polyglutamine (polyQ) repeat 8–10, embedded within a predicted prion domain (PrD). We find the length of the polyQ repeat correlates with thermal responsiveness. Plants from hotter climates appear to have lost the PrD domain, and these versions of ELF3 are stable at high temperature and lack thermal responsiveness. ELF3 temperature sensitivity is also modulated by the levels of ELF4, indicating that ELF4 can stabilise ELF3 function. This RNA-Seq dataset provides evidence for the hypothetical ELF3 function of temperature sensing . !Series_overall design = Single samples were taken at each time point. RNA-Seqs and ChIP-Seqs were performed for different genotypes at different temperature and objective time. !Series_contributor = Jaehoon Jung !Series_contributor = Katja, Jaeger !Series_contributor = Feng, Geng {% for sample in _samples %} !Series_sample_id = {{sample.data_acc}}{% endfor %} ''' res = pyext.jf2(template) pyext.printlines([res], OUTDIR / pyext.f("SERIES.soft.txt")) from pymisca import shell OFNAME = "0830-polyq-submit.soft" shell.shellexec( " ".join([ "cd", OUTDIR, "&&cat", "SERIES.soft.txt", "*autofill*", "|grep", "-v", "^#",
# Render README.md from README.md.template with pymisca's jinja2 helper.
from pymisca.ext import jf2
import jinja2

_open = open

# Read and render the template BEFORE opening README.md for writing: opening
# with 'w' truncates the file, so a missing template or a rendering error
# must not leave an empty README behind.  Both handles are closed by `with`.
with open('README.md.template', 'r') as template_file:
    template_text = template_file.read()

# Keep this call at module level: `jf2` receives only the template text, so
# it presumably resolves template names from the calling scope.
s = jf2(template_text)

with open("README.md", 'w') as f:
    f.write(s)
def sample_chipseq_processing_protocol(sample):
    """Stage a sample's ChIP-Seq supplementary files and render the protocol text.

    Side effects on ``sample``:

    * ``data_acc_control`` -- the control library looked up from the part of
      ``sample.data_acc`` before the first ``'S'``; ``None`` when that prefix
      is not in the table.
    * for the ``bw``, ``bam`` and ``narrowPeak`` columns of the sample's row in
      ``df_mappedData_chipseq()``: ``file_bw_orig`` / ``file_bam_orig`` /
      ``file_npk_orig`` hold the original value, and ``file_bw`` /
      ``file_bam`` / ``file_npk`` hold ``'NA'`` for a null value, otherwise
      the ``LinkEvent`` destination relative to ``WORKDIR()/ftp`` after the
      file went through ``CopyEvent`` into ``WORKDIR()/<data_acc>/supp``.

    Returns the rendered ``!Sample_data_processing`` /
    ``!Sample_supplementary_file_*`` block.
    """
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os

    def _get_data_acc_control(sample):
        """Return the control accession for ``sample``, or None if unmapped."""
        control_by_prefix = {
            '198C': '195CS13',
            '176C': '176CS21',
            '182C': '176CS21',
            '189C': '176CS21',
            '192C': '192CS19',
        }
        return control_by_prefix.get(sample.data_acc.split('S')[0])

    OUTDIR = WORKDIR() / sample['data_acc'] / 'supp'
    sample.data_acc_control = _get_data_acc_control(sample)
    rec = df_mappedData_chipseq().loc[sample['data_acc']]

    for attrName, key in (('file_bw', 'bw'),
                          ('file_bam', 'bam'),
                          ('file_npk', 'narrowPeak')):
        fullname = rec[key]
        sample[attrName + '_orig'] = fullname
        if pyext.pd.isnull(fullname):
            sample[attrName] = 'NA'
            continue

        # NOTE(review): `_f` appears to resolve `sample` and `basename` from
        # this scope, so those two names must not be renamed.
        basename = os.path.basename(fullname)
        staged = CopyEvent(fullname, OUTDIR / basename).dest
        link = LinkEvent(
            staged,
            WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"),
            1,
        )
        sample[attrName] = link.dest.relpath(WORKDIR() / 'ftp')

    # Each SOFT attribute must sit on a single output line.  The first
    # attribute is long, so it is wrapped with backslash-newline
    # continuations; the "PCR duplicate reads" sentence is part of that same
    # attribute and must not start a line of its own.
    template = u'''
!Sample_data_processing = Adapters were trimmed off from raw reads with Trimmomatic with argument "ILLUMINACLIP:$FA_ADAPTER:6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15". \
Raw reads were mapped to the genome "TAIR10" with Bowtie2 under argument:"--no-mixed --no-discordant --no-unal -k2". Any read that mapped to more than one genomic location was discarded. \
PCR duplicate reads were removed with Picard using default setting.
!Sample_data_processing = Genomic binding profile was quantified in RPKM (Reads Per Kilobase per Million mapped reads) using a bin-size of 10bp. "deeptools.bamCoverage" is used.
!Sample_data_processing = For each treated ChIP-Seq library, peaks were called against a control {{sample.data_acc_control}} using MACS2 with argument "--keep-dup 1 -p 0.1".
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Genomic alignements that were sorted, deduplicated and filtered for uniq-mapped reads.
!Sample_data_processing = Supplementary_files_format_and_content: *_RPKM.bw: RPKM-normalised bigwig track at 10bp resolution
!Sample_data_processing = Supplementary_files_format_and_content: *.narrowPeak: containing MACS2-called peaks. as described
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_bw}}
!Sample_supplementary_file_3 = {{sample.file_npk}}
'''
    return pyext.jf2(template)
def sample_rnaseq_processing_protocol(sample):
    """Stage a sample's RNA-Seq supplementary files and render the protocol text.

    For the ``bam``, ``bw``, ``count`` and ``ct`` columns of the sample's row
    in ``df_mappedData_rnaseq()``, the original value is stored as
    ``sample['file_<name>_orig']``.  A null value or an empty list yields
    ``'NA'``; otherwise the file goes through ``CopyEvent`` into
    ``WORKDIR()/<data_acc>/supp`` and ``LinkEvent`` to
    ``WORKDIR()/ftp/<data_acc>.supp.<basename>``, and the link destination
    relative to ``WORKDIR()/ftp`` is stored as ``sample['file_<name>']``.

    The protocol text depends on which count file exists: without a ``ct``
    file the Hisat2/stringtie text is used (a ``count`` file is then
    asserted to exist); with one, the Tophat/htseq-count text is used.

    Returns the rendered ``!Sample_data_processing`` /
    ``!Sample_supplementary_file_*`` block.
    """
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os

    supp_dir = WORKDIR() / sample['data_acc'] / 'supp'
    row = df_mappedData_rnaseq().loc[sample['data_acc']]

    supp_columns = (
        ('file_bam', 'bam'),
        ('file_bw', 'bw'),
        ('file_count', 'count'),
        ('file_ct', 'ct'),
    )
    for attr, column in supp_columns:
        source_path = row[column]
        sample[attr + '_orig'] = source_path
        if pyext.pd.isnull(source_path) or (source_path == []):
            sample[attr] = 'NA'
            continue

        # NOTE(review): `_f` appears to resolve `sample` and `basename` from
        # this scope, so those two names must not be renamed.
        basename = os.path.basename(source_path)
        staged = CopyEvent(source_path, supp_dir / basename).dest
        link = LinkEvent(
            staged,
            WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"),
            1,
        )
        sample[attr] = link.dest.relpath(WORKDIR() / 'ftp')

    if sample['file_ct'] == 'NA':
        # No htseq-count table: the stringtie count table must be present.
        assert sample['file_count'] != 'NA'
        template = u'''
!Sample_data_processing = Adapters were trimmed off from raw reads with Trimmomatic with argument "ILLUMINACLIP:$FA_ADAPTER:6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15".
!Sample_data_processing = Raw reads were aligned with Hisat2 with arguments "--no-mixed --rna-strandness RF --dta --fr" to produce a SAM file.
!Sample_data_processing = Duplicate reads were removed with Picard using default setting
!Sample_data_processing = Alignments in SAM file were assembled into transcripts abundances with stringtie with argument "--rf".
!Sample_data_processing = Supplementary_files_format_and_content: .bam: HISAT2-aligned and picard deduplicated genomic alignment .
!Sample_data_processing = Supplementary_files_format_and_content: .stringtie.count: TSV table containing abundance of transcripts with bed-formatted coordinates.
!Sample_data_processing = Supplementary_files_format_and_content: RPKM.bw: RPKM normalised bigwig files
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_count}}
!Sample_supplementary_file_3 = {{sample.file_bw}}
'''
    else:
        assert sample['file_ct'] != 'NA'
        template = u'''
!Sample_data_processing = Adapters were trimmed off using Trimmomatic with "ILLUMINACLIP:$FA_ADAPTER:2:10:5:1"
!Sample_data_processing = The trimmed reads were aligned using Tophat with "--max-multihits --library-type fr-firststrand --no-mixed"
!Sample_data_processing = Duplicate reads were removed with Picard using default setting
!Sample_data_processing = Alignments in SAM file were assembled into transcripts abundances using htseq-count with "-r name -s no -f bam -t exon -i gene_id" against the GTF annotation"
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Tophat-aligned and picard deduplicated genomic alignment .
!Sample_data_processing = Supplementary_files_format_and_content: _htseq_count.ct: TSV table containing htseq-counted transcript abundances.
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_ct}}
'''

    return pyext.jf2(template)