Beispiel #1
0
def sample_get_rawfile_detail(sample):
    """Render the ``!Sample_raw_file_*`` section for one sample.

    Loads the fastq validation script as a module, configures it for
    ``sample['data_acc']`` and runs it.  The resulting output nodes, their
    original file paths, MD5 checksums, read lengths and pairing flag are
    stored as attributes on ``sample``; each fastq is additionally passed
    through ``LinkEvent`` with a destination directly under
    ``WORKDIR()/ftp``.

    NOTE(review): ``_f`` and ``pyext.jf2`` receive no explicit context, so
    they presumably resolve ``sample`` and ``x`` from this function's local
    variables -- keep those two names when editing.

    :param sample: sample record supporting both item access
        (``sample['data_acc']``) and attribute access.
    :return: the template rendered by ``pyext.jf2``.
    """
    from pymisca.events import LinkEvent
    from pymisca.ext import f as _f

    validator = pyext.file__asModule('/home/feng/envs/0726-polyq/src/validate_fastq.py')
    validator.rawMeta = rawMeta()
    validator.DATA_ACC = sample['data_acc']
    validator.WORKDIR = WORKDIR()
    validator.valid_fastq()

    fastq_nodes = validator.combined_valid_fastq()['OUTPUT_NODES']
    sample.rawfile_nodes = fastq_nodes
    sample.rawfile_files_orig = [entry['OUTPUT_FILE'] for entry in fastq_nodes]

    # Relinking because GEO needs a flat directory tree.
    ftp_relpaths = []
    for x in fastq_nodes:
        source = x['OUTPUT_FILE']
        link_path = WORKDIR() / "ftp" / _f('{sample.data_acc}.{x["OUTPUT_FILE"].basename()}')
        link = LinkEvent(source, link_path, 1)
        ftp_relpaths.append(link.dest.relpath(WORKDIR() / "ftp"))
    sample.rawfile_files = ftp_relpaths

    sample.rawfile_checksums = [entry['FILE_MD5']['MD5_HEX'] for entry in fastq_nodes]
    # Read length is hard-coded to 75 for every file.
    sample.rawfile_readlengths = ['75' for _ in fastq_nodes]
    # More than one fastq per sample is reported as paired-end.
    if len(fastq_nodes) > 1:
        sample.rawfile_is_paired = 'paired-end'
    else:
        sample.rawfile_is_paired = 'single'

    template = u'''
!Sample_raw_file_name = {{','.join(sample.rawfile_files)}}
!Sample_raw_file_type = fastq
!Sample_raw_file_checksum = {{','.join(sample.rawfile_checksums)}}
!Sample_raw_file_read_length = {{','.join(sample.rawfile_readlengths)}}
!Sample_raw_file_single_or_paired-end = {{sample.rawfile_is_paired}}
!Sample_raw_file_instrument_model = NextSeq 500
'''

    return pyext.jf2(template)
Beispiel #2
0
def sample_rnaseq_processing_protocol(sample):
    """Render the data-processing section for a Quantseq RNA-seq sample.

    Looks up the sample's row in ``df_mappedData_rnaseq()`` and, for the
    count table (``txt``) and alignment (``bam``) columns, stores the
    original path in ``sample[<attr>_orig]``.  A null path is recorded as
    ``'NA'``; otherwise the file goes through ``CopyEvent`` (destination
    ``WORKDIR()/<data_acc>/supp``) and ``LinkEvent`` (destination under
    ``WORKDIR()/ftp``), and the link path relative to ``WORKDIR()/ftp`` is
    stored in ``sample[<attr>]``.

    NOTE(review): ``_f`` and ``pyext.jf2`` receive no explicit context, so
    they presumably read ``sample`` and ``basename`` from this function's
    locals -- keep those names when editing.
    NOTE(review): a second function with this same name is defined further
    down in this file; within one module the later definition wins.

    :param sample: sample record supporting item and attribute access.
    :return: the template rendered by ``pyext.jf2``.
    """
    from pymisca.events import CopyEvent,LinkEvent
    from pymisca.ext import f as _f
    import os

    supp_dir = WORKDIR() / sample['data_acc'] / 'supp'
    record = df_mappedData_rnaseq().loc[sample['data_acc']]

    # (attribute set on sample, column of the mapped-data table)
    supplementary = (
        ('file_count', 'txt'),
        ('file_bam', 'bam'),
    )
    for attr, column in supplementary:
        source = record[column]
        sample[attr + '_orig'] = source
        if pyext.pd.isnull(source):
            sample[attr] = 'NA'
            continue

        basename = os.path.basename(source)
        staged = CopyEvent(source, supp_dir / basename).dest
        link_path = WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}")
        linked = LinkEvent(staged, link_path, 1).dest
        sample[attr] = linked.relpath(WORKDIR() / 'ftp')

    template = u'''
!Sample_data_processing = Raw fastq were uploaded to Bluebee Genomics Platform and analysed with \
Quantseq FWD analysis pipeline (https://www.lexogen.com/quantseq-data-analysis/). Briefly, the raw reads \
were trimmed with BBDuk and aligned to GTF-annotated genome with STAR. 

!Sample_data_processing = Supplementary_files_format_and_content: *.txt: TSV table containing abundance of transcripts by STAR. 
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Genomic alignment by STAR.

!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_count}}
    '''
    return pyext.jf2(template)
Beispiel #3
0
def sample_chipseq_processing_protocol(sample):
    """Render the data-processing section for a ChIP-seq sample.

    Sets ``sample.data_acc_control`` from a built-in lookup table, then
    handles the ``bw``, ``bam`` and ``narrowPeak`` columns of the sample's
    row in ``df_mappedData_chipseq()``: the original path is stored in
    ``sample[<attr>_orig]``; a null path is recorded as ``'NA'``; otherwise
    the file goes through ``CopyEvent`` (destination
    ``WORKDIR()/<data_acc>/supp``) and ``LinkEvent`` (destination under
    ``WORKDIR()/ftp``) and the link path relative to ``WORKDIR()/ftp`` is
    stored in ``sample[<attr>]``.

    NOTE(review): ``_f`` and ``pyext.jf2`` receive no explicit context, so
    they presumably read ``sample`` and ``basename`` from this function's
    locals -- keep those names when editing.

    :param sample: sample record supporting item and attribute access.
    :return: the template rendered by ``pyext.jf2``.
    """
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os

    def _get_data_acc_control(sample):
        # Each row is "<accession prefix before the first 'S'>,<control
        # accession>"; returns None when the prefix is not listed.
        table = '''
        198C,195CS13
        176C,176CS21
        182C,176CS21
        189C,176CS21
        192C,192CS19
        '''.replace(' ', '')
        controls = {}
        for row in table.splitlines():
            row = row.strip()
            if not row:
                continue
            prefix, control = row.split(',')
            controls[prefix] = control
        return controls.get(sample.data_acc.split('S')[0])

    supp_dir = WORKDIR() / sample['data_acc'] / 'supp'
    sample.data_acc_control = _get_data_acc_control(sample)
    record = df_mappedData_chipseq().loc[sample['data_acc']]

    # (attribute set on sample, column of the mapped-data table)
    supplementary = (
        ('file_bw', 'bw'),
        ('file_bam', 'bam'),
        ('file_npk', 'narrowPeak'),
    )
    for attr, column in supplementary:
        source = record[column]
        sample[attr + '_orig'] = source
        if pyext.pd.isnull(source):
            sample[attr] = 'NA'
            continue

        basename = os.path.basename(source)
        staged = CopyEvent(source, supp_dir / basename).dest
        link_path = WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}")
        linked = LinkEvent(staged, link_path, 1).dest
        sample[attr] = linked.relpath(WORKDIR() / 'ftp')

    template = u'''
!Sample_data_processing = Adapters were trimmed off from raw reads with Trimmomatic with argument "ILLUMINACLIP:$FA_ADAPTER:6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15". \
Raw reads were mapped to the genome "TAIR10" with Bowtie2 under argument:"--no-mixed --no-discordant --no-unal -k2". Any read that mapped to more than one genomic location was discarded. \
PCR duplicate reads were removed with Picard using default setting.
!Sample_data_processing = Genomic binding profile was quantified in RPKM (Reads Per Kilobase per Million mapped reads) using a bin-size of 10bp. "deeptools.bamCoverage" is used.
!Sample_data_processing = For each treated ChIP-Seq library, peaks were called against a control {{sample.data_acc_control}} using MACS2 with argument "--keep-dup 1 -p 0.1".
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Genomic alignements that were sorted, deduplicated and filtered for uniq-mapped reads.

!Sample_data_processing = Supplementary_files_format_and_content: *_RPKM.bw: RPKM-normalised bigwig track at 10bp resolution
!Sample_data_processing = Supplementary_files_format_and_content: *.narrowPeak: containing MACS2-called peaks. as described
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_bw}}
!Sample_supplementary_file_3 = {{sample.file_npk}}
    '''
    return pyext.jf2(template)
def sample_rnaseq_processing_protocol(sample):
    """Render the data-processing section for an RNA-seq sample.

    Handles the ``bam``, ``bw``, ``count`` and ``ct`` columns of the
    sample's row in ``df_mappedData_rnaseq()``: the original value is stored
    in ``sample[<attr>_orig]``; an empty list or null value is recorded as
    ``'NA'``; otherwise the file goes through ``CopyEvent`` (destination
    ``WORKDIR()/<data_acc>/supp``) and ``LinkEvent`` (destination under
    ``WORKDIR()/ftp``) and the link path relative to ``WORKDIR()/ftp`` is
    stored in ``sample[<attr>]``.

    The template is chosen by whether an htseq ``ct`` file exists: without
    one the Hisat2/stringtie description is used (and a ``count`` file is
    required), with one the Tophat/htseq-count description is used.

    NOTE(review): ``_f`` and ``pyext.jf2`` receive no explicit context, so
    they presumably read ``sample`` and ``basename`` from this function's
    locals -- keep those names when editing.
    NOTE(review): an earlier function with this same name exists in this
    file; within one module this later definition replaces it.

    :param sample: sample record supporting item and attribute access.
    :return: the template rendered by ``pyext.jf2``.
    :raises AssertionError: if neither a ``ct`` nor a ``count`` file is
        available for the sample.
    """
    from pymisca.events import CopyEvent,LinkEvent
    from pymisca.ext import f as _f
    import os

    OUTDIR = WORKDIR() / sample['data_acc']/ 'supp'
    rec = df_mappedData_rnaseq().loc[sample['data_acc']]

    # (attribute set on sample, column of the mapped-data table)
    for attrName, key in (
        ('file_bam', 'bam'),
        ('file_bw', 'bw'),
        ('file_count', 'count'),
        ('file_ct', 'ct'),
    ):
        fullname = rec[key]
        sample[attrName + '_orig'] = fullname
        # Test for the empty list first: pd.isnull() on a list returns an
        # element-wise array, whose truth value should not be evaluated.
        if (fullname == []) or pyext.pd.isnull(fullname):
            sample[attrName] = 'NA'
            continue

        basename = os.path.basename(fullname)
        sample[attrName] = LinkEvent(
            CopyEvent(fullname, OUTDIR / basename).dest,
            WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"), 1
        ).dest.relpath(WORKDIR() / 'ftp')

    if sample['file_ct'] == 'NA':
        # No htseq table: the stringtie count file must be present instead.
        assert sample['file_count'] != 'NA'
        # The Picard sentence was previously broken across two lines, which
        # left an orphan "setting" line outside any !Sample_ record.
        template = u'''
    !Sample_data_processing = Adapters were trimmed off from raw reads with Trimmomatic with argument "ILLUMINACLIP:$FA_ADAPTER:6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15". 
    !Sample_data_processing = Raw reads were aligned with Hisat2 with arguments "--no-mixed --rna-strandness RF --dta --fr" to produce a SAM file.
    !Sample_data_processing = Duplicate reads were removed with Picard using default setting
    !Sample_data_processing = Alignments in SAM file were assembled into transcripts abundances with stringtie with argument "--rf".


    !Sample_data_processing = Supplementary_files_format_and_content: .bam: HISAT2-aligned and picard deduplicated genomic alignment .
    !Sample_data_processing = Supplementary_files_format_and_content: .stringtie.count: TSV table containing abundance of transcripts with bed-formatted coordinates. 
    !Sample_data_processing = Supplementary_files_format_and_content: RPKM.bw:  RPKM normalised bigwig files
    !Sample_supplementary_file_1 = {{sample.file_bam}}
    !Sample_supplementary_file_2 = {{sample.file_count}}
    !Sample_supplementary_file_3 = {{sample.file_bw}}
    '''
    else:
        # Holds by construction of this branch; kept as an explicit guard.
        assert sample['file_ct'] != 'NA'
        template = u'''
    !Sample_data_processing = Adapters were trimmed off using Trimmomatic with "ILLUMINACLIP:$FA_ADAPTER:2:10:5:1"
    !Sample_data_processing = The trimmed reads were aligned using Tophat with "--max-multihits --library-type fr-firststrand --no-mixed"
    !Sample_data_processing = Duplicate reads were removed with Picard using default setting
    !Sample_data_processing = Alignments in SAM file were assembled into transcripts abundances using htseq-count with "-r name -s no -f bam -t exon -i gene_id" against the GTF annotation"

    !Sample_data_processing = Supplementary_files_format_and_content: *.bam: Tophat-aligned and picard deduplicated genomic alignment .
    !Sample_data_processing = Supplementary_files_format_and_content: _htseq_count.ct: TSV table containing htseq-counted transcript abundances.
    !Sample_supplementary_file_1 = {{sample.file_bam}}
    !Sample_supplementary_file_2 = {{sample.file_ct}}
    '''

    return pyext.jf2(template)