import os
import argparse

# Project-local helpers used below (their implementations are not shown here):
import pipelineUtil
import qc
import post_alignment_qc


def get_xml(dirname, analysis_id, logger):

    print "Downloading XML"
    print "Analysis ID = %s" % analysis_id
    xml_file = "%s.xml" %os.path.join(dirname, analysis_id)
    cmd = ['cgquery', '-o' , xml_file, 'analysis_id=%s' %analysis_id]
    pipelineUtil.log_function_time('cgquery', analysis_id, cmd, logger)

    return xml_file
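

# pipelineUtil.log_function_time is used throughout this module but its implementation
# is not shown here. The sketch below is only an assumption about what such a helper
# might look like (run a command, log its wall-clock time, return the exit code); it is
# illustrative and is not the real pipelineUtil API.
def _example_log_function_time(tool, key, cmd, logger=None):
    """ Illustrative stand-in for pipelineUtil.log_function_time. """
    import subprocess
    import time
    start = time.time()
    exit_code = subprocess.call([str(c) for c in cmd])
    elapsed = time.time() - start
    if logger is not None:
        logger.info("%s_TIME\t%s\t%s\t%s" % (tool.upper(), key, elapsed, exit_code))
    return exit_code
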
def decompress(filename, workdir):
    """ Unpack fastq files """

    if filename.endswith(".tar"):
        cmd = ['tar', '-xvf', filename, '-C', workdir]
    elif filename.endswith(".gz"):
        cmd = ['tar', '-xzvf', filename, '-C', workdir]
    else:
        raise Exception('Unknown input file extension for file %s' % filename)
    pipelineUtil.log_function_time("tar", filename, cmd)
def run_pipeline(args, workdir, analysis_id, logger):
    """ align datasets using STAR and compute expression using cufflinks """

    tar_file_in = None
    for filename in os.listdir(workdir):
        if filename.endswith(".tar") or filename.endswith(".tar.gz"):
            tar_file_in = os.path.join(workdir, filename)
            break
    if tar_file_in is None:
        raise Exception("No .tar or .tar.gz input file found in %s" % workdir)

    star_output_dir = os.path.join(workdir, 'star_2_pass')
    if os.path.isdir(star_output_dir):
        pipelineUtil.remove_dir(star_output_dir)
    os.mkdir(star_output_dir)
    bam = "%s_star.bam" %os.path.join(star_output_dir, analysis_id)

    if not os.path.isfile(bam):
        star_cmd = ['time', '/usr/bin/time', 'python', args.star_pipeline,
                    '--genomeDir', args.genome_dir,
                    '--runThreadN', args.p,
                    '--tarFileIn', tar_file_in,
                    '--workDir', workdir,
                    '--out', bam,
                    '--genomeFastaFile', args.genome_fasta_file,
                    '--sjdbGTFfile', args.gtf
                   ]
        if args.quantMode != "":
            star_cmd.append('--quantMode')
            star_cmd.append(args.quantMode)

    pipelineUtil.log_function_time("STAR", analysis_id, star_cmd, logger)

    remote_bam_path = "%s_star.bam" % os.path.join(args.bucket, analysis_id, analysis_id)
    pipelineUtil.upload_to_cleversafe(logger, remote_bam_path, bam)


    cufflinks_cmd = ['time', '/usr/bin/time', 'python', args.cufflinks_pipeline,
                     '--bam', bam,
                     '--gtf', args.gtf,
                     '--analysis_id', analysis_id,
                     '--out', star_output_dir,
                     '--p', args.p,
                     '--multi_read_correct', 'True'
                    ]

    pipelineUtil.log_function_time("CUFFLINKS", analysis_id, cufflinks_cmd, logger)

    cuffout_genes_local = os.path.join(star_output_dir, "genes.fpkm_tracking")
    cuffout_genes_remote = os.path.join(args.bucket, "cufflinks", "star_gene", "%s.genes.fpkm_tracking" %analysis_id)
    pipelineUtil.upload_to_cleversafe(logger, cuffout_genes_remote, cuffout_genes_local)

    cuffout_isoforms_local = os.path.join(star_output_dir, "isoforms.fpkm_tracking")
    cuffout_isoforms_remote = os.path.join(args.bucket, "cufflinks", "star_iso", "%s.isoforms.fpkm_tracking" %analysis_id)
    pipelineUtil.upload_to_cleversafe(logger, cuffout_isoforms_remote, cuffout_isoforms_local)

    pipelineUtil.remove_dir(star_output_dir)
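

# The star_pipeline script invoked above is expected to accept the flags passed in
# star_cmd. A minimal argparse skeleton matching those flag names is sketched below;
# the defaults and the script name are assumptions, not the actual pipeline interface.
def _example_star_pipeline_args():
    import argparse
    parser = argparse.ArgumentParser(prog='star_pipeline.py')
    parser.add_argument('--genomeDir', required=True, help='STAR genome index directory')
    parser.add_argument('--runThreadN', default='4', help='number of threads')
    parser.add_argument('--tarFileIn', required=True, help='tar archive of input fastq files')
    parser.add_argument('--workDir', required=True, help='working directory')
    parser.add_argument('--out', required=True, help='output BAM path')
    parser.add_argument('--genomeFastaFile', required=True, help='reference genome FASTA')
    parser.add_argument('--sjdbGTFfile', required=True, help='annotation GTF')
    parser.add_argument('--quantMode', default='', help='optional STAR quantification mode')
    return parser.parse_args()
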
def cufflinks_compute(args, logger=None):
    """ compute rna-seq expression using cufflinks """

    cmd = ['cufflinks']

    if args.multi_read_correct == 'True':
        cmd.append('--multi-read-correct')
    if args.frag_bias_correct == 'True':
        cmd.append('--frag-bias-correct')

    cmd += [
            '--GTF', args.gtf,
            '--output-dir', args.out,
            '--num-threads', str(args.p),
            args.bam
          ]
    print cmd
    pipelineUtil.log_function_time('cufflinks', args.analysis_id, cmd, logger)
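

# Similarly, cufflinks_compute reads its settings from an argparse namespace. The driver
# sketched below mirrors the attribute names the function uses above; the defaults and
# the script name are assumptions.
def _example_cufflinks_args():
    import argparse
    parser = argparse.ArgumentParser(prog='cufflinks_pipeline.py')
    parser.add_argument('--bam', required=True, help='input BAM file')
    parser.add_argument('--gtf', required=True, help='annotation GTF')
    parser.add_argument('--out', required=True, help='output directory')
    parser.add_argument('--p', default='4', help='number of threads')
    parser.add_argument('--analysis_id', required=True, help='analysis UUID')
    parser.add_argument('--multi_read_correct', default='False')
    parser.add_argument('--frag_bias_correct', default='False')
    return parser.parse_args()
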
def rna_seq_qc(rna_seq_qc_path, bam_file, uuid, outdir, ref_genome, gtf, logger=None):
    """ Perform RNA-seqQC on post alignment BAM file """

    if os.path.isfile(bam_file) and os.path.isfile(rna_seq_qc_path) and os.path.isfile(gtf):
        cmd = ['java', '-jar', rna_seq_qc_path, '-o', outdir, '-r', ref_genome, '-s',
                '%s|%s|%s' %(uuid, bam_file, uuid), '-t', gtf]
        exit_code = pipelineUtil.log_function_time('RNAseq_qc', uuid, cmd, logger)
    else:
        raise Exception("Cannot find one of  rnaseq-qc %s, bam %s or gtf %s" %(rna_seq_qc_path, bam_file, gtf))

    if exit_code != 0 and logger is not None:
        logger.error("Broad's RNA-Seq-QC returned non-zero exit code %s" % exit_code)
    return exit_code
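

# Note on the -s argument to RNA-SeQC above: the tool takes a "sample ID|BAM file|notes"
# triple; here the analysis UUID is reused for both the sample ID and the notes field.
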
def bam_index(bam_file, uuid, logger=None):
    """ Index the resultant post alignment BAM file """

    if os.path.isfile(bam_file):
        cmd = ['samtools', 'index', '-b', bam_file]
        exit_code = pipelineUtil.log_function_time("BamIndex", uuid, cmd, logger)
        if exit_code == 0:
            assert(os.path.isfile('%s.bai' %bam_file))
    else:
        raise Exception("Cannot file bam file  %s" %bam_file)

    if exit_code != 0 and logger is not None:
        logger.error("Samtools index returned non-zero exit code %s" % exit_code)
    return exit_code
def fastqc(fastqc_path, reads_1, reads_2, rg_id_dir, analysis_id, logger=None):
    """ perform pre-alignment qc checks using fastqc """

    if not os.path.isdir(rg_id_dir):
        raise Exception("Invalid directory: %s")

    fastqc_results = "%s" %(os.path.join(rg_id_dir, "fastqc_results"))
    if not os.path.isdir(fastqc_results):
        os.mkdir(fastqc_results)
    if not reads_2 == "":
        cmd = [fastqc_path, reads_1, reads_2, '--outdir', fastqc_results, '--extract']
    else:
        cmd = [fastqc_path, reads_1, '--outdir', fastqc_results, '--extract']
    exit_code = pipelineUtil.log_function_time("FastQC", analysis_id, cmd, logger)
    exit_code = pipelineUtil.log_function_time("FastQC", analysis_id, cmd, logger)
    if exit_code != 0 and logger is not None:
        logger.error('FastQC returned a non-zero exit code: %s' % exit_code)
def bam_to_fastq(fastq_dir, bam_file, analysis_id, logger=None):
    """ Convert input BAM to Fastq files """

    tmp_fastq = os.path.join(fastq_dir, 'tmp')

    cmd = ['bamtofastq', 'filename=%s' %bam_file, 'outputdir=%s' %fastq_dir,
            'tryoq=1', 'collate=1', 'outputperreadgroup=1', 'T=%s' %tmp_fastq]

    exit_code = pipelineUtil.log_function_time('Biobambam', analysis_id, cmd, logger)

    if exit_code == 0:
        for filename in os.listdir(fastq_dir):
            if filename.endswith(".fq"):
                new_filename = filename.replace(".fq", ".fastq")
                os.rename(os.path.join(fastq_dir, filename), os.path.join(fastq_dir, new_filename))
    else:
        if logger is not None:
            logger.error("Biobambam BamToFastq conversion of %s returned a non-zero exit code %s"
                         % (analysis_id, exit_code))
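

# Illustrative helper (not part of the original module): once the per-read-group fastq
# files have been renamed above, mates can be paired up by the same _1/_2 suffix
# convention that the run_pipeline function further below relies on.
def _example_pair_fastqs(fastq_dir):
    pairs = {}
    for fname in sorted(os.listdir(fastq_dir)):
        if fname.endswith("_1.fastq"):
            pairs.setdefault(fname[:-len("_1.fastq")], ["", ""])[0] = os.path.join(fastq_dir, fname)
        elif fname.endswith("_2.fastq"):
            pairs.setdefault(fname[:-len("_2.fastq")], ["", ""])[1] = os.path.join(fastq_dir, fname)
    return pairs
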
def reorder_bam(picard_path, bam_file, uuid, outdir, ref_genome, logger=None):
    """ Reorder the BAM file according to the reference genome """

    if os.path.isfile(bam_file) and os.path.isfile(picard_path) and os.path.isfile(ref_genome):
        outbam = os.path.join(outdir, '%s.reorder.bam' %uuid)
        tmp_dir = os.path.join(outdir, 'tmp')
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, 'ReorderSam', 'I=%s' %bam_file, 'O=%s' %outbam, 'R=%s' %ref_genome,
                'VALIDATION_STRINGENCY=LENIENT', 'TMP_DIR=%s' %tmp_dir]
        exit_code = pipelineUtil.log_function_time("picard_reorder_sam", uuid, cmd, logger)
        if exit_code == 0:
            assert(os.path.isfile(outbam))
    else:
        raise Exception("Cannot find one of bam %s, picard path %s or reference genome %s" %(bam_file, picard_path, ref_genome))

    if exit_code != 0 and logger is not None:
        logger.error("Picard ReorderSam returned non-zero exit code %s" % exit_code)
    return outbam
def collect_rna_seq_metrics(picard_path, bam_file, uuid, outdir, ref_flat, logger=None):
    """ Collect RNA-seq metrics using Picard """

    if os.path.isfile(picard_path) and os.path.isfile(bam_file):
        tmp_dir = os.path.join(outdir, 'tmp')
        outfile = os.path.join(outdir, "%s.rna_seq_metrics.txt" %uuid)
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, "CollectRnaSeqMetrics", "METRIC_ACCUMULATION_LEVEL=READ_GROUP",
                "I=%s" %bam_file, "O=%s" %outfile, "STRAND=NONE",
                "REF_FLAT=%s" %ref_flat, "VALIDATION_STRINGENCY=LENIENT", "TMP_DIR=%s" %tmp_dir]
        exit_code = pipelineUtil.log_function_time("RNAseq_metrics", uuid, cmd, logger)
        if exit_code == 0:
            assert(os.path.isfile(outfile))
    else:
        raise Exception("Invalid path to picard or bam")

    if exit_code != 0 and logger is not None:
        logger.error("Picard CollectRnaSeqMetrics returned non-zero exit code %s" % exit_code)
    return exit_code
def validate_bam_file(picard_path, bam_file, uuid, outdir, logger=None):
    """ Validate resulting post-alignment BAM file """

    if os.path.isfile(picard_path) and os.path.isfile(bam_file):
        outfile = os.path.join(outdir, "%s.validate" % uuid)
        tmp_dir = os.path.join(outdir, 'tmp')
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, "ValidateSamFile", "I=%s" % bam_file,
               "O=%s" % outfile, "VALIDATION_STRINGENCY=LENIENT",
               "TMP_DIR=%s" % tmp_dir]
        exit_code = pipelineUtil.log_function_time("ValidateSAM", uuid, cmd, logger)
        if exit_code == 0:
            assert os.path.isfile(outfile)
    else:
        raise Exception("Invalid path to picard or BAM")

    if exit_code != 0 and logger is not None:
        logger.error("Picard ValidateSamFile returned non-zero exit code %s" % exit_code)

    return exit_code
def add_or_replace_read_group(picard_path, bam_file, outdir, uuid, rg_id, rg_lb="Unknown", rg_pl="Unknown", rg_pu="Unknown", rg_sm="Unknown", logger=None):
    """ Replace the @RG tag in the reads and header """

    outbam = '%s.addRG.bam' %os.path.join(outdir, uuid)
    if os.path.isfile(bam_file) and os.path.isfile(picard_path):
        tmp_dir = os.path.join(outdir, 'tmp')
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)
        cmd = ['java', '-jar', picard_path, 'AddOrReplaceReadGroups', 'I=%s' %bam_file, 'O=%s' %outbam,
                'RGID=%s'%rg_id, 'RGLB=%s' %rg_lb, 'RGPL=%s' %rg_pl, 'RGPU=%s' %rg_pu, 'RGSM=%s' %rg_sm,
                'VALIDATION_STRINGENCY=LENIENT','TMP_DIR=%s' %tmp_dir]
        exit_code = pipelineUtil.log_function_time('AddOrReplaceReadGroups', uuid, cmd, logger)
    else:
        raise Exception("Cannot find bam file %s or path to picard %s" %(bam_file, picard_path))

    if exit_code != 0 and logger is not None:
        logger.error("Picard AddOrReplaceReadGroups returned non-zero exit code %s" % exit_code)

    if os.path.isfile(outbam):
        print "returning file now %s" %outbam
        return outbam
    else:
        raise Exception('Could not add or replace read groups. Check log file for errors')
            log_file = os.path.join(sub_dir, log_file)
            logp = open(log_file, "r")
            for line in logp:
                if "CUFFLINKS_TIME" in line:
                    line = line.split()
                    f.write("%s\t%s\t%s\n" %(line[4], line[5], metadata["downloadable_file_size"]))
    f.close()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='label_dataset.py')
    parser.add_argument('--dirname', default='/home/ubuntu/SCRATCH/lung_results')
    parser.add_argument('--disease', help='disease to be labeled')
    args = parser.parse_args()
    #collect_metrics(args.dirname)


    for filename in os.listdir(args.dirname):
        if filename.endswith('fpkm_tracking'):
            analysis_id = filename.split(".")[0]
            #metadata = extract_metadata(args.dirname, analysis_id, None)
            #print 'disease= %s' %metadata['disease']

            cmd = ['mv', os.path.join(args.dirname, filename),
                   os.path.join(args.dirname, args.disease, '%s_%s' % (args.disease, analysis_id))]
            """
            print metadata
            if metadata['disease'] != "":
                if not os.path.isdir(os.path.join(args.dirname, metadata['disease'])):
                    os.mkdir(os.path.join(args.dirname, metadata['disease']))
            cmd = ['mv', '%s' % os.path.join(args.dirname, filename), '%s' %os.path.join(args.dirname, metadata["disease"], '%s_%s' %(metadata['disease'], analysis_id))]
            """
            pipelineUtil.log_function_time('mv', analysis_id, cmd, None)
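
            # Note: the mv above assumes <dirname>/<disease>/ already exists. A defensive
            # variant (mirroring the commented-out metadata branch) would create it first:
            #     disease_dir = os.path.join(args.dirname, args.disease)
            #     if not os.path.isdir(disease_dir):
            #         os.mkdir(disease_dir)
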
def run_pipeline(args, workdir, analysis_id, fastq_dir, logger):
    """ align datasets using STAR and compute expression using cufflinks """

    tar_file_in = args.input_file

    qc_dir = os.path.join(workdir, 'qc')
    if not os.path.isdir(qc_dir):
        os.mkdir(qc_dir)

    decompress(tar_file_in, fastq_dir)
    reads_1 = ""
    reads_2 = ""
    for fname in os.listdir(fastq_dir):
        if fname.endswith("_1.fastq.gz") or fname.endswith("_1.fastq"):
            reads_1 = os.path.join(fastq_dir, fname)
        if fname.endswith("_2.fastq.gz") or fname.endswith("_2.fastq"):
            reads_2 = os.path.join(fastq_dir, fname)
    qc.fastqc(args.fastqc_path, reads_1, reads_2, qc_dir, analysis_id, logger)

    star_output_dir = os.path.join(workdir, 'star_2_pass')
    if os.path.isdir(star_output_dir):
        pipelineUtil.remove_dir(star_output_dir)
    os.mkdir(star_output_dir)
    bam = "%s_star.bam" %os.path.join(star_output_dir, analysis_id)

    if not os.path.isfile(bam):
        star_cmd = ['time', '/usr/bin/time', 'python', args.star_pipeline,
                    '--genomeDir', args.genome_dir,
                    '--runThreadN', args.p,
                    '--tarFileIn', tar_file_in,
                    '--workDir', workdir,
                    '--out', bam,
                    '--genomeFastaFile', args.genome_fasta_file,
                    '--sjdbGTFfile', args.gtf
                   ]
        if args.quantMode != "":
            star_cmd.append('--quantMode')
            star_cmd.append(args.quantMode)

    pipelineUtil.log_function_time("STAR", analysis_id, star_cmd, logger)

    # Fix mate information in the BAM
    exit_code, fix_mate_out = post_alignment_qc.fix_mate_information(args.picard, bam,
                                                                    analysis_id, workdir, logger)
    if exit_code == 0:
        os.remove(bam)
        assert(not os.path.isfile(bam))
        os.rename(fix_mate_out, bam)
        assert(os.path.isfile(bam))

    #validate the post alignment BAM file
    post_alignment_qc.validate_bam_file(args.picard, bam, analysis_id, qc_dir, logger)

    #collect RNA-seq metrics
    post_alignment_qc.collect_rna_seq_metrics(args.picard, bam, analysis_id,
                                                qc_dir, args.ref_flat, logger)

    #quantify using cufflinks
    cufflinks_cmd = ['time', '/usr/bin/time', 'python', args.cufflinks_pipeline,
                     '--bam', bam,
                     '--gtf', args.gtf,
                     '--analysis_id', analysis_id,
                     '--out', star_output_dir,
                     '--p', args.p,
                     '--multi_read_correct', 'True'
                    ]

    pipelineUtil.log_function_time("CUFFLINKS", analysis_id, cufflinks_cmd, logger)
            "--outSAMstrandField", str(args.outSAMstrandField),
            "--outSAMunmapped", str(args.outSAMunmapped)
            ]

    if args.keepJunctions:
        cmd.append("--keepJunctions")
        cmd.append(str(args.keepJunctions))

    if args.metaDataTab is not None:
        cmd.append("--metaDataTab")
        cmd.append(str(args.metaDataTab))

    if args.outSAMattrRGline is not None:
        cmd.append("--outSAMattrRGline")
        cmd.append(str(args.outSAMattrRGline))

    if args.outSAMattrRGfile is not None:
        cmd.append("--outSAMattrRGfile")
        cmd.append(str(args.outSAMattrRGfile))

    logger.info('Starting Alignment with STAR')

    exit_code = pipelineUtil.log_function_time("STAR_ALIGN", args.id, cmd, logger)
    if exit_code == 0:
        logger.info('Starting post alignment QC')
        post_aln_qc(args, args.out, logger)
    else:
        logger.error('STAR returned a non-zero exit code %s' %exit_code)