Beispiel #1
0
    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Build a ReadArray from an alignment file, then upload or delete that file.

        :param str bamfile: filename of .bam file
        :param str index: directory containing index files; ``annotations.gtf`` is
          read from this directory (a trailing slash is optional)
        :param str aws_upload_key: key where aws files should be uploaded; when
          falsy, the alignment file is removed locally instead of being uploaded
        :param int min_poly_t: minimum number of poly_t nucleotides for a read to be valid
        :param max_transcript_length: passed through unchanged to GeneIntervals
        :returns ReadArray, io.ProcessManager: the ReadArray object and the
          ProcessManager started for the S3 upload; the second value is None when
          no upload key was given
        """
        # Local import so this function does not rely on a module-level import.
        import os

        log.info('Filtering aligned records and constructing record database.')
        # Construct translator. os.path.join gives the same path as plain
        # concatenation when `index` already ends with '/', and additionally
        # inserts the separator when it does not.
        translator = GeneIntervals(
            os.path.join(index, 'annotations.gtf'),
            max_transcript_length=max_transcript_length)
        read_array = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        # move the alignment file to S3 if a key was given, else delete it locally
        if aws_upload_key:
            log.info('Uploading bam file to S3.')
            # NOTE: `args` is not a parameter of this function; it is resolved
            # from the enclosing scope.
            upload_bam = 'aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam'.format(
                fname=bamfile, s3link=aws_upload_key, prefix=args.output_prefix)
            print(upload_bam)
            upload_manager = io.ProcessManager(upload_bam)
            upload_manager.run_all()
        else:
            log.info('Removing bamfile for memory management.')
            rm_bamfile = 'rm %s' % bamfile
            io.ProcessManager(rm_bamfile).run_all()
            upload_manager = None
        return read_array, upload_manager
Beispiel #2
0
    def align_fastq_records(
            merged_fastq, dir_, star_args, star_index, n_proc,
            aws_upload_key) -> (str, io.ProcessManager):
        """
        Align fastq records, then upload or delete the merged fastq file.

        :param merged_fastq: str, path to merged .fastq file
        :param dir_: str, stem for output files; alignments are written to
          ``dir_ + '/alignments/'``, which is created if missing
        :param star_args: iterable of 'key=value' strings giving extra keyword
          arguments for STAR, or None for no extra arguments
        :param star_index: str, file path to directory containing STAR index
        :param n_proc: int, number of STAR processes to initiate
        :param aws_upload_key: str, location to upload files, or None if seqc was
          initiated from a merged fastq file.
        :return bamfile, upload_manager: (str, io.ProcessManager)
          the value returned by star.align (name of the file containing aligned
          reads), and the ProcessManager started for the merged fastq upload, or
          None when no upload key was given
        """
        log.info('Aligning merged fastq records.')
        alignment_directory = dir_ + '/alignments/'
        os.makedirs(alignment_directory, exist_ok=True)
        if star_args is not None:
            # Split on the first '=' only, so that values which themselves
            # contain '=' still yield a (key, value) pair instead of raising.
            star_kwargs = dict(a.strip().split('=', 1) for a in star_args)
        else:
            star_kwargs = {}
        bamfile = star.align(
            merged_fastq, star_index, n_proc, alignment_directory,
            **star_kwargs)

        if aws_upload_key:
            log.info('Gzipping merged fastq file.')
            # `pigz` is not a parameter; it is resolved from the enclosing scope.
            # When truthy the pigz binary is used, otherwise plain gzip.
            if pigz:
                pigz_zip = "pigz --best -k -f {fname}".format(fname=merged_fastq)
            else:
                pigz_zip = "gzip -kf {fname}".format(fname=merged_fastq)
            pigz_proc = io.ProcessManager(pigz_zip)
            pigz_proc.run_all()
            # Original note: prevents slowing down STAR alignment. Waiting here
            # also means compression is done before the .gz file is moved below.
            pigz_proc.wait_until_complete()
            merged_fastq += '.gz'  # reflect gzipped nature of file

            log.info('Uploading gzipped merged fastq file to S3.')
            merge_upload = 'aws s3 mv {fname} {s3link}'.format(
                fname=merged_fastq, s3link=aws_upload_key)
            upload_manager = io.ProcessManager(merge_upload)
            upload_manager.run_all()
        else:
            log.info('Removing merged fastq file for memory management.')
            rm_merged = 'rm %s' % merged_fastq
            io.ProcessManager(rm_merged).run_all()

            upload_manager = None
        return bamfile, upload_manager
Beispiel #3
0
    def merge_fastq_files(
            technology_platform, barcode_fastq: [str], output_stem: str,
            genomic_fastq: [str]) -> str:
        """Annotate genomic fastq with barcode information, merging the two files.

        After the merged file has been produced, the input genomic and barcode
        fastq files are deleted with a single ``rm`` command.

        :param technology_platform: class from platforms.py that defines the
          characteristics of the data being processed; its ``merge_function``
          attribute is passed to fastq.merge_paired
        :param barcode_fastq: list of str names of fastq files containing barcode
          information
        :param output_stem: str, stem for output files; the merged file is written
          to ``output_stem + '_merged.fastq'``
        :param genomic_fastq: list of str names of fastq files containing genomic
          information
        :returns str merged_fastq: name of merged fastq file, as returned by
          fastq.merge_paired
        """

        log.info('Merging genomic reads and barcode annotations.')
        merged_fastq = fastq.merge_paired(
            merge_function=technology_platform.merge_function,
            fout=output_stem + '_merged.fastq',
            genomic=genomic_fastq,
            barcode=barcode_fastq)

        # delete genomic/barcode fastq files after merged.fastq creation
        log.info('Removing original fastq file for memory management.')
        # list() lets callers pass tuples or other sequences as well as lists
        delete_fastq = ' '.join(
            ['rm'] + list(genomic_fastq) + list(barcode_fastq))
        io.ProcessManager(delete_fastq).run_all()

        return merged_fastq