def create_read_array(bamfile, index, aws_upload_key, min_poly_t, max_transcript_length):
    """Build a ReadArray from an alignment file, then upload or delete that file.

    :param str bamfile: filename of .bam file
    :param str index: directory containing index files; ``annotations.gtf`` is read
      from this directory
    :param str aws_upload_key: key where aws files should be uploaded; when falsy
      the bamfile is removed instead of being uploaded
    :param int min_poly_t: minimum number of poly_t nucleotides for a read to be
      valid
    :param max_transcript_length: forwarded unchanged to GeneIntervals as its
      ``max_transcript_length`` keyword argument
    :returns ReadArray, io.ProcessManager: the ReadArray object and the
      ProcessManager that was started for the bamfile upload; the second element is
      None when no upload key was given
    """
    log.info('Filtering aligned records and constructing record database.')

    # Construct translator. os.path.join accepts the index directory with or
    # without a trailing separator; plain string concatenation produced a wrong
    # path ("<dir>annotations.gtf") whenever the separator was missing.
    annotation_file = os.path.join(index, 'annotations.gtf')
    translator = GeneIntervals(
        annotation_file, max_transcript_length=max_transcript_length)
    read_array = ReadArray.from_alignment_file(bamfile, translator, min_poly_t)

    # Upload the bamfile to S3 when a key is given, otherwise remove it.
    if aws_upload_key:
        log.info('Uploading bam file to S3.')
        # NOTE(review): `args` is not a parameter of this function; it has to be
        # resolvable from an enclosing or module-level scope -- confirm.
        upload_bam = 'aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam'.format(
            fname=bamfile, s3link=aws_upload_key, prefix=args.output_prefix)
        # Record the exact command through the logger (previously a bare print).
        log.info(upload_bam)
        upload_manager = io.ProcessManager(upload_bam)
        upload_manager.run_all()
    else:
        log.info('Removing bamfile for memory management.')
        rm_bamfile = 'rm %s' % bamfile
        io.ProcessManager(rm_bamfile).run_all()
        upload_manager = None

    return read_array, upload_manager
def align_fastq_records(
        merged_fastq, dir_, star_args, star_index, n_proc, aws_upload_key
) -> (str, io.ProcessManager):
    """
    Align fastq records.

    :param merged_fastq: str, path to merged .fastq file
    :param dir_: str, stem for output files; alignments are written to
      ``dir_ + '/alignments/'``, which is created if it does not exist
    :param star_args: iterable of ``'key=value'`` strings giving extra keyword
      arguments for STAR, or None for no extra arguments
    :param star_index: str, file path to directory containing STAR index
    :param n_proc: int, number of STAR processes to initiate
    :param aws_upload_key: str, location to upload files, or None if seqc was
      initiated from a merged fastq file.
    :return bamfile, upload_manager: (str, io.ProcessManager) the alignment file
      returned by star.align, and the ProcessManager started for the upload of the
      gzipped merged fastq file (None when no upload key was given)
    :raises ValueError: if an entry of star_args contains no ``'='``
    """
    log.info('Aligning merged fastq records.')
    alignment_directory = dir_ + '/alignments/'
    os.makedirs(alignment_directory, exist_ok=True)

    if star_args is not None:
        # Split on the first '=' only, so that a value which itself contains '='
        # stays intact instead of making dict() fail on a 3-element sequence.
        star_kwargs = dict(a.strip().split('=', 1) for a in star_args)
    else:
        star_kwargs = {}

    bamfile = star.align(
        merged_fastq, star_index, n_proc, alignment_directory, **star_kwargs)

    if aws_upload_key:
        log.info('Gzipping merged fastq file.')
        # `pigz` is a flag defined outside this function -- presumably whether the
        # pigz executable is available; verify where it is set. Both commands pass
        # -k, so the uncompressed file is kept alongside the .gz output.
        if pigz:
            pigz_zip = "pigz --best -k -f {fname}".format(fname=merged_fastq)
        else:
            pigz_zip = "gzip -kf {fname}".format(fname=merged_fastq)
        pigz_proc = io.ProcessManager(pigz_zip)
        pigz_proc.run_all()
        # Original note: "prevents slowing down STAR alignment". Waiting here also
        # means the upload below is only started after compression was awaited.
        pigz_proc.wait_until_complete()
        merged_fastq += '.gz'  # reflect gzipped nature of file

        log.info('Uploading gzipped merged fastq file to S3.')
        merge_upload = 'aws s3 mv {fname} {s3link}'.format(
            fname=merged_fastq, s3link=aws_upload_key)
        upload_manager = io.ProcessManager(merge_upload)
        upload_manager.run_all()
    else:
        log.info('Removing merged fastq file for memory management.')
        rm_merged = 'rm %s' % merged_fastq
        io.ProcessManager(rm_merged).run_all()
        upload_manager = None

    return bamfile, upload_manager
def merge_fastq_files(
        technology_platform, barcode_fastq: [str], output_stem: str,
        genomic_fastq: [str]) -> str:
    """annotates genomic fastq with barcode information; merging the two files.

    The input fastq files are deleted (via an ``rm`` command) once the merge call
    has returned.

    :param technology_platform: class from platforms.py that defines the
      characteristics of the data being processed; its ``merge_function``
      attribute is passed to fastq.merge_paired
    :param barcode_fastq: list of str names of fastq files containing barcode
      information
    :param output_stem: str, stem for output files; the merged file is written to
      ``output_stem + '_merged.fastq'``
    :param genomic_fastq: list of str names of fastq files containing genomic
      information
    :returns str merged_fastq: name of merged fastq file, as returned by
      fastq.merge_paired
    """
    log.info('Merging genomic reads and barcode annotations.')
    merged_fastq = fastq.merge_paired(
        merge_function=technology_platform.merge_function,
        fout=output_stem + '_merged.fastq',
        genomic=genomic_fastq,
        barcode=barcode_fastq)

    # delete genomic/barcode fastq files after merged.fastq creation; a single
    # `rm` command is issued for all input files at once
    log.info('Removing original fastq file for memory management.')
    delete_fastq = ' '.join(['rm'] + genomic_fastq + barcode_fastq)
    io.ProcessManager(delete_fastq).run_all()

    return merged_fastq