def merge_fastq_files(
        technology_platform, barcode_fastq: [str], output_stem: str,
        genomic_fastq: [str]) -> str:
    """Annotate genomic fastq with barcode information, merging the two files.

    Once the merged file has been produced, the original genomic and barcode
    fastq files are removed by running an ``rm`` command through
    ``io.ProcessManager``.

    :param technology_platform: class from platforms.py that defines the
      characteristics of the data being processed; its ``merge_function``
      attribute is handed to ``fastq.merge_paired``
    :param barcode_fastq: list of str names of fastq files containing barcode
      information
    :param output_stem: str, stem for output files; the merged file is requested
      as ``output_stem + '_merged.fastq'``
    :param genomic_fastq: list of str names of fastq files containing genomic
      information
    :returns str merged_fastq: the value returned by ``fastq.merge_paired``
      (name of the merged fastq file)
    """
    log.info('Merging genomic reads and barcode annotations.')
    merged_fastq = fastq.merge_paired(
        merge_function=technology_platform.merge_function,
        fout=output_stem + '_merged.fastq',
        genomic=genomic_fastq,
        barcode=barcode_fastq)

    # delete genomic/barcode fastq files after merged.fastq creation
    originals = list(genomic_fastq) + list(barcode_fastq)
    if originals:
        # only build the command when there is something to delete: a bare
        # `rm` with no operands has nothing to remove
        log.info('Removing original fastq file for memory management.')
        delete_fastq = ' '.join(['rm'] + originals)
        io.ProcessManager(delete_fastq).run_all()

    return merged_fastq
def __exit__(self, exc_type, exc_val, exc_tb):
    """Log the outcome of the managed block, notify the user, upload the log and
    optionally terminate the instance.

    Steps, in order:

    1. log the exception (if any) and pick the email body;
    2. email the log to ``self.email`` when both ``self.email`` and ``self.mutt``
       are truthy;
    3. upload the log to ``self.aws_upload_key`` when it is set;
    4. when ``self.terminate`` is set, terminate the instance -- unless an
       exception occurred while ``self.debug`` is set, or no instance id could
       be determined.

    Nothing truthy is ever returned, so an exception raised inside the managed
    block is not suppressed.

    :param exc_type: type of exception encountered
    :param exc_val: value of exception
    :param exc_tb: exception traceback
    """
    # log any exceptions, set email body based on error / terminate status
    if exc_type is not None:
        log.exception()
        email_body = 'Process interrupted -- see attached error message'
    else:
        email_body = 'Process completed successfully -- see attached log'
        if self.terminate:
            log.info(
                'Execution completed successfully, instance will be terminated.'
            )
        else:
            log.info('Execution completed successfully, but user requested no '
                     'termination. Instance will continue to run.')

    # todo this is the source of the second email for successful runs
    # email user if possible; a failed email is logged and does not abort clean-up
    can_email = self.email and self.mutt
    if can_email:
        log.notify('Emailing user.')
        try:
            self.email_user(
                attachment=self.log_name,
                email_body=email_body,
                email_address=self.email)
        except ChildProcessError:
            log.exception()

    # upload data if requested
    upload_target = self.aws_upload_key
    if upload_target:
        log.notify('Uploading log to {}'.format(upload_target))
        bucket, key = io.S3.split_link(upload_target)

        @Retry(catch=Exception)
        def upload_file():
            io.S3.upload_file(self.log_name, bucket, key)

        upload_file()

    # everything below concerns instance termination only
    if not self.terminate:
        return
    if exc_type and self.debug:
        return  # don't terminate if an error was raised and debug was set

    instance_id = self._get_instance_id()
    if instance_id is None:
        return  # todo notify if verbose

    instance = boto3.resource('ec2').Instance(instance_id)
    log.notify(
        'instance %s termination requested. If successful, this is the '
        'final log entry.' % instance_id)
    instance.terminate()
    instance.wait_until_terminated()
def _fileidentity_from_ls(cls, line): line = line.strip().split() try: return cls._FileIdentity(line[8], line[4], line[5], line[6].rpartition('.')[0]) except: log.info(repr(line)) raise
def apply_rmt_correction(self, ra, error_rate):
    """RMT correction hook that performs no correction.

    Neither argument is read; the method only logs that RMT correction is not
    supported for Mars-seq barcodes.

    :param ra: Read array (unused)
    :param error_rate: Error rate table from apply_barcode_correction (unused)
    :returns: None
    """
    message = 'Mars-seq barcodes do not support RMT correction'
    log.info(message)
    return None
def put_file(self, local_file, remote_file):
    """Copy a file from the local machine onto the remote instance over SFTP.

    A connection is established first if ``self.is_connected()`` reports none.
    The SFTP session opened from ``self.ssh`` is always closed, even when the
    transfer raises.

    :param local_file: name of file to be copied to remote instance
    :param remote_file: name of file placed remotely
    """
    if not self.is_connected():
        self.connect()

    sftp = self.ssh.open_sftp()
    try:
        sftp.put(local_file, remote_file)
        log.info('placed {lfile} at {rfile}.'.format(
            lfile=local_file, rfile=remote_file))
    finally:
        sftp.close()
def create_index(
    self,
    ensemble_release: int,
    read_length: int,
    valid_biotypes=("protein_coding", "lincRNA"),
    s3_location: str = None,
):
    """Create an index and optionally upload it.

    Runs, in order: download of the Ensembl files, gene subsetting, STAR index
    creation and -- only when ``s3_location`` is truthy -- the upload.

    :param ensemble_release: release passed to ``self._download_ensembl_files``
    :param read_length: read length passed to ``self._create_star_index``
    :param valid_biotypes: gene biotypes that do not match values in this list
      will be discarded from the annotation and will not appear in final count
      matrices
    :param s3_location: optional, s3 location to upload the index to.
    :return: None
    """
    log.info("Downloading Ensemble files...")
    self._download_ensembl_files(ensemble_release)

    log.info("Subsetting genes...")
    self._subset_genes(valid_biotypes=valid_biotypes)

    log.info("Creating STAR index...")
    self._create_star_index(read_length=read_length)

    # the upload is optional; nothing more to do without a destination
    if not s3_location:
        return

    log.info("Uploading...")
    local_index = "%s/%s" % (self.index_folder_name, self.organism)
    self._upload_index(local_index, s3_location)
def download(cls, platform, sample_id: str, dest_path: str,
             access_token: str = None) -> (list, list):
    """Download all files related to a sample from the BaseSpace API.

    :param platform: the type of data that is being downloaded; when the string
      ``'mars'`` occurs in it, ``_R1_`` files are treated as genomic and
      ``_R2_`` files as barcode, otherwise the roles are swapped
    :param sample_id: The sample id, taken directly from the basespace link for
      a sample (experiment). e.g. if the link is:
      "https://basespace.illumina.com/sample/30826030/Day0-ligation-11-17", then
      the sample_id is "30826030"
    :param access_token: a string access token that allows permission to access
      the ILLUMINA BaseSpace server and download the requested data. Access
      tokens can be obtained by (1) logging into
      https://developer.basespace.illumina.com, (2), creating a "new
      application", and (3) going to the credentials tab of that application to
      obtain the access token. Although the parameter defaults to None, it is
      concatenated into the request URL, so a str must be supplied.
    :param dest_path: the location that the downloaded files should be placed.
    :returns: (list, list), barcode, genomic: sorted lists of the fastq file
      names reported by BaseSpace (names as listed by the API, without any
      directory prefix)
    :raises ValueError: if BaseSpace lists no files for the sample
    """
    # validity of response will already have been checked
    response = requests.get(
        'https://api.basespace.illumina.com/v1pre3/samples/' + sample_id +
        '/files?Extensions=gz&access_token=' + access_token)
    items = response.json()['Response']['Items']

    func = partial(
        cls._download_basespace_content, items, access_token, dest_path)
    log.info('BaseSpace API link provided, downloading files from BaseSpace.')

    if not items:
        # Pool(0) would raise a ValueError about the process count; fail with
        # the same exception type but say what actually went wrong
        raise ValueError(
            'BaseSpace returned no files for sample %s' % sample_id)

    # one worker per file; each worker receives the index of the item to fetch
    with Pool(len(items)) as pool:
        pool.map(func, range(len(items)))

    # get downloaded barcode and genomic fastq files
    filenames = [f['Name'] for f in items]

    # NOTE(review): the original code appended '/Data/Intensities/BaseCalls/'
    # to dest_path at this point ("fixed location for how BaseSpace installs
    # files") but never used the result, so the returned names carry no
    # directory. Confirm whether callers expect full paths.
    read_1 = [f for f in filenames if '_R1_' in f]
    read_2 = [f for f in filenames if '_R2_' in f]
    if 'mars' not in platform:
        barcode_fastq, genomic_fastq = read_1, read_2
    else:
        genomic_fastq, barcode_fastq = read_1, read_2

    return sorted(barcode_fastq), sorted(genomic_fastq)
def merge_fastq_files(
    technology_platform,
    barcode_fastq: [str],
    output_stem: str,
    genomic_fastq: [str],
) -> str:
    """Annotate genomic fastq with barcode information, merging the two files.

    Both file lists are sorted before merging, and each (barcode, genomic) pair
    is logged. The input fastq files are left in place.

    :param technology_platform: class from platforms.py that defines the
      characteristics of the data being processed; its ``merge_function``
      attribute is handed to ``fastq.merge_paired``
    :param barcode_fastq: list of str names of fastq files containing barcode
      information
    :param output_stem: str, stem for output files; the merged file is requested
      as ``output_stem + "_merged.fastq"``
    :param genomic_fastq: list of str names of fastq files containing genomic
      information
    :returns str merged_fastq: the value returned by ``fastq.merge_paired``
      (name of the merged fastq file)
    """
    # hack:
    # Due to the non-platform agnostic glob behavior,
    # it is possible that L001_R1 is merged with L002_R2 (not L001_R2).
    # to avoid this problem, we first sort.
    # this is a temporary hacky solution
    barcode_fastq = sorted(barcode_fastq)
    genomic_fastq = sorted(genomic_fastq)

    log.info("Merging genomic reads and barcode annotations.")
    if len(barcode_fastq) != len(genomic_fastq):
        # zip() below stops at the shorter list, so without this notice the
        # pairing log would silently omit the unmatched files
        log.info(
            "Number of barcode fastq files ({}) differs from number of genomic "
            "fastq files ({}); unpaired files are not listed.".format(
                len(barcode_fastq), len(genomic_fastq)))
    for bar_fq, gen_fq in zip(barcode_fastq, genomic_fastq):
        log.info("Merge {} with {}".format(
            os.path.basename(bar_fq), os.path.basename(gen_fq)))

    merged_fastq = fastq.merge_paired(
        merge_function=technology_platform.merge_function,
        fout=output_stem + "_merged.fastq",
        genomic=genomic_fastq,
        barcode=barcode_fastq,
    )

    # the original genomic/barcode fastq files are intentionally not deleted
    return merged_fastq
def _download_gtf_file(self, ftp, download_name: str,
                       ensemble_release: int) -> None:
    """Download the gtf file for ``self.organism`` from ftp, an open Ensembl FTP
    server.

    :param FTP ftp: open FTP link to ENSEMBL
    :param str download_name: filename for downloaded gtf file
    :param int ensemble_release: Ensembl release to download; when falsy (None
      or 0) the release reported by ``self._identify_newest_release(ftp)`` is
      used instead
    """
    release_num = ensemble_release or self._identify_newest_release(ftp)
    work_dir = "/pub/release-%d/gtf/%s/" % (release_num, self.organism)
    ftp.cwd(work_dir)
    ensembl_gtf_filename = self._identify_gtf_file(ftp.nlst(), release_num)

    log.info("GTF Ensemble Release {}".format(release_num))
    # work_dir already ends with "/", so plain formatting yields the URL; unlike
    # os.path.join this never inserts an OS-specific separator and never drops
    # the directory part
    log.info("ftp://{}{}{}".format(ftp.host, work_dir, ensembl_gtf_filename))

    with open(download_name, "wb") as f:
        ftp.retrbinary("RETR %s" % ensembl_gtf_filename, f.write)
def align_fastq_records(
        merged_fastq, dir_, star_args, star_index, n_proc,
        aws_upload_key) -> (str, io.ProcessManager):
    """
    Align fastq records.

    After alignment the merged fastq file is either gzipped and moved to S3
    (when ``aws_upload_key`` is given) or removed. ``pigz`` is not a parameter:
    it is looked up in the enclosing scope and selects pigz over gzip when
    truthy.

    :param merged_fastq: str, path to merged .fastq file
    :param dir_: str, stem for output files; alignments are written to
      ``dir_ + '/alignments/'``, which is created if missing
    :param star_args: iterable of ``'key=value'`` strings with extra keyword
      arguments for STAR, or None. Each entry is split on its first ``=`` only,
      so values may themselves contain ``=``.
    :param star_index: str, file path to directory containing STAR index
    :param n_proc: int, number of STAR processes to initiate
    :param aws_upload_key: str, location to upload files, or None if seqc was
      initiated from a merged fastq file.
    :return bamfile, upload_manager: (str, io.ProcessManager)
      the value returned by ``star.align`` (alignment file name), and the
      ProcessManager running the merged-fastq upload (None when nothing is
      uploaded)
    """
    log.info('Aligning merged fastq records.')
    alignment_directory = dir_ + '/alignments/'
    os.makedirs(alignment_directory, exist_ok=True)

    if star_args is not None:
        # maxsplit=1: only the first '=' separates key from value
        star_kwargs = dict(a.strip().split('=', 1) for a in star_args)
    else:
        star_kwargs = {}

    bamfile = star.align(
        merged_fastq, star_index, n_proc, alignment_directory,
        **star_kwargs)

    if aws_upload_key:
        log.info('Gzipping merged fastq file.')
        if pigz:
            pigz_zip = "pigz --best -k -f {fname}".format(fname=merged_fastq)
        else:
            pigz_zip = "gzip -kf {fname}".format(fname=merged_fastq)
        pigz_proc = io.ProcessManager(pigz_zip)
        pigz_proc.run_all()
        pigz_proc.wait_until_complete()  # prevents slowing down STAR alignment
        merged_fastq += '.gz'  # reflect gzipped nature of file

        log.info('Uploading gzipped merged fastq file to S3.')
        merge_upload = 'aws s3 mv {fname} {s3link}'.format(
            fname=merged_fastq, s3link=aws_upload_key)
        upload_manager = io.ProcessManager(merge_upload)
        upload_manager.run_all()
    else:
        log.info('Removing merged fastq file for memory management.')
        rm_merged = 'rm %s' % merged_fastq
        io.ProcessManager(rm_merged).run_all()
        upload_manager = None

    return bamfile, upload_manager
def _download_fasta_file(self, ftp: FTP, download_name: str,
                         ensemble_release: int) -> None:
    """Fetch the genome fasta file for ``self.organism`` from an open Ensembl
    FTP connection.

    :param FTP ftp: open FTP link to ENSEMBL
    :param str download_name: filename for downloaded fasta file
    :param int ensemble_release: release to download; when falsy, the release
      returned by ``self._identify_newest_release(ftp)`` is used
    """
    if ensemble_release:
        release_num = ensemble_release
    else:
        release_num = self._identify_newest_release(ftp)

    fasta_dir = "/pub/release-%d/fasta/%s/dna" % (release_num, self.organism)
    ftp.cwd(fasta_dir)
    fasta_name = self._identify_genome_file(ftp.nlst())

    log.info("FASTA Ensemble Release {}".format(release_num))
    source_url = "ftp://{}{}/{}".format(ftp.host, fasta_dir, fasta_name)
    log.info(source_url)

    # stream the remote file straight into the local file
    with open(download_name, "wb") as out_handle:
        ftp.retrbinary("RETR %s" % fasta_name, out_handle.write)
def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                      max_transcript_length):
    """Build a ReadArray from an alignment file, then upload or remove that file.

    ``args`` is not a parameter: ``args.output_prefix`` is read from the
    enclosing scope when the upload destination is built.

    :param str bamfile: filename of .bam file
    :param str index: directory containing index files; ``'annotations.gtf'`` is
      appended directly, so the value must end with a path separator
    :param str aws_upload_key: key where aws files should be uploaded; when
      falsy the bamfile is removed instead
    :param int min_poly_t: minimum number of poly_t nucleotides for a read to be
      valid
    :param max_transcript_length: forwarded to ``GeneIntervals``
    :returns ReadArray, UploadManager: ReadArray object, bamfile ProcessManager
      (None when nothing is uploaded)
    """
    log.info('Filtering aligned records and constructing record database.')

    # Construct translator
    gtf_path = index + 'annotations.gtf'
    translator = GeneIntervals(
        gtf_path, max_transcript_length=max_transcript_length)
    read_array = ReadArray.from_alignment_file(bamfile, translator, min_poly_t)

    # without an upload destination the bamfile is simply removed
    if not aws_upload_key:
        log.info('Removing bamfile for memory management.')
        io.ProcessManager('rm %s' % bamfile).run_all()
        return read_array, None

    log.info('Uploading bam file to S3.')
    upload_bam = 'aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam'.format(
        fname=bamfile, s3link=aws_upload_key, prefix=args.output_prefix)
    print(upload_bam)
    upload_manager = io.ProcessManager(upload_bam)
    upload_manager.run_all()
    return read_array, upload_manager
def notebook(args):
    """Run the notebook sub-command selected by ``args.subsubparser_name``.

    * ``'merge'``: build a ``Notebook`` from ``args.output_filename`` and the
      inputs in ``args.input_data`` and merge them under the sample name
      ``args.output_filename``.
    * ``'generate'``: build a ``Notebook`` from ``args.output_stem`` and
      ``args.input_count_matrix``, write its template and run it.

    Any other sub-command is ignored.

    :param args: parsed command-line namespace
    """
    if args.subsubparser_name == 'merge':
        # need to also take a output directory because this thing will write
        # stuff, then merge the things
        n = Notebook(args.output_filename, *args.input_data)
        n.merge_data(merged_sample_name=args.output_filename)
        # report the destination, not the list of inputs that were merged
        log.info('Merged samples written to %s' % args.output_filename)
    elif args.subsubparser_name == 'generate':
        n = Notebook(args.output_stem, args.input_count_matrix)
        n.write_template()
        log.info('Notebook Template written to %s' % n.notebook_path)
        n.run_notebook()
        log.info('Notebook Run and written to %s' % n.notebook_path)
def run(args) -> None:
    """Run SEQC on the files provided in args, given specifications provided on
    the command line.

    The pipeline is: download inputs -> merge fastq -> align -> build the read
    array -> barcode/RMT correction -> count matrices -> filtering -> summaries
    -> optional upload to S3 -> optional email. Steps before the supplied input
    type (merged fastq, alignment file, read array) are skipped.

    :param args: parsed argv, produced by seqc.parser(). This function is only
      called when args.subprocess_name is "run".
    """

    # import inside module for pickle functionality
    # top 2 only needed for post-filtering

    import os
    import multiprocessing
    from seqc import log, ec2, platforms, io, version
    from seqc.sequence import fastq
    from seqc.alignment import star
    from seqc.alignment import sam
    from seqc.email_ import email_user
    from seqc.read_array import ReadArray
    from seqc.core import verify, download
    from seqc import filter
    from seqc.sequence.gtf import GeneIntervals
    from seqc.summary.summary import Section, Summary
    import numpy as np
    import scipy.io
    from shutil import copyfile
    from shutil import move as movefile
    from seqc.summary.summary import MiniSummary
    from seqc.stats.mast import run_mast
    import logging
    import pickle
    import pendulum

    def determine_start_point(arguments) -> (bool, bool, bool):
        """
        determine where seqc should start based on which parameters were passed.

        :param arguments: Namespace object, result of ArgumentParser.parse_args()
        :returns merge, align, process_bamfile: indicates whether merging,
          alignment, and processing bamfiles should be executed.
        """
        if arguments.read_array:
            return False, False, False
        if arguments.alignment_file:
            return False, False, True
        if arguments.merged_fastq:
            return False, True, True
        return True, True, True

    def download_input(dir_, arguments):
        """parse input arguments and download any necessary data

        :param str dir_: directory to download data to
        :param arguments: namespace object from argparse
        :return args: updated namespace object reflecting local file paths of
          downloaded files
        """
        # download basespace data if necessary
        if arguments.basespace:
            arguments.barcode_fastq, arguments.genomic_fastq = \
                io.BaseSpace.download(
                    arguments.platform,
                    arguments.basespace,
                    dir_,
                    arguments.basespace_token)

        # get a list of input FASTQ files
        # download from AWS S3 if the URI is prefixed with s3://
        arguments.genomic_fastq = download.s3_data(
            arguments.genomic_fastq, dir_ + "/genomic_fastq/")
        arguments.barcode_fastq = download.s3_data(
            arguments.barcode_fastq, dir_ + "/barcode_fastq/")

        # get merged fastq file, unzip if necessary
        arguments.merged_fastq = (
            download.s3_data([arguments.merged_fastq], dir_ + "/")[0]
            if arguments.merged_fastq is not None else None)

        # get a path to the STAR index files
        # download from AWS S3 if the URI is prefixed with s3://
        if any((arguments.alignment_file, arguments.read_array)):
            # only the annotation is needed when alignment is skipped
            index_link = arguments.index + "annotations.gtf"
        else:
            index_link = arguments.index
        index_files = download.s3_data([index_link], dir_ + "/index/")
        # use the first filename in the list to get the index directory
        # add a trailing slash to make the rest of the code not break;
        # e.g. test-data/index/chrStart.txt --> test-data/index/
        arguments.index = os.path.dirname(index_files[0]) + "/"

        # get a list of whitelisted barcodes files
        # download from AWS S3 if the URI is prefixed with s3://
        arguments.barcode_files = download.s3_data(
            arguments.barcode_files, dir_ + "/barcodes/")

        # check if `alignment_file` is specified
        if arguments.alignment_file:
            # get the alignment filename (*.bam)
            # download from AWS S3 if the URI is prefixed with s3://
            arguments.alignment_file = download.s3_data(
                [arguments.alignment_file], dir_ + "/")[0]

        # check if `read_array` is specified
        if arguments.read_array:
            # get the readarray filename (*.h5)
            # download from AWS S3 if the URI is prefixed with s3://
            arguments.read_array = download.s3_data(
                [arguments.read_array], dir_ + "/")[0]

        return arguments

    def merge_fastq_files(
        technology_platform,
        barcode_fastq: [str],
        output_stem: str,
        genomic_fastq: [str],
    ) -> str:
        """annotates genomic fastq with barcode information; merging the two
        files.

        :param technology_platform: class from platforms.py that defines the
          characteristics of the data being processed
        :param barcode_fastq: list of str names of fastq files containing
          barcode information
        :param output_stem: str, stem for output files
        :param genomic_fastq: list of str names of fastq files containing
          genomic information
        :returns str merged_fastq: name of merged fastq file
        """
        # hack:
        # Due to the non-platform agnostic glob behavior,
        # it is possible that L001_R1 is merged with L002_R2 (not L001_R2).
        # to avoid this problem, we first sort.
        # this is a temporary hacky solution
        barcode_fastq = sorted(barcode_fastq)
        genomic_fastq = sorted(genomic_fastq)

        log.info("Merging genomic reads and barcode annotations.")
        for bar_fq, gen_fq in zip(barcode_fastq, genomic_fastq):
            log.info("Merge {} with {}".format(
                os.path.basename(bar_fq), os.path.basename(gen_fq)))

        merged_fastq = fastq.merge_paired(
            merge_function=technology_platform.merge_function,
            fout=output_stem + "_merged.fastq",
            genomic=genomic_fastq,
            barcode=barcode_fastq,
        )

        # the original genomic/barcode fastq files are intentionally kept
        return merged_fastq

    def align_fastq_records(merged_fastq, dir_, star_args, star_index, n_proc,
                            aws_upload_key) -> (str, io.ProcessManager):
        """
        Align fastq records.

        :param merged_fastq: str, path to merged .fastq file
        :param dir_: str, stem for output files
        :param star_args: iterable of 'key=value' strings with extra keyword
          arguments for STAR, or None; each entry is split on its first '='
        :param star_index: str, file path to directory containing STAR index
        :param n_proc: int, number of STAR processes to initiate
        :param aws_upload_key: str, location to upload files, or None if seqc
          was initiated from a merged fastq file.
        :return bamfile, upload_manager: (str, io.ProcessManager)
          name of the file containing aligned reads, and a ProcessManager for
          the merged fastq upload (None when nothing is uploaded)
        """
        log.info("Aligning merged fastq records.")
        alignment_directory = dir_ + "/alignments/"
        os.makedirs(alignment_directory, exist_ok=True)

        if star_args is not None:
            # maxsplit=1 so that values containing '=' survive intact
            star_kwargs = dict(a.strip().split("=", 1) for a in star_args)
        else:
            star_kwargs = {}

        bamfile = star.align(merged_fastq, star_index, n_proc,
                             alignment_directory, **star_kwargs)

        # the merged fastq is always compressed in place (no copy is kept);
        # `pigz` comes from the enclosing run() scope
        log.info("Gzipping merged fastq file.")
        if pigz:
            pigz_zip = "pigz --best -f {fname}".format(fname=merged_fastq)
        else:
            pigz_zip = "gzip -f {fname}".format(fname=merged_fastq)
        pigz_proc = io.ProcessManager(pigz_zip)
        pigz_proc.run_all()
        pigz_proc.wait_until_complete()  # prevents slowing down STAR alignment
        merged_fastq += ".gz"  # reflect gzipped nature of file

        if aws_upload_key:
            log.info("Uploading gzipped merged fastq file to S3.")
            merge_upload = "aws s3 mv {fname} {s3link}".format(
                fname=merged_fastq, s3link=aws_upload_key)
            upload_manager = io.ProcessManager(merge_upload)
            upload_manager.run_all()
        else:
            # the gzipped merged fastq is kept locally
            upload_manager = None

        return bamfile, upload_manager

    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Create or download a ReadArray object.

        :param max_transcript_length: forwarded to GeneIntervals
        :param str bamfile: filename of .bam file
        :param str index: directory containing index files
        :param str aws_upload_key: key where aws files should be uploaded
        :param int min_poly_t: minimum number of poly_t nucleotides for a read
          to be valid
        :returns ReadArray, UploadManager, read_names: ReadArray object, bamfile
          ProcessManager (None when nothing is uploaded), and the read names
          returned by ReadArray.from_alignment_file
        """
        log.info("Filtering aligned records and constructing record database.")
        # Construct translator
        translator = GeneIntervals(
            index + "annotations.gtf",
            max_transcript_length=max_transcript_length)
        read_array, read_names = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        # upload the bam file to S3, else keep it next to the other outputs
        if aws_upload_key:
            log.info("Uploading bam file to S3.")
            upload_bam = \
                "aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam".format(
                    fname=bamfile,
                    s3link=aws_upload_key,
                    prefix=args.output_prefix)
            print(upload_bam)
            upload_manager = io.ProcessManager(upload_bam)
            upload_manager.run_all()
        else:
            if os.path.exists(bamfile):
                movefile(bamfile, args.output_prefix + "_Aligned.out.bam")
            upload_manager = None

        return read_array, upload_manager, read_names

    # ######################## MAIN FUNCTION BEGINS HERE #####################

    log.setup_logger(args.log_name, args.debug)

    with ec2.instance_clean_up(
        email=args.email,
        upload=args.upload_prefix,
        log_name=args.log_name,
        debug=args.debug,
        terminate=args.terminate,
        running_remote=args.remote,
    ):

        start_run_time = pendulum.now()

        log.notify("SEQC=v{}".format(version.__version__))
        log.notify("STAR=v{}".format(star.get_version()))
        log.notify("samtools=v{}".format(sam.get_version()))

        pigz, mutt = verify.executables("pigz", "mutt")
        if mutt:
            log.notify(
                "mutt executable identified, email will be sent when run "
                "terminates. "
            )
        else:
            log.notify(
                "mutt was not found on this machine; an email will not be sent to "
                "the user upon termination of SEQC run.")

        is_ten_x = args.platform in ("ten_x", "ten_x_v2", "ten_x_v3")

        # turn off lower coverage filter for 10x
        if is_ten_x:
            args.filter_low_coverage = False

        if args.platform == "ten_x_v2" or args.platform == "ten_x_v3":
            log.notify("Setting min_poly_t=0 for 10x v2 & v3")
            args.min_poly_t = 0

        # the user-supplied value is used everywhere (including snRNA-seq,
        # e.g. 2304700 for hg38, 4434881 for mm38) except for 10x scRNA-seq
        max_insert_size = args.max_insert_size
        if args.filter_mode == "scRNA-seq" and is_ten_x:
            # set max_transcript_length (max_insert_size) = 10000
            max_insert_size = 10000
            log.notify(
                "Full length transcripts are used for read mapping in 10x data."
            )

        log.notify("max_insert_size is set to {}".format(max_insert_size))

        log.args(args)

        # e.g.
        # --output-prefix=test-data/_outs/test
        # output_dir=test-data/_outs
        # output_prefix=test
        output_dir, output_prefix = os.path.split(args.output_prefix)
        if not output_dir:
            output_dir = "."
        else:
            os.makedirs(output_dir, exist_ok=True)

        # check if the platform name provided is supported by seqc
        # todo move into verify for run
        platform_name = verify.platform_name(args.platform)
        platform = platforms.AbstractPlatform.factory(
            platform_name)  # returns platform

        # leave one processor free, but never ask for zero processes on a
        # single-core machine
        n_processes = max(1, multiprocessing.cpu_count() - 1)

        merge, align, process_bamfile = determine_start_point(args)

        args = download_input(output_dir, args)

        if args.platform == "in_drop_v5":
            platform = platform.build_cb2_barcodes(args.barcode_files)
            log.notify("Built cb2 barcode hash for v5 barcodes.")

        if merge:
            if args.min_poly_t is None:
                # estimate min_poly_t if it was not provided
                args.min_poly_t = filter.estimate_min_poly_t(
                    args.barcode_fastq, platform)
                log.notify("Estimated min_poly_t={!s}".format(args.min_poly_t))

            args.merged_fastq = merge_fastq_files(
                platform, args.barcode_fastq, args.output_prefix,
                args.genomic_fastq)

        # SEQC was started from input other than fastq files
        if args.min_poly_t is None:
            args.min_poly_t = 0
            log.warn(
                "Warning: SEQC started from step other than unmerged fastq with "
                "empty --min-poly-t parameter. Continuing with --min-poly-t 0."
            )

        if align:
            # only upload the merged fastq when this run produced it
            upload_merged = args.upload_prefix if merge else None
            args.alignment_file, manage_merged = align_fastq_records(
                args.merged_fastq,
                output_dir,
                args.star_args,
                args.index,
                n_processes,
                upload_merged,
            )
        else:
            manage_merged = None

        if process_bamfile:
            # if the starting point was a BAM file
            # (i.e. args.alignment_file=*.bam & align=False)
            # do not upload by setting this to None
            upload_bamfile = args.upload_prefix if align else None

            ra, manage_bamfile, read_names = create_read_array(
                args.alignment_file,
                args.index,
                upload_bamfile,
                args.min_poly_t,
                max_insert_size,
            )
        else:
            manage_bamfile = None
            ra = ReadArray.load(args.read_array)
            # fixme: the old read_array doesn't have read_names
            read_names = None

        # create the first summary section here
        status_filters_section = Section.from_status_filters(
            ra, "initial_filtering.html")
        sections = [status_filters_section]

        # Skip over the corrections if read array is specified by the user
        if not args.read_array:

            # Correct barcodes
            log.info("Correcting barcodes and estimating error rates.")
            error_rate, df_cb_correction = platform.apply_barcode_correction(
                ra, args.barcode_files)
            if df_cb_correction is not None and len(df_cb_correction) > 0:
                df_cb_correction.to_csv(
                    args.output_prefix + "_cb-correction.csv.gz",
                    index=False,
                    compression="gzip",
                )

            # Resolve multimapping
            log.info("Resolving ambiguous alignments.")
            mm_results = ra.resolve_ambiguous_alignments()

            # correct errors
            log.info("Identifying RMT errors.")
            df_umi_correction = platform.apply_rmt_correction(ra, error_rate)
            if df_umi_correction is not None and len(df_umi_correction) > 0:
                df_umi_correction.to_csv(
                    args.output_prefix + "_umi-correction.csv.gz",
                    index=False,
                    compression="gzip",
                )

            # Apply low coverage filter
            if platform.filter_lonely_triplets:
                log.info("Filtering lonely triplet reads")
                ra.filter_low_coverage(alpha=args.low_coverage_alpha)

            log.info("Saving read array.")
            ra.save(args.output_prefix + ".h5")

            # Summary sections
            # create the sections for the summary object
            sections += [
                Section.from_cell_barcode_correction(
                    ra, "cell_barcode_correction.html"),
                Section.from_rmt_correction(ra, "rmt_correction.html"),
                Section.from_resolve_multiple_alignments(
                    mm_results, "multialignment.html"),
            ]

        # create a dictionary to store output parameters
        mini_summary_d = dict()

        # filter non-cells
        log.info("Creating counts matrix.")
        sp_reads, sp_mols = ra.to_count_matrix(
            sparse_frame=True,
            genes_to_symbols=args.index + "annotations.gtf")

        # generate 10x compatible count matrix
        log.info("Creating 10x compatible counts matrix.")
        ra.to_10x_count_matrix(genes_to_symbols=args.index + "annotations.gtf")

        # Save sparse matrices
        log.info("Saving sparse matrices")
        scipy.io.mmwrite(
            args.output_prefix + "_sparse_read_counts.mtx", sp_reads.data)
        scipy.io.mmwrite(
            args.output_prefix + "_sparse_molecule_counts.mtx", sp_mols.data)
        # Indices
        df = np.array([np.arange(sp_reads.shape[0]), sp_reads.index]).T
        np.savetxt(
            args.output_prefix + "_sparse_counts_barcodes.csv",
            df,
            fmt="%d",
            delimiter=",",
        )
        # Columns
        df = np.array([np.arange(sp_reads.shape[1]), sp_reads.columns]).T
        np.savetxt(
            args.output_prefix + "_sparse_counts_genes.csv",
            df,
            fmt="%s",
            delimiter=",",
        )

        log.info("Creating filtered counts matrix.")
        cell_filter_figure = args.output_prefix + "_cell_filters.png"

        # By pass low count filter for mars seq
        (
            sp_csv,
            total_molecules,
            molecules_lost,
            cells_lost,
            cell_description,
        ) = filter.create_filtered_dense_count_matrix(
            sp_mols,
            sp_reads,
            mini_summary_d,
            plot=True,
            figname=cell_filter_figure,
            filter_low_count=platform.filter_low_count,
            filter_mitochondrial_rna=args.filter_mitochondrial_rna,
            filter_low_coverage=args.filter_low_coverage,
            filter_low_gene_abundance=args.filter_low_gene_abundance,
        )

        # Output files
        files = [
            cell_filter_figure,
            args.output_prefix + ".h5",
            args.output_prefix + "_sparse_read_counts.mtx",
            args.output_prefix + "_sparse_molecule_counts.mtx",
            args.output_prefix + "_sparse_counts_barcodes.csv",
            args.output_prefix + "_sparse_counts_genes.csv",
            "raw_feature_bc_matrix/matrix.mtx.gz",
            "raw_feature_bc_matrix/barcodes.tsv.gz",
            "raw_feature_bc_matrix/features.tsv.gz",
        ]

        # the correction tables only exist when corrections were applied
        if os.path.exists(args.output_prefix + "_cb-correction.csv.gz"):
            files.append(args.output_prefix + "_cb-correction.csv.gz")

        if os.path.exists(args.output_prefix + "_umi-correction.csv.gz"):
            files.append(args.output_prefix + "_umi-correction.csv.gz")

        # Summary sections
        # create the sections for the summary object
        sections += [
            Section.from_cell_filtering(
                cell_filter_figure, "cell_filtering.html"),
            Section.from_run_time(args.log_name, "seqc_log.html"),
        ]

        # get alignment summary
        if os.path.isfile(output_dir + "/alignments/Log.final.out"):
            os.rename(
                output_dir + "/alignments/Log.final.out",
                args.output_prefix + "_alignment_summary.txt",
            )

            # Upload files and summary sections
            files += [args.output_prefix + "_alignment_summary.txt"]
            sections.insert(
                0,
                Section.from_alignment_summary(
                    args.output_prefix + "_alignment_summary.txt",
                    "alignment_summary.html",
                ),
            )

        cell_size_figure = args.output_prefix + "_cell_size_distribution.png"
        index_section = Section.from_final_matrix(
            sp_csv, cell_size_figure, "cell_distribution.html")
        seqc_summary = Summary(
            args.output_prefix + "_summary", sections, index_section)
        seqc_summary.prepare_archive()
        seqc_summary.import_image(cell_filter_figure)
        seqc_summary.import_image(cell_size_figure)
        seqc_summary.render()

        # create a .tar.gz with `test_summary/*`
        summary_archive = seqc_summary.compress_archive()
        files += [summary_archive]

        # Create a mini summary section
        alignment_summary_file = args.output_prefix + "_alignment_summary.txt"
        seqc_mini_summary = MiniSummary(
            output_dir,
            output_prefix,
            mini_summary_d,
            alignment_summary_file,
            cell_filter_figure,
            cell_size_figure,
        )
        seqc_mini_summary.compute_summary_fields(ra, sp_csv)
        seqc_mini_summary_json, seqc_mini_summary_pdf = \
            seqc_mini_summary.render()
        files += [seqc_mini_summary_json, seqc_mini_summary_pdf]

        # Running MAST for differential analysis
        # file storing the list of differentially expressed genes for each
        # cluster
        de_gene_list_file = run_mast(
            seqc_mini_summary.get_counts_filtered(),
            seqc_mini_summary.get_clustering_result(),
            args.output_prefix,
        )
        files += [de_gene_list_file]

        # adding the cluster column and write down gene-cell count matrix
        dense_csv = args.output_prefix + "_dense.csv"
        sp_csv.insert(
            loc=0,
            column="CLUSTER",
            value=seqc_mini_summary.get_clustering_result())
        sp_csv.to_csv(dense_csv)
        files += [dense_csv]

        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in files:
                try:
                    ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key)
                    item_name = item.split("/")[-1]
                    log.info('Successfully uploaded %s to "%s%s".' %
                             (item, args.upload_prefix, item_name))
                except FileNotFoundError:
                    log.notify(
                        "Item %s was not found! Continuing with upload..." %
                        item)

        # wait for the background uploads started during alignment / read
        # array creation
        if manage_merged:
            manage_merged.wait_until_complete()
            log.info('Successfully uploaded %s to "%s"' %
                     (args.merged_fastq, args.upload_prefix))
        if manage_bamfile:
            manage_bamfile.wait_until_complete()
            log.info('Successfully uploaded %s to "%s"' %
                     (args.alignment_file, args.upload_prefix))

        log.info("SEQC run complete.")

        end_run_time = pendulum.now()
        running_time = end_run_time - start_run_time
        log.info("Running Time={}".format(running_time.in_words()))

        # upload logs
        if args.upload_prefix:
            # upload logs (seqc_log.txt, nohup.log)
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in [args.log_name, "./nohup.log"]:
                # Make a copy of the file with the output prefix. Only the
                # file name of the source is appended: embedding the raw path
                # ("./nohup.log") would point into a non-existent directory.
                prefixed_log = (
                    args.output_prefix + "_" + os.path.basename(item))
                try:
                    copyfile(item, prefixed_log)
                    print(prefixed_log)
                    ec2.Retry(retries=5)(io.S3.upload_file)(
                        prefixed_log, bucket, key)
                    log.info('Successfully uploaded %s to "%s".' %
                             (item, args.upload_prefix))
                except FileNotFoundError:
                    log.notify(
                        "Item %s was not found! Continuing with upload..." %
                        item)
        else:
            # move the log to output directory
            movefile(
                args.log_name,
                args.output_prefix + "_" + os.path.basename(args.log_name))

        # todo local test does not send this email
        if mutt:
            email_body = (
                '<font face="Courier New, Courier, monospace">'
                "SEQC RUN COMPLETE.\n\n"
                "The run log has been attached to this email and "
                "results are now available in the S3 location you specified: "
                '"%s"\n\n' % args.upload_prefix)
            email_body = email_body.replace("\n", "<br>").replace("\t", " ")
            email_user(summary_archive, email_body, args.email)
def run(args) -> None:
    """Run SEQC on the files provided in args, given specifications provided on the
    command line.

    The pipeline starts from whichever input is most processed (read array >
    alignment file > merged fastq > raw fastq), then builds count matrices, summary
    reports and a differential expression table, and finally uploads the results and
    logs when ``args.upload_prefix`` is set.

    :param args: parsed argv, produced by seqc.parser(). This function is only called
      when args.subprocess_name is "run".
    """
    # import inside module for pickle functionality
    # top 2 only needed for post-filtering

    import os
    import multiprocessing
    from seqc import log, ec2, platforms, io
    from seqc.sequence import fastq
    from seqc.alignment import star
    from seqc.email_ import email_user
    from seqc.read_array import ReadArray
    from seqc.core import verify, download
    from seqc import filter
    from seqc.sequence.gtf import GeneIntervals
    from seqc.summary.summary import Section, Summary
    import numpy as np
    import scipy.io
    from shutil import copyfile
    from seqc.summary.summary import MiniSummary
    from seqc.stats.mast import run_mast
    import logging

    # Route the 'weasyprint' logger to its own file; level 100 is above
    # logging.CRITICAL (50), so no standard-level record passes the logger.
    logger = logging.getLogger('weasyprint')
    logger.handlers = []  # Remove the default stderr handler
    logger.setLevel(100)
    logger.addHandler(logging.FileHandler('weasyprint.log'))

    def determine_start_point(arguments) -> (bool, bool, bool):
        """Determine where seqc should start based on which parameters were passed.

        :param arguments: Namespace object, result of ArgumentParser.parse_args()
        :returns merge, align, process_bamfile: indicates whether merging, alignment,
          and processing bamfiles should be executed.
        """
        if arguments.read_array:
            return False, False, False
        if arguments.alignment_file:
            return False, False, True
        if arguments.merged_fastq:
            return False, True, True
        return True, True, True

    def download_input(dir_, arguments):
        """Parse input arguments and download any necessary data.

        :param str dir_: directory to download data to
        :param arguments: namespace object from argparse
        :return args: updated namespace object reflecting local file paths of
          downloaded files
        """
        # download basespace data if necessary
        if arguments.basespace:
            arguments.barcode_fastq, arguments.genomic_fastq = io.BaseSpace.download(
                arguments.platform, arguments.basespace, dir_,
                arguments.basespace_token)

        # check for remote fastq file links
        arguments.genomic_fastq = download.s3_data(
            arguments.genomic_fastq, dir_ + '/genomic_fastq/')
        arguments.barcode_fastq = download.s3_data(
            arguments.barcode_fastq, dir_ + '/barcode_fastq/')

        # get merged fastq file, unzip if necessary
        arguments.merged_fastq = (
            download.s3_data([arguments.merged_fastq], dir_ + '/')[0]
            if arguments.merged_fastq is not None else None)

        # when starting after alignment only the annotation file is requested,
        # otherwise the whole index location is passed to the downloader
        if any((arguments.alignment_file, arguments.read_array)):
            index_link = arguments.index + 'annotations.gtf'
        else:
            index_link = arguments.index
        download.s3_data([index_link], dir_ + '/index/')
        arguments.index = dir_ + '/index/'

        # check if barcode files must be downloaded
        arguments.barcode_files = download.s3_data(
            arguments.barcode_files, dir_ + '/barcodes/')

        # check if alignment_file needs downloading
        if arguments.alignment_file:
            arguments.alignment_file = download.s3_data(
                [arguments.alignment_file], dir_ + '/')[0]

        # check if readarray needs downloading
        if arguments.read_array:
            arguments.read_array = download.s3_data(
                [arguments.read_array], dir_ + '/')[0]

        return arguments

    def merge_fastq_files(
            technology_platform, barcode_fastq: [str], output_stem: str,
            genomic_fastq: [str]) -> str:
        """Annotate genomic fastq with barcode information; merging the two files.

        The input fastq files are removed once the merged file has been requested.

        :param technology_platform: class from platforms.py that defines the
          characteristics of the data being processed
        :param barcode_fastq: list of str names of fastq files containing barcode
          information
        :param output_stem: str, stem for output files
        :param genomic_fastq: list of str names of fastq files containing genomic
          information
        :returns str merged_fastq: name of merged fastq file
        """
        log.info('Merging genomic reads and barcode annotations.')
        merged_fastq = fastq.merge_paired(
            merge_function=technology_platform.merge_function,
            fout=output_stem + '_merged.fastq',
            genomic=genomic_fastq,
            barcode=barcode_fastq)

        # delete genomic/barcode fastq files after merged.fastq creation
        log.info('Removing original fastq file for memory management.')
        delete_fastq = ' '.join(['rm'] + genomic_fastq + barcode_fastq)
        io.ProcessManager(delete_fastq).run_all()

        return merged_fastq

    def align_fastq_records(
            merged_fastq, dir_, star_args, star_index, n_proc,
            aws_upload_key) -> (str, io.ProcessManager):
        """Align fastq records.

        :param merged_fastq: str, path to merged .fastq file
        :param dir_: str, stem for output files
        :param star_args: iterable of "key=value" strings holding extra keyword
          arguments for STAR, or None
        :param star_index: str, file path to directory containing STAR index
        :param n_proc: int, number of STAR processes to initiate
        :param aws_upload_key: str, location to upload files, or None if seqc was
          initiated from a merged fastq file.
        :return bamfile, upload_manager: (str, io.ProcessManager) name of the file
          containing aligned reads, and a ProcessManager for the merged fastq upload
          (None when nothing is uploaded)
        """
        log.info('Aligning merged fastq records.')
        alignment_directory = dir_ + '/alignments/'
        os.makedirs(alignment_directory, exist_ok=True)
        if star_args is not None:
            # split on the first '=' only so values may themselves contain '='
            star_kwargs = dict(a.strip().split('=', 1) for a in star_args)
        else:
            star_kwargs = {}
        bamfile = star.align(
            merged_fastq, star_index, n_proc, alignment_directory, **star_kwargs)

        if aws_upload_key:
            log.info('Gzipping merged fastq file.')
            if pigz:
                pigz_zip = "pigz --best -k -f {fname}".format(fname=merged_fastq)
            else:
                pigz_zip = "gzip -kf {fname}".format(fname=merged_fastq)
            pigz_proc = io.ProcessManager(pigz_zip)
            pigz_proc.run_all()
            pigz_proc.wait_until_complete()  # prevents slowing down STAR alignment
            merged_fastq += '.gz'  # reflect gzipped nature of file

            log.info('Uploading gzipped merged fastq file to S3.')
            merge_upload = 'aws s3 mv {fname} {s3link}'.format(
                fname=merged_fastq, s3link=aws_upload_key)
            upload_manager = io.ProcessManager(merge_upload)
            upload_manager.run_all()
        else:
            log.info('Removing merged fastq file for memory management.')
            rm_merged = 'rm %s' % merged_fastq
            io.ProcessManager(rm_merged).run_all()
            upload_manager = None

        return bamfile, upload_manager

    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Create a ReadArray object from an alignment file.

        :param str bamfile: filename of .bam file
        :param str index: directory containing index files
        :param str aws_upload_key: key where aws files should be uploaded
        :param int min_poly_t: minimum number of poly_t nucleotides for a read to be
          valid
        :param max_transcript_length: forwarded to GeneIntervals as
          max_transcript_length
        :returns ReadArray, UploadManager: ReadArray object, bamfile ProcessManager
          (None when the bamfile is removed instead of uploaded)
        """
        log.info('Filtering aligned records and constructing record database.')
        # Construct translator
        translator = GeneIntervals(
            index + 'annotations.gtf', max_transcript_length=max_transcript_length)
        read_array = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        # uploading bamfile to S3, else removing bamfile
        if aws_upload_key:
            log.info('Uploading bam file to S3.')
            upload_bam = 'aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam'.format(
                fname=bamfile, s3link=aws_upload_key, prefix=args.output_prefix)
            print(upload_bam)
            upload_manager = io.ProcessManager(upload_bam)
            upload_manager.run_all()
        else:
            log.info('Removing bamfile for memory management.')
            rm_bamfile = 'rm %s' % bamfile
            io.ProcessManager(rm_bamfile).run_all()
            upload_manager = None
        return read_array, upload_manager

    # ######################## MAIN FUNCTION BEGINS HERE ################################

    log.setup_logger(args.log_name)

    with ec2.instance_clean_up(
            email=args.email, upload=args.upload_prefix, log_name=args.log_name,
            debug=args.debug, terminate=args.terminate
    ):
        pigz, mutt = verify.executables('pigz', 'mutt')
        if mutt:
            log.notify('mutt executable identified, email will be sent when run '
                       'terminates. ')
        else:
            log.notify('mutt was not found on this machine; an email will not be sent to '
                       'the user upon termination of SEQC run.')

        is_ten_x = args.platform in ("ten_x", "ten_x_v2", "ten_x_v3")

        # turn off lower coverage filter for 10x
        if is_ten_x:
            args.filter_low_coverage = False

        # Only 10x scRNA-seq overrides the user supplied value; every other
        # combination (including snRNA-seq) keeps args.max_insert_size.
        max_insert_size = args.max_insert_size
        if args.filter_mode == "scRNA-seq" and is_ten_x:
            # set max_transcript_length (max_insert_size) = 10000
            max_insert_size = 10000
            log.notify("Full length transcripts are used for read mapping in 10x data.")

        log.notify("max_insert_size is set to {}".format(max_insert_size))

        log.args(args)

        output_dir, output_prefix = os.path.split(args.output_prefix)
        if not output_dir:
            output_dir = '.'
        # Re-joining the split pieces keeps the './' form for bare prefixes and
        # avoids duplicating the directory when the prefix already contains one.
        output_stem = output_dir + '/' + output_prefix

        # check if the platform name provided is supported by seqc
        # todo move into verify for run
        platform_name = verify.platform_name(args.platform)
        platform = platforms.AbstractPlatform.factory(platform_name)  # returns platform

        # leave one core free, but never request fewer than one process
        n_processes = max(1, multiprocessing.cpu_count() - 1)

        merge, align, process_bamfile = determine_start_point(args)

        args = download_input(output_dir, args)

        if args.platform == "in_drop_v5":
            platform = platform.build_cb2_barcodes(args.barcode_files)
            log.notify("Built cb2 barcode hash for v5 barcodes.")

        if merge:
            if args.min_poly_t is None:  # estimate min_poly_t if it was not provided
                args.min_poly_t = filter.estimate_min_poly_t(
                    args.barcode_fastq, platform)
                log.notify('Estimated min_poly_t={!s}'.format(args.min_poly_t))

            args.merged_fastq = merge_fastq_files(
                platform, args.barcode_fastq, args.output_prefix, args.genomic_fastq)

        # SEQC was started from input other than fastq files
        if args.min_poly_t is None:
            args.min_poly_t = 0
            log.notify('Warning: SEQC started from step other than unmerged fastq with '
                       'empty --min-poly-t parameter. Continuing with --min-poly-t 0.')

        if align:
            # the merged fastq is only uploaded when it was produced by this run
            upload_merged = args.upload_prefix if merge else None
            args.alignment_file, manage_merged = align_fastq_records(
                args.merged_fastq, output_dir, args.star_args,
                args.index, n_processes, upload_merged)
        else:
            manage_merged = None

        if process_bamfile:
            # the bamfile is only uploaded when it was produced by this run
            upload_bamfile = args.upload_prefix if align else None
            ra, manage_bamfile = create_read_array(
                args.alignment_file, args.index, upload_bamfile, args.min_poly_t,
                max_insert_size)
        else:
            manage_bamfile = None
            ra = ReadArray.load(args.read_array)

        # create the first summary section here
        status_filters_section = Section.from_status_filters(
            ra, 'initial_filtering.html')
        sections = [status_filters_section]

        # Skip over the corrections if read array is specified by the user
        if not args.read_array:

            # Correct barcodes
            log.info('Correcting barcodes and estimating error rates.')
            error_rate = platform.apply_barcode_correction(ra, args.barcode_files)

            # Resolve multimapping
            log.info('Resolving ambiguous alignments.')
            mm_results = ra.resolve_ambiguous_alignments()

            # correct errors
            log.info('Identifying RMT errors.')
            platform.apply_rmt_correction(ra, error_rate)

            # Apply low coverage filter
            if platform.filter_lonely_triplets:
                log.info('Filtering lonely triplet reads')
                ra.filter_low_coverage(alpha=args.low_coverage_alpha)

            log.info('Saving read array.')
            ra.save(args.output_prefix + '.h5')

            # Summary sections
            # create the sections for the summary object
            sections += [
                Section.from_cell_barcode_correction(ra, 'cell_barcode_correction.html'),
                Section.from_rmt_correction(ra, 'rmt_correction.html'),
                Section.from_resolve_multiple_alignments(
                    mm_results, 'multialignment.html')]

        # create a dictionary to store output parameters
        mini_summary_d = dict()

        # filter non-cells
        log.info('Creating counts matrix.')
        sp_reads, sp_mols = ra.to_count_matrix(
            sparse_frame=True, genes_to_symbols=args.index + 'annotations.gtf')

        # Save sparse matrices
        log.info('Saving sparse matrices')
        scipy.io.mmwrite(args.output_prefix + '_sparse_read_counts.mtx', sp_reads.data)
        scipy.io.mmwrite(
            args.output_prefix + '_sparse_molecule_counts.mtx', sp_mols.data)
        # Indices
        df = np.array([np.arange(sp_reads.shape[0]), sp_reads.index]).T
        np.savetxt(
            args.output_prefix + '_sparse_counts_barcodes.csv', df,
            fmt='%d', delimiter=',')
        # Columns
        df = np.array([np.arange(sp_reads.shape[1]), sp_reads.columns]).T
        np.savetxt(
            args.output_prefix + '_sparse_counts_genes.csv', df,
            fmt='%s', delimiter=',')

        log.info('Creating filtered counts matrix.')
        cell_filter_figure = args.output_prefix + '_cell_filters.png'

        # By pass low count filter for mars seq
        sp_csv, total_molecules, molecules_lost, cells_lost, cell_description = (
            filter.create_filtered_dense_count_matrix(
                sp_mols, sp_reads, mini_summary_d, plot=True,
                figname=cell_filter_figure,
                filter_low_count=platform.filter_low_count,
                filter_mitochondrial_rna=args.filter_mitochondrial_rna,
                filter_low_coverage=args.filter_low_coverage,
                filter_low_gene_abundance=args.filter_low_gene_abundance))

        # Output files
        files = [cell_filter_figure,
                 args.output_prefix + '.h5',
                 args.output_prefix + '_sparse_read_counts.mtx',
                 args.output_prefix + '_sparse_molecule_counts.mtx',
                 args.output_prefix + '_sparse_counts_barcodes.csv',
                 args.output_prefix + '_sparse_counts_genes.csv']

        # Summary sections
        # create the sections for the summary object
        sections += [
            Section.from_cell_filtering(cell_filter_figure, 'cell_filtering.html'),
            Section.from_run_time(args.log_name, 'seqc_log.html')]

        # get alignment summary
        alignment_summary_file = output_stem + '_alignment_summary.txt'
        star_final_log = output_dir + '/alignments/Log.final.out'
        if os.path.isfile(star_final_log):
            os.rename(star_final_log, alignment_summary_file)

            # Upload files and summary sections
            files += [alignment_summary_file]
            sections.insert(
                0, Section.from_alignment_summary(
                    alignment_summary_file, 'alignment_summary.html'))

        cell_size_figure = 'cell_size_distribution.png'
        index_section = Section.from_final_matrix(
            sp_csv, cell_size_figure, 'cell_distribution.html')
        seqc_summary = Summary(output_stem + '_summary', sections, index_section)
        seqc_summary.prepare_archive()
        seqc_summary.import_image(cell_filter_figure)
        seqc_summary.import_image(cell_size_figure)
        seqc_summary.render()
        summary_archive = seqc_summary.compress_archive()
        files += [summary_archive]

        # Create a mini summary section
        seqc_mini_summary = MiniSummary(
            args.output_prefix, mini_summary_d, alignment_summary_file,
            cell_filter_figure, cell_size_figure)
        seqc_mini_summary.compute_summary_fields(ra, sp_csv)
        seqc_mini_summary_json, seqc_mini_summary_pdf = seqc_mini_summary.render()
        files += [seqc_mini_summary_json, seqc_mini_summary_pdf]

        # Running MAST for differential analysis
        # file storing the list of differentially expressed genes for each cluster
        de_gene_list_file = run_mast(
            seqc_mini_summary.get_counts_filtered(),
            seqc_mini_summary.get_clustering_result(),
            args.output_prefix)
        files += [de_gene_list_file]

        # adding the cluster column and write down gene-cell count matrix
        dense_csv = args.output_prefix + '_dense.csv'
        sp_csv.insert(loc=0, column='CLUSTER',
                      value=seqc_mini_summary.get_clustering_result())
        sp_csv.to_csv(dense_csv)
        files += [dense_csv]

        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in files:
                try:
                    ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key)
                    item_name = item.split('/')[-1]
                    log.info('Successfully uploaded %s to the specified S3 location '
                             '"%s%s".' % (item, args.upload_prefix, item_name))
                except FileNotFoundError:
                    log.notify('Item %s was not found! Continuing with upload...'
                               % item)

            # block until the background uploads started earlier have finished
            if manage_merged:
                manage_merged.wait_until_complete()
                log.info('Successfully uploaded %s to the specified S3 location "%s"' %
                         (args.merged_fastq, args.upload_prefix))
            if manage_bamfile:
                manage_bamfile.wait_until_complete()
                log.info('Successfully uploaded %s to the specified S3 location "%s"' %
                         (args.alignment_file, args.upload_prefix))

        log.info('SEQC run complete. Cluster will be terminated')

        # upload logs
        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in [args.log_name, './nohup.log']:
                try:
                    # Make a copy of the file with the output prefix. Only the file
                    # name is appended: joining the prefix with './nohup.log'
                    # verbatim would point into a non-existent '<prefix>_.' folder.
                    prefixed_log = args.output_prefix + '_' + os.path.basename(item)
                    copyfile(item, prefixed_log)
                    print(prefixed_log)
                    ec2.Retry(retries=5)(io.S3.upload_file)(prefixed_log, bucket, key)
                    log.info('Successfully uploaded %s to the specified S3 location '
                             '"%s".' % (item, args.upload_prefix))
                except FileNotFoundError:
                    log.notify('Item %s was not found! Continuing with upload...'
                               % item)

        # todo local test does not send this email
        # only attempt to send when an address was supplied
        if mutt and args.email:
            email_body = (
                '<font face="Courier New, Courier, monospace">'
                'SEQC RUN COMPLETE.\n\n'
                'The run log has been attached to this email and '
                'results are now available in the S3 location you specified: '
                '"%s"\n\n' % args.upload_prefix)
            email_body = email_body.replace('\n', '<br>').replace('\t', ' ')
            email_user(summary_archive, email_body, args.email)
def index(args):
    """Build a SEQC index and, when requested, upload the resulting log files.

    Logging is configured to write to both ``args.log_name`` and stdout. The index
    is created inside ``ec2.instance_clean_up`` so that the clean-up handler wraps
    the whole operation, including the log upload.

    :param args: parsed arguments. This function is only called if subprocess_name
      is 'index'
    """
    # functions to be pickled and run remotely must import all their own modules
    import sys
    import logging
    from seqc import ec2, log, io
    from seqc.sequence.index import Index
    from seqc.alignment import star
    from seqc import version

    # send log records to the log file and to the console
    log_handlers = [
        logging.FileHandler(args.log_name),
        logging.StreamHandler(sys.stdout),
    ]
    logging.basicConfig(level=logging.DEBUG, handlers=log_handlers)

    seqc_banner = "SEQC=v{}".format(version.__version__)
    log.info(seqc_banner)
    star_banner = "STAR=v{}".format(star.get_version())
    log.info(star_banner)
    log.args(args)

    cleanup_options = dict(
        email=args.email,
        upload=args.upload_prefix,
        log_name=args.log_name,
        debug=args.debug,
        terminate=args.terminate,
        running_remote=args.remote,
    )

    with ec2.instance_clean_up(**cleanup_options):

        Index(args.organism, args.ids, args.folder).create_index(
            s3_location=args.upload_prefix,
            ensemble_release=args.ensemble_release,
            read_length=args.read_length,
            valid_biotypes=args.valid_biotypes,
        )

        # upload the log file (seqc_log.txt, nohup.log, Log.out)
        if args.upload_prefix:
            bucket, key = io.S3.split_link(args.upload_prefix)
            log_files = (args.log_name, "./nohup.log", "./Log.out")
            for log_file in log_files:
                try:
                    upload = ec2.Retry(retries=5)(io.S3.upload_file)
                    upload(log_file, bucket, key)
                    uploaded_msg = "Successfully uploaded {} to {}".format(
                        log_file, args.upload_prefix
                    )
                    log.info(uploaded_msg)
                except FileNotFoundError:
                    # a missing log is reported but does not stop the other uploads
                    missing_msg = (
                        "Item {} was not found! Continuing with upload...".format(
                            log_file
                        )
                    )
                    log.notify(missing_msg)

        log.info("DONE.")
def in_drop(ra, platform, barcode_files, max_ed=2, default_error_rate=0.02):
    """Correct reads with incorrect barcodes according to the correct barcodes files.

    Reads with barcodes that have too many errors are filtered out. While
    correcting, per-base substitution counts are collected and turned into an
    error-rate table.

    :param ra: seqc.read_array.ReadArray object
    :param platform: the platform object
    :param barcode_files: the list of the paths of barcode files
    :param max_ed: maximum allowed Hamming distance from known cell barcodes
    :param default_error_rate: assumed sequencing error rate, used when no errors
      were observed at all or when a base was never observed
    :return: (err_rate, None), where err_rate maps (correct base, observed base)
      pairs of DNA3Bit codes to the estimated substitution rate
    """
    # Read the barcodes into lists
    valid_barcodes = []
    for barcode_file in barcode_files:
        with open(barcode_file, "r") as f:
            valid_barcodes.append({DNA3Bit.encode(line.strip()) for line in f})

    # Containers
    num_barcodes = platform.num_barcodes
    correct = [None] * num_barcodes
    edit_dist = [None] * num_barcodes

    # Error table container: one counter per ordered (correct, observed) base pair
    errors = list(permutations(DNA3Bit.bin2strdict.keys(), r=2))
    error_table = dict(zip(errors, np.zeros(len(errors))))
    cor_instance_table = dict(
        zip(DNA3Bit.bin2strdict.keys(), np.zeros(len(DNA3Bit.bin2strdict))))

    # The barcode has to be an exact match when no edits are tolerated
    exact_match = max_ed == 0

    # Group reads by cells
    indices_grouped_by_cells = ra.group_indices_by_cell(multimapping=True)

    for inds in indices_grouped_by_cells:

        # Extract barcodes for one of the reads
        barcodes = platform.extract_barcodes(ra.data["cell"][inds[0]])

        # Identify correct barcode
        for i in range(num_barcodes):
            correct[i], edit_dist[i] = seqc.sequence.barcodes.find_correct_barcode(
                barcodes[i], valid_barcodes[i], exact_match)

        # 1. If all edit distances are 0, barcodes are correct,
        #    update the correct instance table
        # 2. Correct any barcodes within permissible edit distance,
        #    update the correct instance table for non-errored bases,
        #    update error table for the errored bases
        # 3. Mark the uncorrectable barcodes as cell errors

        if all(np.array(edit_dist) == 0):
            # Temp container to increment the correct instance counter
            tmp_bc = DNA3Bit.ints2int(barcodes)
            while tmp_bc > 0:
                cor_instance_table[tmp_bc & 0b111] += 1
                tmp_bc >>= 3

        elif max(edit_dist) > max_ed:
            ra.data["status"][inds] |= ra.filter_codes["cell_error"]
            continue

        else:
            # These barcodes can be corrected, Count the number of correct bases
            # Update the error table if there was only one error across the barcodes
            tmp_bc = DNA3Bit.ints2int(barcodes)
            tmp_cor = DNA3Bit.ints2int(correct)

            # Update the read array with the correct barcode
            ra.data["cell"][inds] = tmp_cor

            # Iterating through the sequences, one 3-bit base at a time
            while tmp_bc > 0:
                if tmp_bc & 0b111 == tmp_cor & 0b111:
                    cor_instance_table[tmp_bc & 0b111] += 1
                elif sum(edit_dist) == 1:
                    error_table[(tmp_cor & 0b111, tmp_bc & 0b111)] += 1
                tmp_bc >>= 3
                tmp_cor >>= 3

    # Create error rate table
    if sum(error_table.values()) == 0:
        # Nothing to estimate from: return the uniform default announced in the
        # log message instead of overwriting it with zeros.
        log.info("No errors were detected or barcodes do not support error "
                 "correction, using %f uniform error chance." % default_error_rate)
        return dict(zip(errors, [default_error_rate] * len(errors))), None

    # Total number of substitutions observed for each correct base
    errors_from_base = {}
    for (cor_base, _), n in error_table.items():
        errors_from_base[cor_base] = errors_from_base.get(cor_base, 0) + n

    err_rate = dict(zip(errors, [0.0] * len(errors)))
    for k, v in error_table.items():
        if DNA3Bit.decode(k[0]) in b"Nn":
            continue
        observations = errors_from_base[k[0]] + cor_instance_table[k[0]]
        # The counters are numpy floats, so dividing by zero would yield nan
        # rather than raise ZeroDivisionError; test the denominator explicitly.
        if observations == 0:
            log.info(
                "Warning: too few reads to estimate error rate for %s, setting "
                "default rate of %f" %
                (str((DNA3Bit.decode(k[0]), DNA3Bit.decode(k[1]))),
                 default_error_rate))
            err_rate[k] = default_error_rate
        else:
            err_rate[k] = v / observations

    return err_rate, None