def check_target_blat(self):
    """Determine whether the target blat hit explains most of the query sequence or
    whether the contig needs to be realigned against the whole genome.
    """
    hit = False
    utils.log(self.logging_name, 'info', 'Checking if target blat contains most of query or if whole genome needs to be queried.')
    if not self.realign_results.has_results:
        self.query_res_fn = None
        hit = True
    else:
        self.realign_results.write_mod_result_file(self.query_res_fn + '.mod')
        if self.realign_results.target_hit():
            hit = True
            utils.log(self.logging_name, 'debug', 'Top hit contains whole query sequence, indel variant')
            self.query_res_fn += '.mod'
    return hit

def set_reference_kmers(self, targetRefFns):
    """Set the reference sequence kmers."""
    self.kmers['ref'] = {}
    for targetRefFn in targetRefFns:
        utils.log(self.loggingName, 'info', 'Indexing kmers for reference sequence %s' % targetRefFn)
        self.get_kmers(targetRefFn, self.kmers['ref'])

def which_rearr(self, varReads, tcoords, qcoords, strands, brkpts):
    """Classify the rearrangement type from the realigned segment coordinates and strands."""
    rearrValues = {'discReadCount': None, 'svType': 'rearrangement', 'svSubType': None, 'hit': False}
    if not self.check_overlap(tcoords[0], tcoords[1]):
        utils.log(self.loggingName, 'debug', 'Checking rearrangement svType, strand1 %s, strand2 %s, breakpt1 %d, breakpt2 %d' % (strands[0], strands[1], brkpts[0], brkpts[1]))
        if strands[0] != strands[1]:
            # Inversion: the realigned segments are on opposite strands.
            utils.log(self.loggingName, 'debug', 'Inversion event identified.')
            rearrValues['hit'] = True
            rearrValues['svSubType'] = 'inversion'
            rearrValues['discReadCount'] = varReads.check_inv_readcounts(brkpts)
        else:
            tgap = brkpts[1] - brkpts[0]
            qgap = qcoords[1][0] - qcoords[0][1]
            if tgap < 0:
                utils.log(self.loggingName, 'debug', 'Tandem duplication event identified.')
                rearrValues['hit'] = True
                rearrValues['svSubType'] = 'tandem_dup'
                rearrValues['discReadCount'] = varReads.check_td_readcounts(brkpts)
            elif tgap > qgap:
                # Gapped deletion from Blast result.
                utils.log(self.loggingName, 'debug', 'Deletion event identified.')
                rearrValues['hit'] = True
                rearrValues['svType'] = 'indel'
                rearrValues['indelSize'] = 'D' + str(tgap)
            else:
                # Gapped insertion from Blast result.
                utils.log(self.loggingName, 'debug', 'Insertion event identified.')
                rearrValues['hit'] = True
                rearrValues['svType'] = 'indel'
                rearrValues['indelSize'] = 'I' + str(qgap)
    return rearrValues

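# Illustrative sketch (not part of BreaKmer): the same-strand branch of which_rearr()
# classifies an event from the target-coordinate gap (tgap) and the query-coordinate gap
# (qgap) between the two realigned segments. A negative tgap means the second segment
# realigns upstream of the first (tandem duplication); otherwise the larger gap wins.
# The helper name below is hypothetical.
def classify_same_strand_gaps(tgap, qgap):
    """Mirror of the tgap/qgap decision in which_rearr()."""
    if tgap < 0:
        return 'rearrangement', 'tandem_dup', None
    elif tgap > qgap:
        return 'indel', None, 'D' + str(tgap)  # reference bases missing from the contig
    else:
        return 'indel', None, 'I' + str(qgap)  # extra contig bases between the segments

# Example: segments 100 bp apart on the target but adjacent in the query -> 100 bp deletion.
assert classify_same_strand_gaps(100, 0) == ('indel', None, 'D100')
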
def resolve_sv(self):
    """Resolve structural variant calls from the assembled contigs."""
    utils.log(self.logging_name, 'info', 'Resolving structural variants from %d kmer clusters' % len(self.contigs))
    self.results = self.call_manager.resolve_sv_calls(self.contigs, self.files['target_ref_fn'][0], self.get_values(), self.disc_reads)

def set_values(self, contigId, params, queryRegionValues, contigPath, readVariation):
    """Set the contig values after the contig has been completed and is ready for realignment.

    Args:
        contigId: String containing the contig ID.
        params: Param object.
        queryRegionValues: Tuple containing the target region information.
        contigPath: String of the path to the contig directory to store files.
        readVariation: Read variation data associated with the contig.
    Returns:
        None
    """
    self.params = params
    self.id = contigId
    self.readVariation = readVariation
    self.chr = queryRegionValues[0]
    self.start = int(queryRegionValues[1])
    self.end = int(queryRegionValues[2])
    self.targetName = queryRegionValues[3]
    self.regionBuffer = queryRegionValues[5]
    self.path = os.path.join(contigPath, self.id)
    utils.log(self.loggingName, 'info', 'Setting up contig path %s' % self.path)
    if not os.path.exists(self.path):
        os.makedirs(self.path)
    self.fq_fn = os.path.join(contigPath, self.id, self.id + '.fq')
    self.fa_fn = os.path.join(contigPath, self.id, self.id + '.fa')

def check_indel(self, nBlatResults):
    indel = False
    utils.log(self.loggingName, 'info', 'Checking if blat result contains an indel variant')
    if (self.spans_query() or (nBlatResults == 1 and self.in_target)) and (self.ngaps > 0):
        utils.log(self.loggingName, 'info', 'Blat result spans query (%r) or is the only blat result (%r) and is in the target region (%r)' % (self.spans_query(), nBlatResults == 1, self.in_target))
        indel = True
    return indel

def get_brkpt_counts_filt(self, brkpts, sv_type):
    """Tally read and kmer support around each breakpoint and flag low-complexity breakpoints."""
    avg_comp, comp_vec = utils.calc_contig_complexity(self.contig_seq)
    brkpt_rep_filt = False
    brkpt_counts = {'n': [], 'd': [], 'b': []}
    brkpt_kmers = []
    for qb in brkpts['q'][1]:
        left_idx = qb[0] - min(qb[1], 5)
        right_idx = qb[0] + min(qb[2], 5)
        bc = self.contig_rcounts.get_counts(left_idx, right_idx, sv_type)
        brkpt_counts['n'].append(min(bc))
        brkpt_counts['d'].append(min(self.contig_rcounts.get_counts((qb[0] - 1), (qb[0] + 1), sv_type)))
        brkpt_counts['b'].append(self.contig_rcounts.get_counts(qb[0], qb[0], sv_type))
        brkpt_kmers.append(self.contig_kmer_locs[qb[0]])
        brkpt_rep_filt = brkpt_rep_filt or (comp_vec[qb[0]] < (avg_comp / 2))
        utils.log(self.logging_name, 'debug', 'Read count around breakpoint %d: %s' % (qb[0], ",".join([str(x) for x in bc])))
        utils.log(self.logging_name, 'debug', 'Kmer count around breakpoints %s' % (",".join([str(x) for x in brkpt_kmers])))
    brkpt_rep_filt = brkpt_rep_filt or (len(filter(lambda x: x, brkpts['f'])) > 0)
    return brkpt_counts, brkpt_kmers, brkpt_rep_filt

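# Illustrative sketch (not part of BreaKmer): the repeat filter above flags a breakpoint
# whose local sequence complexity drops below half the contig-wide average. Assuming a
# per-position complexity vector like the one calc_contig_complexity() returns, the flag
# reduces to:
def breakpoint_in_low_complexity(comp_vec, avg_comp, brkpt_positions):
    """True if any breakpoint sits at a position with < 50% of the average complexity."""
    return any(comp_vec[pos] < (avg_comp / 2.0) for pos in brkpt_positions)

# Example: average complexity 80; a breakpoint at index 2 where the local value is 30 is flagged.
assert breakpoint_in_low_complexity([80, 75, 30, 82], 80.0, [2]) is True
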
def analyze_targets(targetList):
    """Analyze a list of targets.

    A list of TargetManager objects is passed in to be analyzed independently.
    Each target's reference data is set, if necessary, then the reads are extracted,
    contigs built, and calls made. This function performs all the top-level functions
    on the target regions being analyzed.

    Args:
        targetList (list): A list of TargetManager objects, representing target regions.
    Returns:
        aggregateResults (dict): A dictionary containing lists of formatted output strings
                                 for the contig-based calls and the discordant-only read clusters.
    Raises:
        None
    """
    # Formatted output strings for contig-based calls and discordant read calls differ.
    aggregateResults = {'contigs': [], 'discreads': []}
    for targetRegion in targetList:
        utils.log('breakmer.processor.analysis', 'info', 'Analyzing %s' % targetRegion.name)
        targetRegion.set_ref_data()
        if targetRegion.fnc == 'prepare_reference_data':
            # Stop here if only preparing reference data.
            continue
        if not targetRegion.find_sv_reads():
            # No SV reads were extracted; move on to the next target.
            continue
        targetRegion.compare_kmers()  # Perform kmer subtraction.
        targetRegion.resolve_sv()  # Assemble extracted reads and make calls.
        if targetRegion.has_results():
            outputs = targetRegion.get_formatted_output()
            for key in outputs:
                aggregateResults[key].extend(outputs[key])
        targetRegion.complete_analysis()  # Write results out to file.
    return aggregateResults

def realignment(self, contig, target_ref_fa_fn, target_region_values):
    """Realign the contig sequence against the target region and, if needed, the whole genome."""
    if contig.contig_fa_fn is None:
        return
    self.query_res_fn = os.path.join(contig.file_path, 'blat_res.target.psl')
    realign_dict = {'binary': self.params.get_param('blat'),
                    'database': target_ref_fa_fn}
    # Run blat against the target reference sequence first for speed.
    utils.run_blat(realign_dict, self.query_res_fn, contig.contig_fa_fn, 'target')
    if self.query_res_fn is None:
        utils.log(self.logging_name, 'info', 'No blat results file %s, no calls for %s.' % (self.query_res_fn, contig.contig_id))
        return
    self.realign_results = RealignResultSet(self.params, self.query_res_fn, target_region_values, 'target')
    if not self.check_target_blat():
        # Blat against the whole-genome reference fasta.
        realign_dict = {'binary': self.params.get_param('gfclient'),
                        'blat_port': self.params.get_param('blat_port'),
                        'database': self.params.get_param('reference_fasta_dir')}
        self.query_res_fn = os.path.join(contig.file_path, 'blat_res.genome.psl')
        utils.run_blat(realign_dict, self.query_res_fn, contig.contig_fa_fn, 'genome')
        self.realign_results = RealignResultSet(self.params, self.query_res_fn, target_region_values, 'genome')
    return self.realign_results

def target_hit(self):
    """Determine whether the top blat result sufficiently explains the query sequence."""
    # Top hit spans the whole query, or there is a single hit with >= 90% query coverage.
    indel_hit = self.blat_results[0][3].spans_query() or (len(self.blat_results) == 1 and self.get_query_coverage() >= 90.0)
    utils.log(self.logging_name, 'debug', 'Checking if query is a target hit or not %r' % indel_hit)
    return indel_hit

def set_params(self):
    """Organize and format all input parameters into class variables to access later.

    Specific instances of parameters are checked and set. All the parameters that are set
    are logged. The target objects are set along with the paths.

    Args:
        None
    Returns:
        None
    Raises:
        None
    """
    log_msgs = self.parse_opts()  # Parse the config file and command-line parameters into the self.opts dictionary.
    utils.setup_logger(self.get_param('analysis_dir', True), 'breakmer')  # Create logging object.
    # Log the parameter setup after the logging object is created; a missing required
    # parameter can then be logged.
    utils.log(self.logging_name, 'info', 'Setting up parameters')
    self.check_required_params()
    # Log all parameters passed in; warn for poor paths.
    for param_key, param_value in self.opts.items():
        utils.log(self.logging_name, 'info', '%s = %s' % (param_key, param_value))
    # Log parameters overwritten by configuration file input values.
    for log_msg in log_msgs:
        utils.log(self.logging_name, log_msg[0], log_msg[1])
    self.set_targets()
    self.gene_annotations = Anno()
    self.gene_annotations.add_genes(self.get_param('gene_annotation_file'))
    self.paths['ref_data'] = os.path.abspath(os.path.normpath(self.opts['reference_data_dir']))  # Path to target reference sequence fasta files.
    self.set_param('reference_fasta_dir', os.path.split(self.opts['reference_fasta'])[0])  # Path to genome fasta file.
    # Set up directories.
    self.paths['analysis'] = os.path.abspath(os.path.normpath(self.opts['analysis_dir']))
    self.paths['output'] = os.path.join(self.paths['analysis'], 'output')
    if 'targets_dir' in self.opts:
        self.paths['targets'] = os.path.abspath(os.path.normpath(self.opts['targets_dir']))
    else:
        self.paths['targets'] = os.path.join(self.paths['analysis'], 'targets')
    # Create all the paths.
    for path in self.paths:
        utils.log(self.logging_name, 'info', 'Creating %s directory (%s)' % (path, self.paths[path]))
        if not os.path.exists(self.paths[path]):
            os.makedirs(self.paths[path])
    # If starting the blat server then return.
    if self.fnc_cmd == 'start_blat_server':
        utils.log(self.logging_name, 'info', 'Starting the blat server.')
        return
    self.check_binaries()  # Check that Jellyfish and Cutadapt work.
    self.set_insertsize_thresh()  # Set the expected insert size threshold from the properly mapped reads.

def minseq_complexity(self, seq, N):
    """Compute the percentage of unique N-mers out of all N-mer windows in seq."""
    utils.log(self.logging_name, 'debug', 'Checking sequence complexity of blat result segment %s using %d-mers' % (seq, N))
    nmers = {}
    total_possible = len(seq) - (N - 1)  # Number of N-mer windows in seq (was hardcoded for N=3).
    for i in range(len(seq) - (N - 1)):
        nmers[str(seq[i:i + N]).upper()] = True
    complexity = round((float(len(nmers)) / float(total_possible)) * 100, 4)
    utils.log(self.logging_name, 'debug', 'Complexity measure %f, based on %d unique %d-mers observed out of a total of %d %d-mers possible' % (complexity, len(nmers), N, total_possible, N))
    return complexity

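# Illustrative sketch (not part of BreaKmer): minseq_complexity() measures the fraction
# of N-mer windows in a sequence that are unique, so a homopolymer scores low and a
# diverse sequence scores high. The same arithmetic as a standalone function:
def nmer_complexity(seq, n):
    windows = len(seq) - (n - 1)
    unique = len(set(seq[i:i + n].upper() for i in range(windows)))
    return round((float(unique) / float(windows)) * 100, 4)

# 'AAAAAA' has 1 unique 3-mer in 4 windows; 'ACGTAC' has 4 unique 3-mers in 4 windows.
assert nmer_complexity('AAAAAA', 3) == 25.0
assert nmer_complexity('ACGTAC', 3) == 100.0
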
def set_sample_kmers(self):
    """Set the sample kmers."""
    utils.log(self.loggingName, 'info', 'Indexing kmers for sample sequence %s' % self.files['sv_cleaned_fq'])
    self.kmers['case'] = {}
    self.kmers['case_sc'] = {}
    self.get_kmers(self.files['sv_cleaned_fq'], self.kmers['case'])
    self.get_kmers(self.files['sv_sc_unmapped_fa'], self.kmers['case_sc'])

def get_target_intervals(self, targetName):
    """Return the stored intervals for a specific target."""
    if targetName in self.targets:
        return self.targets[targetName]
    else:
        utils.log(self.loggingName, 'debug', '%s target name not in target dictionary.' % targetName)
        sys.exit(1)

def finalize_contigs(self):
    """Write out the values for each assembled contig."""
    utils.log(self.logging_name, 'info', 'Finalizing %d assembled contigs' % len(self.contigs))
    for contig_iter, assembled_contig in enumerate(self.contigs):
        utils.log(self.logging_name, 'info', 'Finalizing contig %s' % assembled_contig.seq.value)
        contig_id = self.name + '-contig' + str(contig_iter + 1)
        assembled_contig.write_contig_values(contig_id, self.files['kmer_clusters'], self.paths['contigs'])

def write_result(self, svEventResult, outputPath):
    resultFn = os.path.join(self.path, self.id + "_svs.out")
    utils.log(self.loggingName, 'info', 'Writing %s result file %s' % (self.id, resultFn))
    resultFile = open(resultFn, 'w')
    # A string of output values for writing to file.
    headerStr, formattedResultValuesStr = svEventResult.get_formatted_output_values()
    resultFile.write(headerStr + '\n' + formattedResultValuesStr + '\n')
    resultFile.close()
    shutil.copyfile(resultFn, os.path.join(outputPath, self.id + "_svs.out"))

def set_targets(self):
    """Parse the targets bed file and store the regions in a dictionary. Limit to a gene list if input.

    A list of genes can be passed in by the user to limit the analysis. This will limit
    which targets are stored in the dictionary as the target bed file is parsed. The
    target bed file is a tab-delimited text file that should have at minimum four columns
    (chromosome, start, end, name), with an optional fifth column containing a coding
    feature (i.e., exon or intron). Each row is either a tiled region with sequencing
    coverage or a region to analyze by BreaKmer.

    The name can be applied to multiple rows; if multiple tiled regions are input with
    the same name, they are aggregated together under the same key.

    The target information is stored in the self.targets dictionary with the name as the
    key and a list of tuples of interval genomic locations as the values:
        self.targets[gene_name] = [(chrom, start_bp, end_bp, name, feature), ...]

    Args:
        None
    Returns:
        None
    Raises:
        None
    """
    # Get the gene list file path if it exists.
    gene_list = self.get_param('gene_list')
    region_list = None
    if gene_list:
        region_list = []
        # Each line contains a gene name.
        for line in open(gene_list, 'r'):
            region_list.append(line.strip().upper())
    utils.log(self.logging_name, 'info', 'Parsing target list')
    # TODO: Check to make sure there aren't duplicate genes.
    with open(self.get_param('targets_bed_file'), 'rU') as target_bed_file:
        for target in target_bed_file:
            # Each target is formatted like a bed row: chrm bp1 bp2 name
            target = target.strip()
            targetsplit = target.split()
            chrm, bp1, bp2, name = targetsplit[0:4]
            if region_list and name.upper() not in region_list:
                continue
            # Allow a fifth column indicating the type of region, typically an
            # exon/intron designation. This will be deprecated.
            feature = None if len(targetsplit) <= 4 else targetsplit[4]
            self.targets.setdefault(name.upper(), [])
            self.targets[name.upper()].append((chrm, int(bp1), int(bp2), name, feature))
    utils.log(self.logging_name, 'info', '%d targets' % len(self.targets))

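# Illustrative sketch (not part of BreaKmer): given a targets bed row such as
#     chr13    32889610    32890004    BRCA2    exon
# (coordinates are made up for the example), set_targets() aggregates rows sharing a
# name under one upper-cased key:
#     self.targets['BRCA2'] = [('chr13', 32889610, 32890004, 'BRCA2', 'exon'), ...]
# The per-row parsing reduces to:
def parse_target_row(line):
    fields = line.strip().split()
    chrm, bp1, bp2, name = fields[0:4]
    feature = None if len(fields) <= 4 else fields[4]
    return name.upper(), (chrm, int(bp1), int(bp2), name, feature)

key, interval = parse_target_row("chr13\t32889610\t32890004\tBRCA2\texon")
assert key == 'BRCA2' and interval[4] == 'exon'
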
def which_rearr(self, disc_reads, brkpts, strands, tcoords, qcoords):
    """Classify the rearrangement type and count the supporting discordant read pairs."""
    rearr_values = {'disc_read_count': None, 'sv_type': 'rearrangement', 'sv_subtype': None, 'hit': False}
    if not self.check_overlap(tcoords[0], tcoords[1]):
        utils.log(self.logging_name, 'debug', 'Checking rearrangement svType, strand1 %s, strand2 %s, breakpt1 %d, breakpt2 %d' % (strands[0], strands[1], brkpts[0], brkpts[1]))
        if strands[0] != strands[1]:
            # Inversion: count discordantly mapped read pairs around the breakpoints.
            utils.log(self.logging_name, 'debug', 'Inversion event identified.')
            rearr_values['hit'] = True
            rearr_values['sv_subtype'] = 'inversion'
            rearr_values['disc_read_count'] = 0
            brkpt1 = min(brkpts)
            brkpt2 = max(brkpts)
            bp_buffer = 20
            for read_pair in disc_reads['inv']:
                r1p, r2p, r1s, r2s, qname = read_pair
                if r1s == 1 and r2s == 1:
                    if (r1p <= (brkpt1 + bp_buffer)) and (r2p <= (brkpt2 + bp_buffer) and r2p >= (brkpt1 - bp_buffer)):
                        rearr_values['disc_read_count'] += 1
                else:
                    if (r1p <= (brkpt2 + bp_buffer) and r1p >= (brkpt1 - bp_buffer)) and r2p >= (brkpt2 - bp_buffer):
                        rearr_values['disc_read_count'] += 1
        else:
            tgap = brkpts[1] - brkpts[0]
            qgap = qcoords[1][0] - qcoords[0][1]
            brkpt1 = min(brkpts)
            brkpt2 = max(brkpts)
            bp_buffer = 20
            if tgap < 0:
                utils.log(self.logging_name, 'debug', 'Tandem duplication event identified.')
                rearr_values['hit'] = True
                rearr_values['sv_subtype'] = 'tandem_dup'
                rearr_values['disc_read_count'] = 0
                for read_pair in disc_reads['td']:
                    r1p, r2p, r1s, r2s, qname = read_pair
                    disc_read_check = (r1p >= (brkpt1 - bp_buffer) and r1p <= (brkpt2 + bp_buffer)) and (r2p <= (brkpt2 + bp_buffer) and r2p >= (brkpt1 - bp_buffer))
                    if disc_read_check:
                        rearr_values['disc_read_count'] += 1
            elif tgap > qgap:
                # Gapped deletion from Blast result.
                utils.log(self.logging_name, 'debug', 'Deletion event identified.')
                rearr_values['hit'] = True
                rearr_values['sv_type'] = 'indel'
                rearr_values['indelSize'] = 'D' + str(tgap)
            else:
                # Gapped insertion from Blast result.
                utils.log(self.logging_name, 'debug', 'Insertion event identified.')
                rearr_values['hit'] = True
                rearr_values['sv_type'] = 'indel'
                rearr_values['indelSize'] = 'I' + str(qgap)
    return rearr_values

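# Illustrative sketch (not part of BreaKmer): the tandem-duplication counting loop above
# keeps a read pair only when both mates fall inside the buffered breakpoint window
# [brkpt1 - bp_buffer, brkpt2 + bp_buffer]. Reduced to a standalone predicate:
def pair_supports_td(r1p, r2p, brkpt1, brkpt2, bp_buffer=20):
    lo, hi = brkpt1 - bp_buffer, brkpt2 + bp_buffer
    return (lo <= r1p <= hi) and (lo <= r2p <= hi)

# Example: breakpoints at 1000 and 1500; mates at 990 and 1510 are within the 20 bp buffer.
assert pair_supports_td(990, 1510, 1000, 1500) is True
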
def check_read_strands(self):
    """Check whether all reads in the contig are on the same strand."""
    same_strand = False
    strands = []
    for read in self.contig_reads:
        strand = read.id.split("/")[1]
        strands.append(strand)
    if len(set(strands)) == 1:
        same_strand = True
    utils.log(self.logging_name, 'debug', 'Checking read strands for contig reads %s' % (",".join([read.id for read in self.contig_reads])))
    utils.log(self.logging_name, 'debug', 'Reads are on same strand: %r' % same_strand)
    return same_strand

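# Illustrative sketch (not part of BreaKmer): the check above keys off the '/1' or '/2'
# suffix of each read ID, so it assumes reads are named with a mate suffix. The
# uniformity test reduces to:
def reads_same_strand(read_ids):
    return len(set(read_id.split("/")[1] for read_id in read_ids)) == 1

assert reads_same_strand(['r1/1', 'r2/1']) is True
assert reads_same_strand(['r1/1', 'r2/2']) is False
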
def filter_rearr(self, query_region, params, brkpts, brkpt_counts, brkpt_kmers, rearr_type, disc_read_count):
    in_ff, span_ff = utils.filter_by_feature(brkpts, query_region, params.get_param('keep_intron_vars'))
    filter = ((min(brkpt_counts['n']) < params.get_param('rearr_sr_thresh')) or
              (self.br_sorted[0][1] < params.get_param('rearr_minseg_len')) or
              (in_ff and span_ff) or
              (disc_read_count < 1) or
              (rearr_type == 'rearrangement') or
              (min(brkpt_kmers) == 0))
    utils.log(self.logging_name, 'info', 'Check filter for rearrangement')
    utils.log(self.logging_name, 'info', 'Filter by feature for being in exon (%r) or spanning exon (%r)' % (in_ff, span_ff))
    utils.log(self.logging_name, 'info', 'Split read threshold %d, breakpoint read counts %d' % (params.get_param('rearr_sr_thresh'), min(brkpt_counts['n'])))
    utils.log(self.logging_name, 'info', 'Minimum segment length observed (%d) meets threshold (%d)' % (self.br_sorted[0][1], params.get_param('rearr_minseg_len')))
    utils.log(self.logging_name, 'info', 'Minimum discordant read pairs for rearrangement (%d)' % disc_read_count)
    return filter

def format_rearrangement_values(self, svEvent):
    """Format the rearrangement event values from the realignment results."""
    utils.log(self.loggingName, 'info', 'Resolving SV calls from blat results')
    # Sort the stored blat results by the number of matches to the reference sequence.
    blatResSorted = sorted(svEvent.blatResults, key=lambda x: x[0])
    resultValid = {'valid': True, 'repeatValid': True}
    maxRepeat = 0.0
    self.totalMatching = []
    self.repeatOverlapPercent = []
    self.realignmentUniqueness = []
    self.genes = []
    self.alignCigar = []
    self.strands = []
    self.totalMismatches = []
    for i, blatResultTuple in enumerate(blatResSorted):
        blatResult = blatResultTuple[1]
        resultValid['valid'] = resultValid['valid'] and blatResult.valid
        maxRepeat = max(maxRepeat, blatResult.repeat_overlap)
        self.repeatOverlapPercent.append(blatResult.repeat_overlap)
        self.realignmentUniqueness.append(blatResult.alignFreq)
        self.totalMatching.append(blatResult.get_nmatch_total())
        self.genes.append(blatResult.get_gene_anno())
        self.alignCigar.append(blatResult.cigar)
        self.strands.append(blatResult.strand)
        self.totalMismatches.append(blatResult.get_nmatches('mismatch'))
        svEvent.brkpts.update_brkpt_info(blatResult, i, i == (len(blatResSorted) - 1))
    # Sort the blatResultsSorted list from the lowest matching result to the highest.
    svEvent.blatResultsSorted = sorted(svEvent.blatResultsSorted, key=lambda x: x[1])
    if svEvent.brkpts.diff_chr():
        # Breakpoints on different chromosomes indicate a translocation event.
        svEvent.set_brkpt_counts('trl')
        self.discReadCount = svEvent.get_disc_read_count()
        self.svType = 'rearrangement'
        self.svSubtype = 'trl'
        self.filterValues.set_trl_values(svEvent)
    else:
        svEvent.set_brkpt_counts('rearr')
        self.svType, self.svSubtype, self.discReadCount = svEvent.define_rearr()
        self.genes = list(set(self.genes))
        self.description = svEvent.rearrDesc
        self.filterValues.set_rearr_values(svEvent)
    self.realignmentUniqueness = self.filterValues.realignFreq
    self.targetName = svEvent.contig.get_target_name()
    self.fullBreakpointStr = svEvent.get_brkpt_str('all')
    self.targetBreakpointStr = svEvent.get_brkpt_str('target')
    self.breakpointCoverageDepth = svEvent.get_brkpt_depths()
    self.splitReadCount = svEvent.get_splitread_count()
    self.contigSeq = svEvent.get_contig_seq()
    self.contigId = svEvent.get_contig_id()

def call_svs(self):
    """Make variant calls from the realignment results."""
    if not self.realignment.has_results():
        utils.log(self.loggingName, 'info', 'No blat results file exists, no calls for %s.' % self.contig.get_id())
    else:
        utils.log(self.loggingName, 'info', 'Making variant calls from blat results %s' % self.realignment.get_result_fn())
        if self.check_indels():
            self.svEvent.format_indel_values()
        elif self.check_svs():
            self.svEvent.format_rearr_values()
    return self.svEvent

def set_params(self, arguments):
    """Organize and format all input parameters into class variables to access later.

    Specific instances of parameters are checked and set. All parameters that are set
    are logged. The target objects are set along with the paths.

    Args:
        arguments (dict): The argparse dictionary object from the command line options.
    Returns:
        None
    Raises:
        None
    """
    self.parse_opts(arguments)  # Parse the config file and command-line parameters into the self.opts dictionary.
    utils.setup_logger(self.get_param('analysis_dir', True), 'breakmer')  # Create logging object.
    utils.log(self.loggingName, 'info', 'Setting up parameters')
    # Log all parameters passed in; warn for poor paths.
    for paramKey, paramValue in self.opts.items():
        utils.log(self.loggingName, 'info', '%s = %s' % (paramKey, paramValue))
    self.set_targets()
    self.paths['ref_data'] = os.path.abspath(os.path.normpath(self.opts['reference_data_dir']))  # Path to target reference sequence fasta files.
    self.set_param('reference_fasta_dir', os.path.split(self.opts['reference_fasta'])[0])  # Path to genome fasta file.
    # If only preparing the reference data, there is no need to continue.
    if self.fncCmd == 'prepare_reference_data':
        self.set_insertsize_thresh()  # Set the expected insert size threshold from the properly mapped read pairs.
        utils.log(self.loggingName, 'info', 'Preset reference data option set! Only the reference data directory will be set up.')
        return
    # Set up directories.
    self.paths['analysis'] = os.path.abspath(os.path.normpath(self.opts['analysis_dir']))
    self.paths['output'] = os.path.join(self.paths['analysis'], 'output')
    if 'targets_dir' in self.opts:
        self.paths['targets'] = os.path.abspath(os.path.normpath(self.opts['targets_dir']))
    else:
        self.paths['targets'] = os.path.join(self.paths['analysis'], 'targets')
    # Create all the paths.
    for path in self.paths:
        utils.log(self.loggingName, 'info', 'Creating %s directory (%s)' % (path, self.paths[path]))
        if not os.path.exists(self.paths[path]):
            os.makedirs(self.paths[path])
    # If starting the blat server then return.
    if self.fncCmd == 'start_blat_server':
        utils.log(self.loggingName, 'info', 'Starting the blat server.')
        return
    self.check_binaries()  # Check if Jellyfish and Cutadapt work.
    self.filter = resultfilter.ResultFilter(self.get_param('filterList'), self)  # Instantiate the filter class.
    self.set_insertsize_thresh()  # Set the expected insert size threshold from the properly mapped read pairs.

def add_regions(self, regions_bed_fn):
    """Add other target regions from a bed file."""
    region_f = open(regions_bed_fn, 'rU')
    region_lines = region_f.readlines()
    for line in region_lines:
        line = line.strip()
        chrom, start, end, name = line.split()
        if name not in self.genes:
            self.genes[name] = [chrom, int(start), int(end)]
    utils.log(self.logging_name, 'info', 'Adding in %d other target regions' % len(region_lines))

def write_contig_values(self, contig_id, cluster_fn, target_contig_path):
    """Write the contig kmer cluster, read fastq, and contig fasta files."""
    self.contig_id = contig_id
    self.file_path = os.path.join(target_contig_path, contig_id)
    utils.log(self.logging_name, 'info', 'Setting up contig path %s' % self.file_path)
    if not os.path.exists(self.file_path):
        os.makedirs(self.file_path)
    self.write_cluster_file(cluster_fn)
    self.write_read_fq(os.path.join(self.file_path, contig_id + ".fq"))
    self.write_contig_fa()

def get_seq_complexity(self):
    """Get the 3-mer complexity of the shortest aligned blat sequence."""
    blatResult, nBasesAligned = self.blatResultsSorted[0]
    alignedSeq = self.contig.seq[blatResult.qstart():blatResult.qend()]
    merSize = 3
    utils.log(self.loggingName, 'debug', 'Checking sequence complexity of blat result segment %s using %d-mers' % (alignedSeq, merSize))
    nmers = {}
    totalMersPossible = len(alignedSeq) - (merSize - 1)
    for i in range(len(alignedSeq) - (merSize - 1)):
        nmers[str(alignedSeq[i:i + merSize]).upper()] = True
    complexity = round((float(len(nmers)) / float(totalMersPossible)) * 100, 4)
    utils.log(self.loggingName, 'debug', 'Complexity measure %f, based on %d unique %d-mers observed out of a total of %d %d-mers possible' % (complexity, len(nmers), merSize, totalMersPossible, merSize))
    return complexity

def write_output(self):
    """Write the aggregated SV results to the analysis output file."""
    res_fn = os.path.join(self.params.paths['output'], self.params.opts['analysis_name'] + "_svs.out")
    utils.log(self.logging_name, 'info', 'Writing results to file: %s' % res_fn)
    result_file = open(res_fn, 'w')
    header = "\t".join(['genes', 'target_breakpoints', 'mismatches', 'strands', 'total_matching', 'sv_type', 'sv_subtype', 'split_read_count', 'disc_read_count', 'breakpoint_coverages', 'contig_id', 'contig_seq']) + "\n"
    result_file.write(header)
    for result_str in self.results:
        result_file.write(result_str)
    result_file.close()

def filter_indel(self, svEvent):
    """Check the indel event values against the filtering thresholds and record filter reasons."""
    indelSizeThresh = int(self.params.get_param('indel_size'))
    utils.log(self.loggingName, 'info', 'Checking if blat result contains an indel variant')
    blatResult = svEvent.blatResults[0][1]
    keep_br = blatResult.valid and blatResult.alignFreq < 2 and blatResult.in_target and (blatResult.indel_maxevent_size[0] >= indelSizeThresh)
    utils.log(self.loggingName, 'debug', 'Keep blat result %r' % keep_br)
    # Determine the uniqueness of the realignment.
    svFilterValues = svEvent.resultValues.filterValues
    uniqRealignment = svFilterValues.realignFreq < 2
    indelSize = svFilterValues.maxEventSize >= indelSizeThresh
    brkptCoverages = svFilterValues.brkptCoverages[0] >= self.params.get_sr_thresh('indel')
    minFlankMatches = min(svFilterValues.flankMatchPercents) >= 10.0
    if uniqRealignment and indelSize and brkptCoverages and minFlankMatches:
        utils.log(self.loggingName, 'debug', 'Indel meets basic filtering requirements.')
    else:
        utils.log(self.loggingName, 'debug', 'Indel filtered due to non-unique realignment (%r), less than input size threshold (%r), low coverage at breakpoints (%r), or contig edge realignment not long enough (%r); filter status set to True.' % (uniqRealignment, indelSize, brkptCoverages, minFlankMatches))
        filterReasons = []
        if not uniqRealignment:
            filterReasons.append('Non-unique realignment (%d) > 2' % svFilterValues.realignFreq)
        if not indelSize:
            filterReasons.append('Max indel size (%d) is less than %d' % (svFilterValues.maxEventSize, indelSizeThresh))
        if not brkptCoverages:
            filterReasons.append('Minimum coverage at breakpoints (%d) less than input threshold %d' % (svFilterValues.brkptCoverages[0], self.params.get_sr_thresh('indel')))
        if not minFlankMatches:
            filterReasons.append('Minimum percentage of contig sequence that realigns to the reference to the left or right of the indel event is less than 10.0 percent (%d)' % min(svFilterValues.flankMatchPercents))
        svEvent.set_filtered(','.join(filterReasons))

def align(self, alignParams, scope):
    """Run the realignment program (blat or blast) on the contig sequence."""
    self.alignParams = alignParams
    alignProgram, alignExt, alignBinary, binaryParams, alignRef = self.alignParams
    self.scope = scope
    self.resultFn = os.path.join(self.contig.get_path(), '%s_res.%s.%s' % (alignProgram, scope, alignExt))
    utils.log(self.loggingName, 'info', 'Running realignment with %s, storing results in %s' % (alignProgram, self.resultFn))
    cmd = ''
    if alignProgram == 'blast':
        cmd = "%s -task 'blastn-short' -db %s -query %s -evalue 0.01 -out %s -outfmt '7 qseqid sseqid pident qlen length mismatch gapopen qstart qend sstart send evalue bitscore gaps sstrand qseq sseq'" % (alignBinary, alignRef, self.contig.meta.fa_fn, self.resultFn)
    elif alignProgram == 'blat':
        if scope == 'genome':
            # Realign against the whole genome through the blat server.
            cmd = '%s -t=dna -q=dna -out=psl -minScore=20 -nohead %s %d %s %s %s' % (alignBinary, binaryParams['hostname'], binaryParams['port'], alignRef, self.contig.meta.fa_fn, self.resultFn)
        elif scope == 'target':
            # Realign against the target region reference sequence.
            cmd = '%s -t=dna -q=dna -out=psl -minScore=20 -stepSize=10 -minMatch=2 -repeats=lower -noHead %s %s %s' % (alignBinary, alignRef, self.contig.meta.fa_fn, self.resultFn)
    utils.log(self.loggingName, 'info', 'Realignment system command %s' % cmd)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, errors = p.communicate()
    utils.log(self.loggingName, 'info', 'Realignment output file %s' % self.resultFn)
    if errors != '':
        utils.log(self.loggingName, 'info', 'Realignment errors %s' % errors)
    if not os.path.isfile(self.resultFn):
        return False
    else:
        self.results = AlignResults(alignProgram, scope, self.resultFn, self.contig, alignRef)
    return True

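# Illustrative expansions (not part of BreaKmer) of the command templates above for a
# hypothetical contig fasta 'contig1.fa'; binary paths, the gfServer host/port, and the
# reference locations are all assumptions:
#   blat, target scope:
#     blat -t=dna -q=dna -out=psl -minScore=20 -stepSize=10 -minMatch=2 -repeats=lower \
#          -noHead target_ref.fa contig1.fa blat_res.target.psl
#   blat, genome scope (gfClient talking to a running gfServer):
#     gfClient -t=dna -q=dna -out=psl -minScore=20 -nohead localhost 8888 \
#          /path/to/ref_dir contig1.fa blat_res.genome.psl
#   blast:
#     blastn -task 'blastn-short' -db target_db -query contig1.fa -evalue 0.01 \
#          -out blast_res.target.txt -outfmt '7 qseqid sseqid pident ...'
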
def target_hit(self):
    """Determine whether the realignment results explain the full contig query, merging gapped Blast segments."""
    if self.hasResults:
        cond1 = self.results[0].spans_query() and (self.ngaps > 0)
        cond2 = (len(self.results) == 1) and self.get_query_coverage() >= 90.0 and (self.ngaps > 0)
        self.targetHit = cond1 or cond2
        utils.log(self.loggingName, 'debug', 'Checking if query is a target hit or not %r' % self.targetHit)
        if self.targetHit:
            if ((len(self.results) > 1) and (self.get_query_coverage() >= 90.0)) and (self.program == 'blast'):
                # Check for a gapped Blast result by collecting non-redundant aligned segments.
                segments = []
                for i, result in enumerate(self.results):
                    resultOverlap = 0
                    addSegment = True
                    for segment in segments:
                        overlapSeg = (result.qstart() >= segment[0] and result.qstart() <= segment[1]) or (result.qend() >= segment[0] and result.qend() <= segment[1])
                        containSeg = result.qstart() >= segment[0] and result.qend() >= segment[1]
                        withinSeg = result.qstart() >= segment[0] and result.qend() <= segment[1]
                        if containSeg or withinSeg:
                            addSegment = False
                        elif overlapSeg:
                            if result.qstart() >= segment[0] and result.qstart() <= segment[1]:
                                overlapBp = segment[1] - result.qstart()
                                if overlapBp < 20:
                                    resultOverlap += overlapBp
                                    addSegment = True
                            elif result.qend() >= segment[0] and result.qend() <= segment[1]:
                                overlapBp = result.qend() - segment[0]
                                if overlapBp < 20:
                                    resultOverlap += overlapBp
                                    addSegment = True
                    if addSegment and (result.get_query_span() - resultOverlap) > 20:
                        segments.append((result.qstart(), result.qend(), result))
                self.targetSegmentsSorted = sorted(segments, key=lambda x: x[0])
                for i in range(1, len(self.targetSegmentsSorted)):
                    lResult = self.targetSegmentsSorted[i - 1][2]
                    rResult = self.targetSegmentsSorted[i][2]
                    qgap = rResult.qstart() - lResult.qend()
                    tgap = rResult.tstart() - lResult.tend()
                    if (tgap < 0 and (abs(tgap) > abs(qgap))) or (lResult.strand != rResult.strand):
                        # Tandem dup or inversion; stop merging records.
                        break
                    else:
                        self.mergedRecords.append((i - 1, i))
    return self.targetHit

def check_read_strands(self, sv_result):
    """Check whether all reads in the result contig are on the same strand."""
    same_strand = False
    strands = []
    for read in sv_result.contig.reads:
        strand = read.id.split("/")[1]
        strands.append(strand)
    if len(set(strands)) == 1:
        same_strand = True
    utils.log(self.logging_name, 'debug', 'Checking read strands for contig reads %s' % (",".join([read.id for read in sv_result.contig.reads])))
    utils.log(self.logging_name, 'debug', 'Reads are on same strand: %r' % same_strand)
    return same_strand

def resolve_sv_calls(self, contigs, target_ref_fn, target_region_values, disc_reads):
    """Realign each assembled contig and make SV calls from the results."""
    sv_results = []
    for assembled_contig in contigs:
        utils.log(self.logging_name, 'info', 'Assessing contig %s' % assembled_contig.seq.value)
        realignment_set = self.align_manager.realignment(assembled_contig, target_ref_fn, target_region_values)
        sv_result = results.SVResult(self.make_call(assembled_contig, target_region_values, realignment_set), self.params.get_param('sample_bam_file'), assembled_contig, target_region_values, disc_reads)
        if not sv_result.filter:
            self.filter_manager.filter_result(sv_result)
        if not sv_result.filter:
            sv_results.append(sv_result)
    return sv_results

def make_call(self, contig, region_values, realignment_result_set):
    """Check the realignment results for an indel or SV event and return it."""
    sv_event = None
    if not realignment_result_set.has_results:
        utils.log(self.logging_name, 'info', 'No blat results file %s, no calls for %s.' % (self.align_manager.query_res_fn, contig.contig_id))
    else:
        utils.log(self.logging_name, 'info', 'Making variant calls from blat results %s' % self.align_manager.query_res_fn)
        if realignment_result_set.has_indel():
            sv_event = realignment_result_set.sv_event
        elif realignment_result_set.check_svs():
            sv_event = realignment_result_set.sv_event
    return sv_event

def run(self):
    """Create and analyze the target regions.

    The target objects are made and grouped for multiprocessing (if set) and these
    are all analyzed independently. This is where the analysis starts and ends.

    Args:
        None
    Returns:
        None
    """
    startTime = time.clock()  # Track the run time.
    self.params.start_blat_server()
    if self.params.fncCmd == 'start_blat_server':
        print 'Server started!'
        return
    targetAnalysisList = self.create_targets()
    # Buffer the formatted output strings for each target to write out in batch.
    aggResults = {'contigs': [], 'discreads': []}
    nprocs = int(self.params.get_param('nprocs'))
    if nprocs > 1:
        # Make use of multiprocessing by mapping targets to n jobs.
        utils.log(self.loggingName, 'info', 'Creating all reference data.')
        p = multiprocessing.Pool(nprocs)
        multiprocResults = []
        for targetList in targetAnalysisList:
            multiprocResults.append(p.apply_async(analyze_targets, (targetList, )))
        wait(multiprocResults)
        for multiprocResult in multiprocResults:
            a = multiprocResult.get()
            aggResults['contigs'].extend(a['contigs'])
            aggResults['discreads'].extend(a['discreads'])
    else:
        aggResults = analyze_targets(targetAnalysisList)
    if self.params.fncCmd == 'prepare_reference_data':
        print 'Reference data setup!'
        return
    self.write_aggregated_output(aggResults)
    utils.log(self.loggingName, 'info', 'Analysis complete in %s' % str(time.clock() - startTime))
    if not self.params.get_param('keep_blat_server'):
        # Stop the blat server unless the option to keep it running was specified.
        cmd = '%s stop %s %d' % (self.params.get_param('gfserver'), self.params.get_param('blat_hostname'), int(self.params.get_param('blat_port')))
        os.system(cmd)
    print 'Analysis complete!'

def write_results(self):
    """Write the formatted SV results for this target to its output file."""
    res_fn = os.path.join(self.paths['output'], self.name + "_svs.out")
    utils.log(self.logging_name, 'info', 'Writing results to file: %s' % res_fn)
    result_file = open(res_fn, 'w')
    header = "\t".join(['genes', 'target_breakpoints', 'mismatches', 'strands', 'total_matching', 'sv_type', 'sv_subtype', 'split_read_count', 'disc_read_count', 'breakpoint_coverages', 'contig_id', 'contig_seq']) + "\n"
    result_file.write(header)
    for res in self.results:
        formatted_result_str = res.get_output_string()
        result_file.write(formatted_result_str)
        self.formatted_results.append(formatted_result_str)
    result_file.close()

def target_aligned(self):
    """Check whether the target realignment is sufficient or a genome realignment is needed."""
    noAlignmentResults = False
    if self.results is None:
        noAlignmentResults = True
    else:
        self.results.modify_blat_result_file()
        if self.results.target_hit():
            self.targetHit = True
            utils.log(self.loggingName, 'debug', 'Top hit contains whole query sequence, indicating an indel variant within the target region.')
    # If there was a sufficient target hit, or no alignment at all, then return True;
    # this effectively prevents a genome alignment.
    return self.targetHit or noAlignmentResults

def filter_rearr(self, svEvent):
    """Check the rearrangement values against the filtering thresholds and record filter reasons."""
    svFilterValues = svEvent.resultValues.filterValues
    missingQueryCoverage = svFilterValues.missingQueryCoverage < self.params.get_min_segment_length('rearr')
    brkptCoverages = svFilterValues.brkptCoverages[0] >= self.params.get_sr_thresh('rearrangement')
    minSegmentLen = svFilterValues.minSegmentLen >= self.params.get_min_segment_length('rearr')
    minBrkptKmers = svFilterValues.minBrkptKmers > 0
    if missingQueryCoverage and brkptCoverages and minSegmentLen and minBrkptKmers:
        utils.log(self.loggingName, 'info', 'Rearrangement meets basic filtering requirements.')
    else:
        filteredReasons = []
        if not missingQueryCoverage:
            logMsg = 'No realignment for %d bases in the contig sequence, more than threshold %d.' % (svFilterValues.missingQueryCoverage, self.params.get_min_segment_length('rearr'))
            utils.log(self.loggingName, 'info', logMsg)
            filteredReasons.append(logMsg)
        if not brkptCoverages:
            logMsg = 'Minimum coverage at breakpoints (%d) less than input threshold %d.' % (svFilterValues.brkptCoverages[0], self.params.get_sr_thresh('rearrangement'))
            utils.log(self.loggingName, 'info', logMsg)
            filteredReasons.append(logMsg)
        if not minSegmentLen:
            logMsg = 'The minimum realigned segment length (%d) is less than the input threshold %d.' % (svFilterValues.minSegmentLen, self.params.get_min_segment_length('rearr'))
            utils.log(self.loggingName, 'info', logMsg)
            filteredReasons.append(logMsg)
        if not minBrkptKmers:
            logMsg = 'There were no variant kmers at one or more of the breakpoint locations.'
            utils.log(self.loggingName, 'info', logMsg)
            filteredReasons.append(logMsg)
        svEvent.set_filtered(','.join(filteredReasons))

def set_ref_data(self):
    """Write the forward and reverse target reference fasta files if needed."""
    for target_refseq_fn in self.files['target_ref_fn']:
        direction = "forward" if target_refseq_fn.find("forward") != -1 else "reverse"
        utils.log(self.logging_name, 'info', 'Extracting refseq sequence and writing %s' % target_refseq_fn)
        utils.extract_refseq_fa(self.get_values(), self.paths['ref_data'], self.params.get_param('reference_fasta'), direction, target_refseq_fn, self.params.get_param('buffer_size'))

def add_path(self, key, path):
    """Utility function to create all the output directories.

    Args:
        key (str): String value to store the file path value.
        path (str): File path value.
    Returns:
        None
    Raises:
        None
    """
    utils.log(self.logging_name, 'info', 'Creating %s %s path (%s)' % (self.name, key, path))
    self.paths[key] = path
    if not os.path.exists(self.paths[key]):
        os.makedirs(self.paths[key])

def compare_kmers(self):
    """Compare the sample kmers against the reference (and optional normal) kmers to find sample-only kmers."""
    self.kmers['ref'] = {}
    jellyfish = self.params.get_param('jellyfish')
    kmer_size = int(self.params.get_param('kmer_size'))
    for target_ref_fn in self.files['target_ref_fn']:
        utils.log(self.logging_name, 'info', 'Indexing kmers for reference sequence %s' % target_ref_fn)
        self.kmers['ref'] = utils.load_kmers(utils.run_jellyfish(target_ref_fn, jellyfish, kmer_size), self.kmers['ref'])
    utils.log(self.logging_name, 'info', 'Indexing kmers for sample sequence %s' % self.files['sv_cleaned_fq'])
    self.kmers['case'] = utils.load_kmers(utils.run_jellyfish(self.files['sv_cleaned_fq'], jellyfish, kmer_size), {})
    self.kmers['case_sc'] = utils.load_kmers(utils.run_jellyfish(self.files['sv_sc_unmapped_fa'], jellyfish, kmer_size), {})
    sc_mers = set(self.kmers['case'].keys()) & set(self.kmers['case_sc'])
    sample_only_mers = list(sc_mers.difference(set(self.kmers['ref'].keys())))
    if 'normal_bam_file' in self.params.opts:
        norm_kmers = utils.load_kmers(utils.run_jellyfish(self.files['norm_cleaned_fq'], jellyfish, kmer_size), {})
        sample_only_mers = list(set(sample_only_mers).difference(set(norm_kmers.keys())))
    # Write case-only kmers out to file.
    self.files['sample_kmers'] = os.path.join(self.paths['kmers'], self.name + "_sample_kmers.out")
    sample_kmer_fout = open(self.files['sample_kmers'], 'w')
    self.kmers['case_only'] = {}
    for mer in sample_only_mers:
        sample_kmer_fout.write("\t".join([str(x) for x in [mer, str(self.kmers['case'][mer])]]) + "\n")
        self.kmers['case_only'][mer] = self.kmers['case'][mer]
    sample_kmer_fout.close()
    self.kmers['ref'] = {}
    self.kmers['case'] = {}
    self.kmers['case_sc'] = {}
    utils.log(self.logging_name, 'info', 'Writing %d sample-only kmers to file %s' % (len(self.kmers['case_only']), self.files['sample_kmers']))
    self.files['kmer_clusters'] = os.path.join(self.paths['kmers'], self.name + "_sample_kmers_merged.out")
    utils.log(self.logging_name, 'info', 'Writing kmer clusters to file %s' % self.files['kmer_clusters'])
    self.contigs = assembler.init_assembly(self.kmers['case_only'], self.cleaned_read_recs['sv'], kmer_size, int(self.params.get_param('trl_sr_thresh')), self.params.get_param('read_len'))
    self.cleaned_read_recs = None
    self.kmers['case_only'] = {}
    self.finalize_contigs()

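# Illustrative sketch (not part of BreaKmer): the kmer subtraction above keeps kmers seen
# in both the cleaned sample reads and the soft-clipped/unmapped reads, minus anything in
# the reference and, when a normal bam is provided, minus the normal kmers. As set algebra:
def case_only_kmers(case, case_sc, ref, normal=None):
    """case/case_sc/ref/normal map kmer -> count; returns the sample-only kmer set."""
    sample_only = (set(case) & set(case_sc)) - set(ref)
    if normal is not None:
        sample_only -= set(normal)
    return sample_only

# Example: 'ACG' appears in both case sets and not in the reference, so it survives.
assert case_only_kmers({'ACG': 5, 'CGT': 2}, {'ACG': 3}, {'CGT': 9}) == {'ACG'}
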
def write_aggregated_output(self, aggregateResults):
    """Write the SV calls to top-level files in the specified output directory.

    A header is written at the top of each file unless the option to remove it is
    specified. The output files are:
        <output_dir>/<analysis_name>_svs.out
        <output_dir>/<analysis_name>_discreads.out

    Args:
        aggregateResults (dict): A dictionary containing the formatted output string values.
    Returns:
        None
    """
    # Write assembled contig-based SV calls.
    if len(aggregateResults['contigs']) > 0:
        allResultFn = os.path.join(self.params.paths['output'], self.params.get_param('analysis_name') + "_svs.all.out")
        filteredResultFn = os.path.join(self.params.paths['output'], self.params.get_param('analysis_name') + "_svs.out")
        utils.log(self.loggingName, 'info', 'Writing %s aggregated results files: all results - %s and filtered results - %s' % (self.params.get_param('analysis_name'), allResultFn, filteredResultFn))
        allResultFile = open(allResultFn, 'w')
        filteredResultFile = open(filteredResultFn, 'w')
        for i, formattedResultStr in enumerate(aggregateResults['contigs']):
            headerStr, formattedResultValuesStr = formattedResultStr
            if not self.params.get_param('no_output_header') and i == 0:
                allResultFile.write(headerStr + '\n')
                filteredResultFile.write(headerStr + '\n')
            allResultFile.write(formattedResultValuesStr + '\n')
            resultValues = formattedResultValuesStr.split('\t')
            if resultValues[-3] != "True":
                # Keep calls whose filter flag column is not set to "True".
                filteredResultFile.write(formattedResultValuesStr + '\n')
        allResultFile.close()
        filteredResultFile.close()
    # Write discordant read pair clusters.
    if len(aggregateResults['discreads']) > 0:
        resultFn = os.path.join(self.params.paths['output'], self.params.get_param('analysis_name') + "_discreads.out")
        utils.log(self.loggingName, 'info', 'Writing %s aggregated results file %s' % (self.params.get_param('analysis_name'), resultFn))
        resultFile = open(resultFn, 'w')
        for i, formattedResultStr in enumerate(aggregateResults['discreads']):
            headerStr, formattedResultValuesStr = formattedResultStr
            if not self.params.get_param('no_output_header') and i == 0:
                resultFile.write(headerStr + '\n')
            resultFile.write(formattedResultValuesStr + '\n')
        resultFile.close()

def missing_query_coverage(self, sv_result):
    """Calculate the percentage of the contig sequence left uncovered by the realignment at its ends."""
    missing_cov = 0
    for i in sv_result.query_cov:
        if i == 0:
            missing_cov += 1
        else:
            break
    for i in reversed(sv_result.query_cov):
        if i == 0:
            missing_cov += 1
        else:
            break
    perc_missing = round((float(missing_cov) / float(len(sv_result.contig.seq.value))) * 100, 4)
    utils.log(self.logging_name, 'debug', 'Calculated %f missing coverage of blat query sequence at beginning and end' % perc_missing)
    return perc_missing

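# Illustrative sketch (not part of BreaKmer): missing_query_coverage() only counts
# uncovered bases at the two ends of the coverage vector, stopping at the first covered
# base from each side. The same walk as a standalone helper:
def end_gap_percent(query_cov):
    missing = 0
    for depth in query_cov:
        if depth != 0:
            break
        missing += 1
    for depth in reversed(query_cov):
        if depth != 0:
            break
        missing += 1
    return round((float(missing) / float(len(query_cov))) * 100, 4)

# Example: 2 uncovered bases at the start and 1 at the end of a 10-base contig -> 30%.
assert end_gap_percent([0, 0, 3, 5, 5, 5, 4, 2, 1, 0]) == 30.0
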
def add_genes(self, gene_fn):
    """Parse a gene annotation file and store the gene coordinates."""
    utils.log(self.logging_name, 'info', 'Adding gene annotations from %s' % gene_fn)
    gene_f = open(gene_fn, 'r')
    gene_flines = gene_f.readlines()
    for line in gene_flines[1:]:
        line = line.strip()
        linesplit = line.split()
        chrom = linesplit[2]
        start = int(linesplit[4])
        end = int(linesplit[5])
        geneid = linesplit[12]
        if geneid in self.genes:
            # Keep the widest interval seen for a gene ID.
            if start <= self.genes[geneid][1] and end >= self.genes[geneid][2]:
                self.genes[geneid] = [chrom, start, end]
        else:
            self.genes[geneid] = [chrom, start, end]
    gene_f.close()

def check_svs(self):
    """Check the realignment results for structural variant events."""
    utils.log(self.logging_name, 'info', 'Checking for SVs')
    gaps = [(0, self.qsize)]
    utils.log(self.logging_name, 'debug', 'Iterating through %d clipped blat results.' % len(self.realignments))
    merged_clip = [0, None]
    for i, realigned_segment in enumerate(self.realignments):
        utils.log(self.logging_name, 'debug', 'Blat result with start %d, end %d, chrom %s' % (realigned_segment.qstart(), realigned_segment.qend(), realigned_segment.get_name('hit')))
        gaps = self.iter_gaps(gaps, realigned_segment, i)
        if self.sv_event.qlen > merged_clip[0]:
            merged_clip = [self.sv_event.qlen, self.sv_event]
    self.sv_event = merged_clip[1]
    valid_result = self.sv_event is not None and (len(self.sv_event.realignments) > 1) and self.sv_event.in_target
    if valid_result:
        self.sv_event.set_event_type('rearrangement')
    return valid_result

def filter_rearr(self, sv_result):
    """Check the rearrangement result values against the filtering thresholds."""
    in_ff, span_ff = utils.filter_by_feature(sv_result.breakpoint_values['ref_pos'], sv_result.query_region, self.params.get_param('keep_intron_vars'))
    match_sorted_realignments = sorted(sv_result.sv_event.realignments, key=lambda x: x.get_nmatch_total())
    top_realigned_segment = match_sorted_realignments[0]
    check1 = min(sv_result.breakpoint_values['counts']['n']) < self.params.get_param('rearr_sr_thresh')
    check2 = top_realigned_segment.get_nmatch_total() < self.params.get_param('rearr_minseg_len')
    check3 = (in_ff and span_ff)
    check4 = (sv_result.values['disc_read_count'] < 1)
    check5 = (sv_result.values['sv_subtype'] == 'NA')
    check6 = (min(sv_result.breakpoint_values['kmers']) == 0)
    filter_result = check1 or check2 or check3 or check4 or check5 or check6
    utils.log(self.logging_name, 'info', 'Check filter for rearrangement')
    utils.log(self.logging_name, 'info', 'Filter by feature for being in exon (%r) or spanning exon (%r)' % (in_ff, span_ff))
    utils.log(self.logging_name, 'info', 'Split read threshold %d, breakpoint read counts %d' % (self.params.get_param('rearr_sr_thresh'), min(sv_result.breakpoint_values['counts']['n'])))
    utils.log(self.logging_name, 'info', 'Minimum segment length observed (%d) meets threshold (%d)' % (top_realigned_segment.get_nmatch_total(), self.params.get_param('rearr_minseg_len')))
    utils.log(self.logging_name, 'info', 'Minimum discordant read pairs for rearrangement (%d)' % (sv_result.values['disc_read_count']))
    return filter_result

def iter_gaps(self, gaps, realigned_segment, segment_idx):
    """Iterate over the unexplained gaps in the contig and check whether the realigned segment fills one."""
    new_gaps = []
    segment_start = realigned_segment.qstart()
    segment_end = realigned_segment.qend()
    hit = False
    for gap in gaps:
        gap_start, gap_end = gap
        utils.log(self.logging_name, 'debug', 'Gap coords %d, %d' % (gap_start, gap_end))
        # Options to consider when deciding to add a segment to the reconstruction.
        start_within_gap = (segment_start >= gap_start and segment_start <= gap_end)
        end_within_gap = (segment_end <= gap_end and segment_end >= gap_start)
        gap_edge_dist_start = (segment_start <= gap_start) and ((gap_start - segment_start) < 15)
        gap_edge_dist_end = (segment_end >= gap_end) and ((segment_end - gap_end) < 15)
        opt1 = (gap_edge_dist_start and (end_within_gap or gap_edge_dist_end))
        opt2 = (gap_edge_dist_end and (start_within_gap or gap_edge_dist_start))
        if start_within_gap or end_within_gap or opt1 or opt2:
            ngap = []
            if segment_start > gap_start:
                if (segment_start - 1 - gap_start) > 10:
                    ngap.append((gap_start, segment_start - 1))
            if segment_end < gap_end:
                if (gap_end - segment_end + 1) > 10:
                    ngap.append((segment_end + 1, gap_end))
            if segment_idx == 0:
                utils.log(self.logging_name, 'debug', 'Creating SV event from blat result with start %d, end %d' % (segment_start, segment_end))
                self.sv_event = results.SVEvent(realigned_segment)
                new_gaps.extend(ngap)
                hit = True
            elif self.check_add_br(segment_start, segment_end, gap_start, gap_end, realigned_segment):
                utils.log(self.logging_name, 'debug', 'Adding blat result to event')
                new_gaps.extend(ngap)
                self.sv_event.add(realigned_segment)
                hit = True
            else:
                new_gaps.append(gap)
        else:
            new_gaps.append(gap)
    utils.log(self.logging_name, 'debug', 'New gap coords %s' % (",".join([str(x) for x in new_gaps])))
    if not hit:
        self.sv_event.check_previous_add(realigned_segment)
    return new_gaps

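# Illustrative sketch (not part of BreaKmer): when a realigned segment lands inside an
# unexplained gap, iter_gaps() splits that gap into the leftover pieces on either side of
# the segment, discarding slivers of 10 bp or less. The splitting arithmetic in isolation:
def split_gap(gap_start, gap_end, segment_start, segment_end, min_piece=10):
    pieces = []
    if segment_start > gap_start and (segment_start - 1 - gap_start) > min_piece:
        pieces.append((gap_start, segment_start - 1))
    if segment_end < gap_end and (gap_end - segment_end + 1) > min_piece:
        pieces.append((segment_end + 1, gap_end))
    return pieces

# Example: a segment covering 40-60 inside a 0-100 gap leaves two pieces to explain.
assert split_gap(0, 100, 40, 60) == [(0, 39), (61, 100)]
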
def iter_gaps(self, gaps, clippedQuerySeqVals, iterIdx):
    """Iterate over the unexplained gaps in the contig and check whether the clipped query segment fills one."""
    new_gaps = []
    qs, qe, blatResult, idx = clippedQuerySeqVals
    hit = False
    for gap in gaps:
        gs, ge = gap
        utils.log(self.loggingName, 'debug', 'Gap coords %d, %d' % (gs, ge))
        startWithinGap = (qs >= gs and qs <= ge)
        endWithinGap = (qe <= ge and qe >= gs)
        gapEdgeDistStart = (qs <= gs) and ((gs - qs) < 15)
        gapEdgeDistEnd = (qe >= ge) and ((qe - ge) < 15)
        if startWithinGap or endWithinGap or (gapEdgeDistStart and (endWithinGap or gapEdgeDistEnd)) or (gapEdgeDistEnd and (startWithinGap or gapEdgeDistStart)):
            ngap = []
            if qs > gs:
                if (qs - 1 - gs) > 10:
                    ngap.append((gs, qs - 1))
            if qe < ge:
                if (ge - qe + 1) > 10:
                    ngap.append((qe + 1, ge))
            if iterIdx == 0:
                utils.log(self.loggingName, 'debug', 'Creating SV event from blat result with start %d, end %d' % (qs, qe))
                self.svEvent = SVEvent(blatResult, self.contig, 'rearrangement')
                new_gaps.extend(ngap)
                hit = True
            elif self.check_add_br(qs, qe, gs, ge, blatResult):
                utils.log(self.loggingName, 'debug', 'Adding blat result to event')
                new_gaps.extend(ngap)
                self.svEvent.add(blatResult)
                hit = True
            else:
                new_gaps.append(gap)
        else:
            new_gaps.append(gap)
    utils.log(self.loggingName, 'debug', 'New gap coords %s' % (",".join([str(x) for x in new_gaps])))
    if not hit:
        self.svEvent.check_previous_add(blatResult)
    return new_gaps

def get_startend_missing_query_coverage(self):
    """Calculate the percentage of the contig sequence that is not realigned to the
    reference, examining only the beginning and end of the contig sequence.
    """
    missingCov = 0
    for i in self.queryCoverage:
        if i == 0:
            missingCov += 1
        else:
            break
    for i in reversed(self.queryCoverage):
        if i == 0:
            missingCov += 1
        else:
            break
    percentMissing = round((float(missingCov) / float(len(self.contig.seq))) * 100, 4)
    utils.log(self.loggingName, 'debug', 'Calculated %f missing coverage of blat query sequence at beginning and end' % percentMissing)
    return percentMissing

def extract_bam_reads(self, sampleType): """Wrapper for Variation extract_bam_reads function. Args: sampleType (str): Indicates a tumor ('sv') or normal ('norm') sample being processed. Return: None """ # Create the file paths for the files that will be created from the read extraction. self.variation.setup_read_extraction_files(sampleType, self.paths['data'], self.name) bamType = 'sample' if sampleType == 'norm': bamType = 'normal' bamFile = self.params.get_param('%s_bam_file' % bamType) utils.log( self.loggingName, 'info', 'Extracting bam reads from %s to %s' % (bamFile, self.variation.files['%s_fq' % sampleType])) self.variation.set_var_reads(sampleType, bamFile, self.chrom, self.start, self.end, self.regionBuffer)
def check_add_br(self, qs, qe, gs, ge, br):
    '''Determine whether a blat result should be added to the current SV event,
    based on how much of the segment overlaps the gap and how much it overlaps
    previously placed segments.
    '''
    utils.log(self.logging_name, 'info', 'Checking to add blat result with start %d, end %d' % (qs, qe))
    add = False
    # Calculate the percentage of the segment that overlaps with the gap.
    over_perc = round((float(min(qe, ge) - max(qs, gs)) / float(qe - qs)) * 100)
    # Check overlap with other aligned segments.
    ov_right = 0
    if qe > ge:
        ov_right = abs(qe - ge)
    ov_left = 0
    if qs < gs:
        ov_left = abs(qs - gs)
    br.set_segment_overlap(ov_left, ov_right)
    max_seg_overlap = max(ov_right, ov_left)
    utils.log(self.logging_name, 'debug', 'Blat query segment overlaps gap by %f' % over_perc)
    utils.log(self.logging_name, 'debug', 'Max segment overlap %f' % max_seg_overlap)
    utils.log(self.logging_name, 'debug', 'Event in target %r and blat result in target %r' % (self.sv_event.in_target, br.in_target))
    # Add the segment if it covers at least half of the gap and overhangs
    # previously placed segments by less than 15 bp, unless both the event
    # and the segment fall within the target region.
    if over_perc >= 50 and (max_seg_overlap < 15 or (br.in_target and self.sv_event.in_target)):
        add = True
    utils.log(self.logging_name, 'debug', 'Add blat result to SV event %r' % add)
    return add
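# Hypothetical standalone sketch (not from the original codebase) of the two
# overlap quantities computed in check_add_br above: the percentage of the
# query segment that falls inside the gap, and the maximum overhang past
# either gap edge (i.e. overlap with previously placed segments).
def _segment_gap_overlap_sketch(qs, qe, gs, ge):
    """Return (percent of segment inside the gap, max overhang past the gap)."""
    over_perc = round((float(min(qe, ge) - max(qs, gs)) / float(qe - qs)) * 100)
    ov_left = abs(qs - gs) if qs < gs else 0
    ov_right = abs(qe - ge) if qe > ge else 0
    return over_perc, max(ov_left, ov_right)

# Example: a segment at 20-80 against a gap at 30-100 sits inside the gap for
# round((80 - 30) / 60.0 * 100) == 83 percent with a 10 bp left overhang, so
# _segment_gap_overlap_sketch(20, 80, 30, 100) == (83.0, 10) and the segment
# would pass the over_perc >= 50 and max_seg_overlap < 15 checks above.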
def get_brkpt_counts_filt(self, brkpts, sv_type):
    '''Tally the read and kmer counts around each breakpoint and flag
    breakpoints that fall in low-complexity or repeat regions.
    '''
    avg_comp, comp_vec = utils.calc_contig_complexity(self.contig.seq.value)
    brkpt_rep_filt = False
    brkpt_counts = {'n': [], 'd': [], 'b': []}
    brkpt_kmers = []
    for qb in brkpts['q'][1]:
        # Examine a window of up to 5 bases on each side of the breakpoint.
        left_idx = qb[0] - min(qb[1], 5)
        right_idx = qb[0] + min(qb[2], 5)
        bc = self.contig.get_contig_counts().get_counts(left_idx, right_idx, sv_type)
        brkpt_counts['n'].append(min(bc))
        brkpt_counts['d'].append(min(self.contig.get_contig_counts().get_counts((qb[0] - 1), (qb[0] + 1), sv_type)))
        brkpt_counts['b'].append(self.contig.get_contig_counts().get_counts(qb[0], qb[0], sv_type))
        brkpt_kmers.append(self.contig.get_kmer_locs()[qb[0]])
        # Flag breakpoints in regions with less than half the average contig complexity.
        brkpt_rep_filt = brkpt_rep_filt or (comp_vec[qb[0]] < (avg_comp / 2))
        utils.log(self.logging_name, 'debug', 'Read count around breakpoint %d: %s' % (qb[0], ",".join([str(x) for x in bc])))
    utils.log(self.logging_name, 'debug', 'Kmer count around breakpoints %s' % (",".join([str(x) for x in brkpt_kmers])))
    brkpt_rep_filt = brkpt_rep_filt or (len(filter(lambda x: x, brkpts['f'])) > 0)
    return brkpt_counts, brkpt_kmers, brkpt_rep_filt
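# Hypothetical standalone sketch (not from the original codebase) of the
# breakpoint windowing used in get_brkpt_counts_filt above: take the minimum
# read count in a window of up to 5 bases on each side of a breakpoint. It
# works on a plain list of per-base counts in place of the contig counts
# object, and clamps the window to the list bounds for safety.
def _min_count_near_brkpt_sketch(counts, brkpt_idx, left_len=5, right_len=5):
    """Minimum count in the window around position `brkpt_idx`."""
    left_idx = max(brkpt_idx - min(left_len, 5), 0)
    right_idx = min(brkpt_idx + min(right_len, 5), len(counts) - 1)
    return min(counts[left_idx:right_idx + 1])

# Example: _min_count_near_brkpt_sketch([9, 9, 7, 2, 6, 9, 9], 3) == 2
# (the breakpoint at index 3 sits over the low-support base with count 2).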
def has_indel(self):
    '''Check whether the top realignment result represents an indel event.
    An indel is called when the first realigned segment contains gaps and
    either spans the full query sequence or is the only result and lies
    within the target region.
    '''
    has_indel = False
    # Only the top realignment result is examined for an indel.
    if self.realignments:
        realigned_segment = self.realignments[0]
        if realigned_segment.has_gaps() and (realigned_segment.spans_query() or (len(self.realignments) == 1 and realigned_segment.in_target)):
            has_indel = True
            utils.log(self.logging_name, 'info', 'Contig has indel, returning %r' % has_indel)
            self.sv_event = results.SVEvent(realigned_segment)
            self.sv_event.set_event_type('indel')
            return has_indel
    utils.log(self.logging_name, 'info', 'Contig does not have indel, return %r' % has_indel)
    return has_indel
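# Hypothetical one-line sketch (not from the original codebase) of the
# compound condition checked in has_indel above, with plain booleans in
# place of the realignment result object.
def _is_indel_sketch(has_gaps, spans_query, n_results, in_target):
    """True when a gapped top result spans the query, or is the sole in-target result."""
    return has_gaps and (spans_query or (n_results == 1 and in_target))

# Examples:
# _is_indel_sketch(True, True, 3, False) == True   (gapped and spans the query)
# _is_indel_sketch(True, False, 1, True) == True   (sole result within the target)
# _is_indel_sketch(False, True, 1, True) == False  (no gaps, so no indel)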
def get_param(self, key, required=False):
    """Get the parameter value in the self.opts dictionary. If the parameter
    is required but not available, log an error and exit the program.

    Args:
        key (str): The key in the opts dictionary to access the parameter value.
        required: Boolean value to indicate if the key is required to be in the dictionary or not.

    Returns:
        value (int, str, boolean): The value of the parameter if it is found. If the parameter is
                                   required and not found, the program will exit with an error. If
                                   the parameter is not required and not found, None is returned.

    Raises:
        None
    """
    value = None
    if key in self.opts:
        value = self.opts[key]
    elif required:
        utils.log(self.logging_name, 'error', 'Missing required parameter %s, exiting.' % key)
        sys.exit(1)
    return value
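# Hypothetical usage sketch for get_param above; the 'blast' and
# 'reference_fasta' keys are the ones used elsewhere in this module, and the
# wrapper function is illustrative only.
def _param_usage_sketch(params):
    blast_bin = params.get_param('blast')                   # optional: returns None when absent
    ref_fa = params.get_param('reference_fasta', True)      # required: exits with an error when absent
    return blast_bin, ref_fa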
def filter_indel(self, svEvent):
    """Apply the indel filters to the SV event and record the reasons for
    filtering when any of the checks fail.
    """
    indelSizeThresh = int(self.params.get_param('indel_size'))
    utils.log(self.loggingName, 'info', 'Checking if blat result contains an indel variant')
    blatResult = svEvent.blatResults[0][1]
    keep_br = blatResult.valid and blatResult.alignFreq < 2 and blatResult.in_target and (blatResult.indel_maxevent_size[0] >= indelSizeThresh)
    utils.log(self.loggingName, 'debug', 'Keep blat result %r' % keep_br)

    # Determine the uniqueness of the realignment and apply the remaining checks.
    svFilterValues = svEvent.resultValues.filterValues
    uniqRealignment = svFilterValues.realignFreq < 2
    indelSize = svFilterValues.maxEventSize >= indelSizeThresh
    brkptCoverages = svFilterValues.brkptCoverages[0] >= self.params.get_sr_thresh('indel')
    minFlankMatches = min(svFilterValues.flankMatchPercents) >= 10.0

    if uniqRealignment and indelSize and brkptCoverages and minFlankMatches:
        utils.log(self.loggingName, 'debug', 'Indel meets basic filtering requirements.')
    else:
        utils.log(self.loggingName, 'debug', 'Indel filtered due to non-unique realignment (%r), less than input size threshold (%r), low coverage at breakpoints (%r), or contig edge realignment not long enough (%r), filter status set to True.' % (uniqRealignment, indelSize, brkptCoverages, minFlankMatches))
        filterReasons = []
        if not uniqRealignment:
            filterReasons.append('Non-unique realignment (%d) > 2' % svFilterValues.realignFreq)
        if not indelSize:
            filterReasons.append('Max indel size (%d) is less than %d' % (svFilterValues.maxEventSize, indelSizeThresh))
        if not brkptCoverages:
            filterReasons.append('Minimum coverage at breakpoints (%d) less than input threshold %d' % (svFilterValues.brkptCoverages[0], self.params.get_sr_thresh('indel')))
        if not minFlankMatches:
            filterReasons.append('Minimum percentage of contig sequence realigned to the reference to the left or right of the indel event is less than 10.0 percent (%.2f)' % min(svFilterValues.flankMatchPercents))
        svEvent.set_filtered(','.join(filterReasons))
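# Hypothetical standalone sketch (not from the original codebase) of the four
# indel filter checks applied in filter_indel above, using plain values in
# place of the filterValues object; all names are illustrative.
def _indel_passes_sketch(realign_freq, max_event_size, brkpt_coverage,
                         flank_match_percents, indel_size_thresh, min_cov_thresh):
    """True when all four indel filter checks pass."""
    return (realign_freq < 2 and                       # unique realignment
            max_event_size >= indel_size_thresh and    # event large enough
            brkpt_coverage >= min_cov_thresh and       # breakpoint supported
            min(flank_match_percents) >= 10.0)         # both flanks realign well

# Example: _indel_passes_sketch(1, 12, 6, [45.0, 38.5], 10, 2) == True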
def multiple_genes(self, chrs, brkpts, anno_genes):
    '''Determine whether the SV breakpoints fall within multiple annotated genes.
    Breakpoints in a single gene, or in similarly named genes on the same
    chromosome within 10 Kb of each other, are not considered multi-gene events.
    '''
    mult_genes = True
    if len(set(anno_genes)) == 1:
        utils.log(self.logging_name, 'debug', 'One annotated gene among SV breakpoints: %s' % ",".join(anno_genes))
        mult_genes = False
    elif self.dup_gene_names(anno_genes) and len(set(chrs)) == 1 and ((max(brkpts) - min(brkpts)) < 10000):
        utils.log(self.logging_name, 'debug', 'Annotated genes are not the same, but are similar and breakpoints are less than 10 Kb from each other: %s' % ",".join(anno_genes))
        mult_genes = False
    utils.log(self.logging_name, 'debug', 'Test whether SV breakpoints are in multiple genes %r' % mult_genes)
    return mult_genes
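# Hypothetical standalone sketch (not from the original codebase) of the
# multi-gene decision in multiple_genes above. The prefix comparison stands in
# for dup_gene_names, treating shared base names (e.g. 'PAX5' vs 'PAX5-v2') as
# duplicates; the gene names and coordinates below are illustrative only.
def _multiple_genes_sketch(chrs, brkpts, anno_genes):
    """True when breakpoints hit genuinely different genes."""
    if len(set(anno_genes)) == 1:
        return False  # single annotated gene
    similar = all(g.split('-')[0] == anno_genes[0].split('-')[0] for g in anno_genes)
    if similar and len(set(chrs)) == 1 and (max(brkpts) - min(brkpts)) < 10000:
        return False  # similar names, same chromosome, within 10 Kb
    return True

# Examples:
# _multiple_genes_sketch(['9', '9'], [36933674, 36935412], ['PAX5', 'PAX5']) == False
# _multiple_genes_sketch(['9', '22'], [133589268, 23632600], ['ABL1', 'BCR']) == True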
def set_ref_data(self):
    """Write the reference sequence to a fasta file for this specific target if it does not exist.

    Args:
        None
    Returns:
        None
    Raises:
        None
    """
    # Write the reference fasta file if needed.
    for i in range(len(self.files['target_ref_fn'])):
        fn = self.files['target_ref_fn'][i]
        direction = "forward" if fn.find("forward") != -1 else "reverse"
        utils.log(self.loggingName, 'info', 'Extracting refseq sequence and writing %s' % fn)
        utils.extract_refseq_fa(self.values, self.paths['ref_data'], self.params.get_param('reference_fasta'), direction, fn)

    # If using blastn for target realignment, the blast db must be available.
    blastn = self.params.get_param('blast')
    if blastn is not None:
        # Check if the blast db files are available for each target.
        if not os.path.isfile(self.files['target_ref_fn'][0] + '.nin'):
            makedb = os.path.join(os.path.split(blastn)[0], 'makeblastdb')
            # Create the blast db.
            cmd = "%s -in %s -dbtype 'nucl' -out %s" % (makedb, self.files['target_ref_fn'][0], self.files['target_ref_fn'][0])
            utils.log(self.loggingName, 'info', 'Creating blast db files for target %s with reference file %s' % (self.name, self.files['target_ref_fn'][0]))
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            output, errors = p.communicate()
            if errors != '':
                utils.log(self.loggingName, 'error', 'Failed to make blast db files using reference file %s' % self.files['target_ref_fn'][0])
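# Hypothetical standalone sketch (not from the original codebase) of the
# makeblastdb command assembled in set_ref_data above: the makeblastdb binary
# is assumed to live alongside the configured blastn binary, and the db files
# are written next to the target fasta. The example paths are placeholders.
def _makeblastdb_cmd_sketch(blastn_path, target_fa):
    """Build the makeblastdb command used to index a target fasta."""
    import os
    makedb = os.path.join(os.path.split(blastn_path)[0], 'makeblastdb')
    return "%s -in %s -dbtype 'nucl' -out %s" % (makedb, target_fa, target_fa)

# Example:
# _makeblastdb_cmd_sketch('/opt/blast/bin/blastn', '/data/PAX5/forward.fa')
# == "/opt/blast/bin/makeblastdb -in /data/PAX5/forward.fa -dbtype 'nucl' -out /data/PAX5/forward.fa"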