Example 1
    def check_target_blat(self):

        '''Check whether the target blat hit sufficiently covers the query or
        whether the contig needs to be realigned against the whole genome.
        '''

        # am = sv_caller.align_manager(meta_dict)
        # hit, self.query_res_fn = am.check_target_results()

        hit = False
        utils.log(self.logging_name, 'info', 'Checking if target blat contains most of the query or if the whole genome needs to be queried.')
         
        if not self.realign_results.has_results: 
            self.query_res_fn = None
            hit = True
        else:
            self.realign_results.write_mod_result_file(self.query_res_fn + '.mod')
            if self.realign_results.target_hit():
                hit = True
                utils.log(self.logging_name, 'debug', 'Top hit contains whole query sequence, indel variant')
                self.query_res_fn += '.mod'
        return hit
Example 2
    def set_reference_kmers(self, targetRefFns):
        """Set the reference sequence kmers"""

        self.kmers['ref'] = {}
        for targetRefFn in targetRefFns:
            utils.log(self.loggingName, 'info', 'Indexing kmers for reference sequence %s' % targetRefFn)
            self.get_kmers(targetRefFn, self.kmers['ref'])
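The get_kmers helper referenced above is not shown. A minimal sketch of what such an indexer could do, assuming it records every k-length substring with a count (the function name, the fixed k, and the toy sequence are all hypothetical):

    def index_kmers(seq, k, kmer_counts):
        # Record every k-length substring of seq in kmer_counts (a dict of counts).
        for i in range(len(seq) - k + 1):
            mer = seq[i:i + k].upper()
            kmer_counts[mer] = kmer_counts.get(mer, 0) + 1

    kmers = {}
    index_kmers("ACGTACGT", 4, kmers)
    print(kmers)  # {'ACGT': 2, 'CGTA': 1, 'GTAC': 1, 'TACG': 1}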
Example 3
 def which_rearr(self, varReads, tcoords, qcoords, strands, brkpts):
     rearrValues = {'discReadCount': None, 'svType': 'rearrangement', 'svSubType': None, 'hit': False}
     if not self.check_overlap(tcoords[0], tcoords[1]):
         utils.log(self.loggingName, 'debug', 'Checking rearrangement svType, strand1 %s, strand2 %s, breakpt1 %d, breakpt2 %d' % (strands[0], strands[1], brkpts[0], brkpts[1]))
         if (strands[0] != strands[1]): # and (brkpts[0] < brkpts[1]):
             # Inversion
             # Get discordantly mapped read-pairs
             utils.log(self.loggingName, 'debug', 'Inversion event identified.')
             rearrValues['hit'] = True
             rearrValues['svSubType'] = 'inversion'
             rearrValues['discReadCount'] = varReads.check_inv_readcounts(brkpts)
         elif (strands[0] == strands[1]):
             tgap = brkpts[1] - brkpts[0]
             qgap = qcoords[1][0] - qcoords[0][1]
             if tgap < 0:
                 utils.log(self.loggingName, 'debug', 'Tandem duplication event identified.')
                 rearrValues['hit'] = True
                 rearrValues['svSubType'] = 'tandem_dup'
                 rearrValues['discReadCount'] = varReads.check_td_readcounts(brkpts)
             elif tgap > qgap:
                 # Gapped deletion from Blast result
                 utils.log(self.loggingName, 'debug', 'Deletion event identified.')
                 rearrValues['hit'] = True
                 rearrValues['svType'] = 'indel'
                 rearrValues['indelSize'] = 'D' + str(tgap)
             else:
                 # Gapped insertion from Blast result
                 utils.log(self.loggingName, 'debug', 'Insertion event identified.')
                 rearrValues['hit'] = True
                 rearrValues['svType'] = 'indel'
                 rearrValues['indelSize'] = 'I' + str(qgap)
     return rearrValues
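The classification above turns on two quantities: tgap, the span between the two reference breakpoints, and qgap, the gap between the two aligned query segments. The decision can be exercised standalone; the coordinates below are toy values, not output from the tool:

    def classify_same_strand(brkpts, qcoords):
        # Classify a same-strand split alignment by target gap vs. query gap.
        tgap = brkpts[1] - brkpts[0]          # span between reference breakpoints
        qgap = qcoords[1][0] - qcoords[0][1]  # gap between aligned query segments
        if tgap < 0:
            return 'tandem_dup'     # second breakpoint maps before the first
        elif tgap > qgap:
            return 'D' + str(tgap)  # more reference than query skipped: deletion
        else:
            return 'I' + str(qgap)  # more query than reference skipped: insertion

    print(classify_same_strand((100, 90), ((0, 50), (50, 100))))   # tandem_dup
    print(classify_same_strand((100, 250), ((0, 50), (50, 100))))  # D150
    print(classify_same_strand((100, 101), ((0, 50), (80, 130))))  # I30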
Example 4
    def resolve_sv(self):

        '''Resolve structural variants from the assembled contigs.
        '''

        utils.log(self.logging_name, 'info', 'Resolving structural variants from %d kmer clusters' % len(self.contigs))
        self.results = self.call_manager.resolve_sv_calls(self.contigs, self.files['target_ref_fn'][0], self.get_values(), self.disc_reads)
Example 5
    def set_values(self, contigId, params, queryRegionValues, contigPath, readVariation):
        """Sets the contig values after contig has been compeleted and ready for
        realignment.
        Args:
            contigId:           String containing contid ID.
            params:             Param object.
            queryRegionValues:  Tuple containing the target region information
            contigPath:         String of the path to the contig directory to store files.
        Return: None
        """
        self.params = params
        self.id = contigId
        self.readVariation = readVariation
        self.chr = queryRegionValues[0]
        self.start = int(queryRegionValues[1])
        self.end = int(queryRegionValues[2])
        self.targetName = queryRegionValues[3]
        self.regionBuffer = queryRegionValues[5]
        self.path = os.path.join(contigPath, self.id)
        logger = logging.getLogger('breakmer.assembly.contig')
        utils.log(self.loggingName, 'info', 'Setting up contig path %s' % self.path)

        if not os.path.exists(self.path):
            os.makedirs(self.path)
        self.fq_fn = os.path.join(contigPath, self.id, self.id + '.fq')
        self.fa_fn = os.path.join(contigPath, self.id, self.id + '.fa')
Example 6
 def check_indel(self, nBlatResults):
     indel = False
     utils.log(self.loggingName, 'info', 'Checking if blat result contains an indel variant')
     if (self.spans_query() or (nBlatResults == 1 and self.in_target)) and (self.ngaps > 0):
         utils.log(self.loggingName, 'info', 'Blat result spans query (%r) or only one blat result (%r) and blat result in target (%r)' % (self.spans_query(), (nBlatResults == 1), self.in_target))
         indel = True
     return indel
Example 7
  def get_brkpt_counts_filt(self, brkpts, sv_type):
    '''Collect read and kmer counts around each breakpoint and flag breakpoints
    that fall in low-complexity or simple-repeat sequence.
    '''
    avg_comp, comp_vec = utils.calc_contig_complexity(self.contig_seq)
    brkpt_rep_filt = False
    brkpt_counts = {'n': [], 'd': [], 'b': []}
    brkpt_kmers = []
    for qb in brkpts['q'][1]:
      left_idx = qb[0] - min(qb[1], 5)
      right_idx = qb[0] + min(qb[2], 5)
      bc = self.contig_rcounts.get_counts(left_idx, right_idx, sv_type)
      brkpt_counts['n'].append(min(bc))
      brkpt_counts['d'].append(min(self.contig_rcounts.get_counts((qb[0] - 1), (qb[0] + 1), sv_type)))
      brkpt_counts['b'].append(self.contig_rcounts.get_counts(qb[0], qb[0], sv_type))
      brkpt_kmers.append(self.contig_kmer_locs[qb[0]])
      # Flag breakpoints that land in sequence with less than half the average complexity.
      brkpt_rep_filt = brkpt_rep_filt or (comp_vec[qb[0]] < (avg_comp / 2))
      utils.log(self.logging_name, 'debug', 'Read count around breakpoint %d: %s' % (qb[0], ",".join([str(x) for x in bc])))
    utils.log(self.logging_name, 'debug', 'Kmer count around breakpoints %s' % (",".join([str(x) for x in brkpt_kmers])))
    brkpt_rep_filt = brkpt_rep_filt or (len(filter(lambda x: x, brkpts['f'])) > 0)  # Any breakpoint flagged by the simple repeat filter.
    return brkpt_counts, brkpt_kmers, brkpt_rep_filt
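utils.calc_contig_complexity is not shown here. The filter only needs a per-position complexity vector and its average; one plausible sketch of such a helper (an assumption about its behavior, not the actual implementation) scores each position by the diversity of 3-mers in a window around it:

    def calc_complexity(seq, window=20, k=3):
        # Return (average, per-position vector) of windowed k-mer diversity.
        scores = []
        for i in range(len(seq)):
            lo = max(0, i - window // 2)
            win = seq[lo:lo + window]
            mers = set(win[j:j + k] for j in range(len(win) - k + 1))
            possible = max(1, len(win) - k + 1)
            scores.append(100.0 * len(mers) / possible)
        return sum(scores) / len(scores), scores

    seq = "ACGTACGT" + "A" * 20 + "ACGTACGT"
    avg, vec = calc_complexity(seq)
    print(min(vec) < avg / 2)  # True: the poly-A run trips the repeat filter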
Example 8
def analyze_targets(targetList):
    """Analyze a list of targets.

    A list of TargetManager objects are passed in to be analyzed independently.
    Each target ref data is set, if necessary, then the reads are extracted,
    contigs built, and calls made.

    This function performs all the top level functions on the target regions being analyzed.

    Args:
        targetList (list):          A list of TargetManager objects, representing target regions.
    Returns:
        aggregateResults (dict):    A dictoinary containing lists of formatted output strings for the
                                    contig-based calls and the discordant-only read clusters.
    Raises:
        None
    """

    aggregateResults = {'contigs': [], 'discreads': []}  # Formatted output strings for contig based calls and discordant read calls are different.
    for targetRegion in targetList:
        utils.log('breakmer.processor.analysis', 'info', 'Analyzing %s' % targetRegion.name)
        targetRegion.set_ref_data()
        if targetRegion.fnc == 'prepare_reference_data':  # Stop here if only preparing ref data.
            continue
        if not targetRegion.find_sv_reads():  # No SV reads extracted. Exiting.
            continue
        targetRegion.compare_kmers()  # Perform kmer subtraction.
        targetRegion.resolve_sv()  # Assemble extracted reads and make calls.
        if targetRegion.has_results():
            outputs = targetRegion.get_formatted_output()
            for key in outputs:
                aggregateResults[key].extend(outputs[key])
        targetRegion.complete_analysis()  # Write results out to file.
    return aggregateResults
Example 9
    def realignment(self, contig, target_ref_fa_fn, target_region_values):

        '''Realign the contig sequence against the target reference and, if the
        target hit is insufficient, against the whole genome.
        '''

        if contig.contig_fa_fn is None:
            return

        self.query_res_fn = os.path.join(contig.file_path, 'blat_res.target.psl')
        realign_dict = {'binary': self.params.get_param('blat'),
                        'database': target_ref_fa_fn
                       }
        utils.run_blat(realign_dict, self.query_res_fn, contig.contig_fa_fn, 'target') # Run blat against target reference sequence first for speed.

        if not os.path.isfile(self.query_res_fn):  # Realignment produced no results file.
            utils.log(self.logging_name, 'info', 'No blat results file %s, no calls for %s.' % (self.query_res_fn, contig.contig_id))
            return
        self.realign_results = RealignResultSet(self.params, self.query_res_fn, target_region_values, 'target')
        if not self.check_target_blat():
            # Blat against whole genome reference fasta
            realign_dict = {'binary': self.params.get_param('gfclient'),
                            'blat_port': self.params.get_param('blat_port'),
                            'database': self.params.get_param('reference_fasta_dir')
                           }
            self.query_res_fn = os.path.join(contig.file_path, 'blat_res.genome.psl')
            utils.run_blat(realign_dict, self.query_res_fn, contig.contig_fa_fn, 'genome')
            self.realign_results = RealignResultSet(self.params, self.query_res_fn, target_region_values, 'genome')
        return self.realign_results
Example 10
  def target_hit(self):
    '''Determine whether the target realignment sufficiently covers the query.

    The top hit must span the whole query, or there must be a single blat
    result covering at least 90 percent of the query.
    '''
    indel_hit = self.blat_results[0][3].spans_query() or (len(self.blat_results) == 1 and self.get_query_coverage() >= 90.0)
    utils.log(self.logging_name, 'debug', 'Checking if query is a target hit or not %r' % indel_hit)
    return indel_hit
Example 11
    def set_params(self):

        """Organize and format all input parameters into class variables to access
        later. Specific instances of parameters are checked and set. All the parameters that are
        set are logged. The target objects are set along with the paths.

        Args:
            None
        Returns:
            None
        Raises:
            None
        """

        log_msgs = self.parse_opts()  # Parse the config file and command line parameters into the self.opts dictionary.
        utils.setup_logger(self.get_param('analysis_dir', True), 'breakmer')  # Create logging object.
        # Log the parameter setup after the logging object is created.
        utils.log(self.logging_name, 'info', 'Setting up parameters')

        # Setup the logger first before checking required params - log if there is a missing parameter.
        self.check_required_params()

        # Log all parameters passed in, warn for poor paths
        for param_key, param_value in self.opts.items():
            utils.log(self.logging_name, 'info', '%s = %s' % (param_key, param_value))

        # Log parameter overwritten by configuration file input values.
        for log_msg in log_msgs:
            utils.log(self.logging_name, log_msg[0], log_msg[1])

        self.set_targets()
        self.gene_annotations = Anno()
        self.gene_annotations.add_genes(self.get_param('gene_annotation_file'))

        self.paths['ref_data'] = os.path.abspath(os.path.normpath(self.opts['reference_data_dir']))  # Path to target reference sequence fasta files.
        self.set_param('reference_fasta_dir', os.path.split(self.opts['reference_fasta'])[0])  # Path to genome fasta file.

        # Setup directories
        self.paths['analysis'] = os.path.abspath(os.path.normpath(self.opts['analysis_dir']))
        self.paths['output'] = os.path.join(self.paths['analysis'], 'output')
        if 'targets_dir' in self.opts:
            self.paths['targets'] = os.path.abspath(os.path.normpath(self.opts['targets_dir']))
        else:
            self.paths['targets'] = os.path.join(self.paths['analysis'], 'targets')

        # Create all the paths.
        for path in self.paths:
            utils.log(self.logging_name, 'info', 'Creating %s directory (%s)' % (path, self.paths[path]))
            if not os.path.exists(self.paths[path]):
                os.makedirs(self.paths[path])

        # If starting the blat server then return.
        if self.fnc_cmd == 'start_blat_server':
            utils.log(self.logging_name, 'info', 'Starting the blat server.')
            return

        self.check_binaries()  # Check if Jellyfish and Cutadapt work.
        # self.filter = resultfilter.ResultFilter(self.get_param('filterList'), self)  # Instantiate the filter class.
        self.set_insertsize_thresh()  # Set the expected insert size threshold from the properly mapped read pairs.
Example 12
 def minseq_complexity(self, seq, N):
   utils.log(self.logging_name, 'debug', 'Checking sequence complexity of blat result segment %s using %d-mers' % (seq, N))
   nmers = {}
   total_possible = len(seq) - (N - 1)  # Number of N-mers in seq.
   for i in range(len(seq) - (N - 1)):
     nmers[str(seq[i:i+N]).upper()] = True
   complexity = round((float(len(nmers))/float(total_possible))*100,4)
   utils.log(self.logging_name, 'debug', 'Complexity measure %f, based on %d unique %d-mers observed out of a total of %d %d-mers possible' % (complexity, len(nmers), N, total_possible, N))
   return complexity
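Pulled out of its class, the same measure is easy to exercise directly. A self-contained version (a sketch; the method above logs through utils and lives on a class):

    def minseq_complexity(seq, n):
        # Percent of unique n-mers among all n-mers observed in seq.
        nmers = set(seq[i:i + n].upper() for i in range(len(seq) - n + 1))
        total_possible = len(seq) - (n - 1)
        return round(100.0 * len(nmers) / total_possible, 4)

    print(minseq_complexity("ACACACACAC", 3))  # 25.0: only ACA and CAC among 8 3-mers
    print(minseq_complexity("ACGTAGCTTG", 3))  # 100.0: all 8 3-mers are unique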
Example 13
    def set_sample_kmers(self):
        """Set the sample kmers
        """

        utils.log(self.loggingName, 'info', 'Indexing kmers for sample sequence %s' % self.files['sv_cleaned_fq'])
        self.kmers['case'] = {}
        self.kmers['case_sc'] = {}
        self.get_kmers(self.files['sv_cleaned_fq'], self.kmers['case'])
        self.get_kmers(self.files['sv_sc_unmapped_fa'], self.kmers['case_sc'])
Example 14
    def get_target_intervals(self, targetName):
        """Return the stored intervals for a specific target.
        """

        if targetName in self.targets:
            return self.targets[targetName]
        else:
            utils.log(self.loggingName, 'debug', '%s target name not in target dictionary.' % targetName)
            sys.exit(1)
Example 15
    def finalize_contigs(self):

        '''Assign IDs to the assembled contigs and write their support files.
        '''

        utils.log(self.logging_name, 'info', 'Finalizing %d assembled contigs' % len(self.contigs))
        for contig_iter, assembled_contig in enumerate(self.contigs):
            utils.log(self.logging_name, 'info', 'Finalizing contig %s' % assembled_contig.seq.value)
            contig_id = self.name + '-contig' + str(contig_iter + 1)
            assembled_contig.write_contig_values(contig_id, self.files['kmer_clusters'], self.paths['contigs'])
Example 16
    def write_result(self, svEventResult, outputPath):
        resultFn = os.path.join(self.path, self.id + "_svs.out")
        utils.log(self.loggingName, 'info', 'Writing %s result file %s' % (self.id, resultFn))
        resultFile = open(resultFn, 'w')

        # A string of output values for writing to file.
        headerStr, formattedResultValuesStr = svEventResult.get_formatted_output_values()
        resultFile.write(headerStr + '\n' + formattedResultValuesStr + '\n')
        resultFile.close()
        shutil.copyfile(resultFn, os.path.join(outputPath, self.id + "_svs.out"))
Example 17
    def set_targets(self):

        """Parse the targets bed file and store them in a dictionary. Limit to a gene
        list if input.

        A list of genes can be passed in by the user to limit the analysis. This will
        limit which targets are stored in the dictionary as the target bed file is parsed.
        The target bed file is a tab-delimited text file that should have at minimum,
        four columns (chromosome, start, end, name) with an optional fourth column
        containing a coding feature (i.e., exon or intron). Each row is either a tiled
        region with sequencing coverage or it is just a region to analyze by BreaKmer.
        
        The name can be applied to multiple rows, and if multiple tiled regions are input
        with the same name they are aggregated together under the same key.
        
        Store the target information in the self.target dictionary with the name as the key
        and a list of tuples of interval genomic locations as the values.

        self.target[gene_name] = [(chrom, start_bp, end_bp, name, feature),...]

        Args:
            None
        Returns:
            None
        Raises:
            None
        """

        # Get the gene list file path if it exists.
        gene_list = self.get_param('gene_list')
        region_list = None
        if gene_list:
            region_list = []
            # Each line contains a gene name.
            for line in open(gene_list, 'r'):
                region_list.append(line.strip().upper())

        utils.log(self.logging_name, 'info', 'Parsing target list')

        # TODO: Check to make sure there aren't duplicate genes.
        with open(self.get_param('targets_bed_file'), 'rU') as target_bed_file:
            for target in target_bed_file:
                # Each target is formatted like a bed, chr bp1 bp2 name
                target = target.strip()
                targetsplit = target.split()
                chrm, bp1, bp2, name = targetsplit[0:4]
                if region_list:
                    if name.upper() not in region_list:
                        continue
                # Allow a fifth column containing indication of what type of region it is.
                # Typically exon/intron designation. This will be deprecated.
                feature = None if len(targetsplit) <= 4 else targetsplit[4]
                self.targets.setdefault(name.upper(), [])
                self.targets[name.upper()].append((chrm, int(bp1), int(bp2), name, feature))
        utils.log(self.logging_name, 'info', '%d targets' % len(self.targets))
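As a concrete illustration of the expected input and output, two tab-delimited rows sharing a name aggregate under a single key (the coordinates and gene name below are hypothetical):

    targets = {}
    rows = ["chr7\t55241614\t55241736\tEGFR\texon",
            "chr7\t55242415\t55242513\tEGFR\texon"]
    for row in rows:
        fields = row.split()
        chrm, bp1, bp2, name = fields[0:4]
        feature = None if len(fields) <= 4 else fields[4]
        targets.setdefault(name.upper(), [])
        targets[name.upper()].append((chrm, int(bp1), int(bp2), name, feature))
    print(targets['EGFR'])  # both interval tuples stored under the one key 'EGFR'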
Example 18
    def which_rearr(self, disc_reads, brkpts, strands, tcoords, qcoords):

        '''Classify a non-overlapping split realignment as an inversion, tandem
        duplication, deletion, or insertion, counting supporting discordant read pairs.
        '''

        rearr_values = {'disc_read_count': None, 'sv_type': 'rearrangement', 'sv_subtype': None, 'hit': False}
        if not self.check_overlap(tcoords[0], tcoords[1]):
            utils.log(self.logging_name, 'debug', 'Checking rearrangement svType, strand1 %s, strand2 %s, breakpt1 %d, breakpt2 %d' % (strands[0], strands[1], brkpts[0], brkpts[1]))
            if strands[0] != strands[1]:
                # Inversion
                # Get discordantly mapped read-pairs
                utils.log(self.logging_name, 'debug', 'Inversion event identified.')
                rearr_values['hit'] = True
                rearr_values['sv_subtype'] = 'inversion'
                rearr_values['disc_read_count'] = 0
                brkpt1 = min(brkpts)
                brkpt2 = max(brkpts)
                bp_buffer = 20
                for read_pair in disc_reads['inv']:
                    r1p, r2p, r1s, r2s, qname = read_pair
                    if r1s == 1 and r2s == 1:
                        if (r1p <= (brkpt1 + bp_buffer)) and (r2p <= (brkpt2 + bp_buffer) and r2p >= (brkpt1 - bp_buffer)):
                            rearr_values['disc_read_count'] += 1
                    else:
                        if (r1p <= (brkpt2 + bp_buffer) and r1p >= (brkpt1 - bp_buffer)) and r2p >= (brkpt2 - bp_buffer):
                            rearr_values['disc_read_count'] += 1 
            elif strands[0] == strands[1]:
                tgap = brkpts[1] - brkpts[0]
                qgap = qcoords[1][0] - qcoords[0][1]
                brkpt1 = min(brkpts)
                brkpt2 = max(brkpts)
                bp_buffer = 20
                if tgap < 0:
                    utils.log(self.logging_name, 'debug', 'Tandem duplication event identified.')
                    rearr_values['hit'] = True
                    rearr_values['sv_subtype'] = 'tandem_dup'
                    rearr_values['disc_read_count'] = 0
                    for read_pair in disc_reads['td']:
                        r1p, r2p, r1s, r2s, qname = read_pair
                        disc_read_check = (r1p >= (brkpt1 - bp_buffer) and r1p <= (brkpt2 + bp_buffer)) and (r2p <= (brkpt2 + bp_buffer) and r2p >= (brkpt1 - bp_buffer))
                        if disc_read_check:
                            rearr_values['disc_read_count'] += 1
                elif tgap > qgap:
                    # Gapped deletion from Blast result
                    utils.log(self.logging_name, 'debug', 'Deletion event identified.')
                    rearr_values['hit'] = True
                    rearr_values['sv_type'] = 'indel'
                    rearr_values['indelSize'] = 'D' + str(tgap)
                else:
                    # Gapped insertion from Blast result
                    utils.log(self.logging_name, 'debug', 'Insertion event identified.')
                    rearr_values['hit'] = True
                    rearr_values['sv_type'] = 'indel'
                    rearr_values['indelSize'] = 'I' + str(qgap)
        return rearr_values
Example 19
 def check_read_strands(self):
   same_strand = False
   strands = []
   for read in self.contig_reads:
     strand = read.id.split("/")[1] 
     strands.append(strand)
   if len(set(strands)) == 1:
     same_strand = True
   utils.log(self.logging_name, 'debug', 'Checking read strands for contig reads %s' % (",".join([read.id for read in self.contig_reads])))
   utils.log(self.logging_name, 'debug', 'Reads are on same strand: %r' % same_strand)
   return same_strand
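The "strand" here is read off the /1 or /2 suffix of the read ID, i.e. the mate designation in the read name. The check reduces to a one-liner over toy IDs (hypothetical read names):

    def reads_same_strand(read_ids):
        # True when every read ID carries the same /1 or /2 suffix.
        return len(set(read_id.split('/')[1] for read_id in read_ids)) == 1

    print(reads_same_strand(['read1/1', 'read2/1', 'read3/1']))  # True
    print(reads_same_strand(['read1/1', 'read2/2']))             # False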
Example 20
 def filter_rearr(self, query_region, params, brkpts, brkpt_counts, brkpt_kmers, rearr_type, disc_read_count):
   '''Apply the basic rearrangement filters and return True when the event should be filtered.
   '''
   in_ff, span_ff = utils.filter_by_feature(brkpts, query_region, params.get_param('keep_intron_vars'))
   filter_result = (min(brkpt_counts['n']) < params.get_param('rearr_sr_thresh')) or self.br_sorted[0][1] < params.get_param('rearr_minseg_len') or (in_ff and span_ff) or (disc_read_count < 1) or (rearr_type == 'rearrangement') or (min(brkpt_kmers) == 0)
   utils.log(self.logging_name, 'info', 'Check filter for rearrangement')
   utils.log(self.logging_name, 'info', 'Filter by feature for being in exon (%r) or spanning exon (%r)' % (in_ff, span_ff))
   utils.log(self.logging_name, 'info', 'Split read threshold %d, breakpoint read count %d' % (params.get_param('rearr_sr_thresh'), min(brkpt_counts['n'])))
   utils.log(self.logging_name, 'info', 'Minimum segment length observed (%d) meets threshold (%d)' % (self.br_sorted[0][1], params.get_param('rearr_minseg_len')))
   utils.log(self.logging_name, 'info', 'Minimum discordant read pairs for rearrangement (%d)' % disc_read_count)
   return filter_result
Example 21
    def format_rearrangement_values(self, svEvent):
        """ """
        utils.log(self.loggingName, 'info', 'Resolving SVs call from blat results')
        # Sort the stored blat results by the number of matches to the reference sequence.
        blatResSorted = sorted(svEvent.blatResults, key=lambda x: x[0])
        resultValid = {'valid': True, 'repeatValid': True}
        maxRepeat = 0.0

        self.totalMatching = []
        self.repeatOverlapPercent = []
        self.realignmentUniqueness = []
        self.genes = []
        self.alignCigar = []
        self.strands = []
        self.totalMismatches = []

        for i, blatResultTuple in enumerate(blatResSorted):
            blatResult = blatResultTuple[1]
            resultValid['valid'] = resultValid['valid'] and blatResult.valid
            maxRepeat = max(maxRepeat, blatResult.repeat_overlap)
            self.repeatOverlapPercent.append(blatResult.repeat_overlap)
            self.realignmentUniqueness.append(blatResult.alignFreq)
            self.totalMatching.append(blatResult.get_nmatch_total())
            self.genes.append(blatResult.get_gene_anno())
            self.alignCigar.append(blatResult.cigar)
            self.strands.append(blatResult.strand)
            self.totalMismatches.append(blatResult.get_nmatches('mismatch'))
            svEvent.brkpts.update_brkpt_info(blatResult, i, i == (len(blatResSorted) - 1))

        # Sort the blatResultsSorted list by the lowest matching result to the highest matching result
        svEvent.blatResultsSorted = sorted(svEvent.blatResultsSorted, key=lambda x: x[1])
        if svEvent.brkpts.diff_chr():
            # translocation event
            svEvent.set_brkpt_counts('trl')
            self.discReadCount = svEvent.get_disc_read_count()
            self.svType = 'rearrangement'
            self.svSubtype = 'trl'
            self.filterValues.set_trl_values(svEvent)
        else:
            svEvent.set_brkpt_counts('rearr')
            self.svType, self.svSubtype, self.discReadCount = svEvent.define_rearr()
            self.genes = list(set(self.genes))
            self.description = svEvent.rearrDesc
            self.filterValues.set_rearr_values(svEvent)
        self.realignmentUniqueness = self.filterValues.realignFreq
        self.targetName = svEvent.contig.get_target_name()
        self.fullBreakpointStr = svEvent.get_brkpt_str('all')
        self.targetBreakpointStr = svEvent.get_brkpt_str('target')
        self.breakpointCoverageDepth = svEvent.get_brkpt_depths()
        self.splitReadCount = svEvent.get_splitread_count()
        self.contigSeq = svEvent.get_contig_seq()
        self.contigId = svEvent.get_contig_id()
Example 22
    def call_svs(self):
        """ """

        if not self.realignment.has_results():
            utils.log(self.loggingName, 'info', 'No blat results file exists, no calls for %s.' % self.contig.get_id())
        else:
            utils.log(self.loggingName, 'info', 'Making variant calls from blat results %s' % self.realignment.get_result_fn())
            if self.check_indels():
                self.svEvent.format_indel_values()
            elif self.check_svs():
                self.svEvent.format_rearr_values()
        return self.svEvent
Example 23
    def set_params(self, arguments):
        """Organize and format all input parameters into class variables to access
        later. Specific instances of parameters are checked and set. All parameters that are
        set are logged. The target objects are set along with the paths.

        Args:
            arguments (dict): The argparse dictionary object from the command line options.
        Returns:
            None
        Raises:
            None
        """

        self.parse_opts(arguments)  # Parse the config file and command line parameters into the self.opts dictionary.
        utils.setup_logger(self.get_param('analysis_dir', True), 'breakmer')  # Create logging object.
        utils.log(self.loggingName, 'info', 'Setting up parameters')

        # Log all parameters passed in, warn for poor paths
        for paramKey, paramValue in self.opts.items():
            utils.log(self.loggingName, 'info', '%s = %s' % (paramKey, paramValue))

        self.set_targets()
        self.paths['ref_data'] = os.path.abspath(os.path.normpath(self.opts['reference_data_dir']))  # Path to target reference sequence fasta files.
        self.set_param('reference_fasta_dir', os.path.split(self.opts['reference_fasta'])[0])  # Path to genome fasta file.

        # If only preparing the reference data, there is no need to continue.
        if self.fncCmd == 'prepare_reference_data':
            self.set_insertsize_thresh()  # Set the expected insert size threshold from the properly mapped read pairs.
            utils.log(self.loggingName, 'info', 'Prepare reference data option set! Only the reference data directory will be set up.')
            return

        # Setup directories
        self.paths['analysis'] = os.path.abspath(os.path.normpath(self.opts['analysis_dir']))
        self.paths['output'] = os.path.join(self.paths['analysis'], 'output')
        if 'targets_dir' in self.opts:
            self.paths['targets'] = os.path.abspath(os.path.normpath(self.opts['targets_dir']))
        else:
            self.paths['targets'] = os.path.join(self.paths['analysis'], 'targets')

        # Create all the paths.
        for path in self.paths:
            utils.log(self.loggingName, 'info', 'Creating %s directory (%s)' % (path, self.paths[path]))
            if not os.path.exists(self.paths[path]):
                os.makedirs(self.paths[path])

        # If starting the blat server then return.
        if self.fncCmd == 'start_blat_server':
            utils.log(self.loggingName, 'info', 'Starting the blat server.')
            return

        self.check_binaries()  # Check if Jellyfish and Cutadapt work.
        self.filter = resultfilter.ResultFilter(self.get_param('filterList'), self)  # Instantiate the filter class.
        self.set_insertsize_thresh()  # Set the expected insert size threshold from the properly mapped read pairs.
Example 24
    def add_regions(self, regions_bed_fn):

        '''Add genomic regions from a bed file to the gene dictionary.
        '''

        with open(regions_bed_fn, 'rU') as region_f:
            region_lines = region_f.readlines()
        for line in region_lines:
            line = line.strip()
            chrom, start, end, name = line.split()
            if name not in self.genes:
                self.genes[name] = [chrom, int(start), int(end)]
        utils.log(self.logging_name, 'info', 'Adding in %d other target regions' % len(region_lines))
Example 25
    def write_contig_values(self, contig_id, cluster_fn, target_contig_path):
        '''Write the contig cluster, fastq, and fasta files to the contig directory.
        '''

        self.contig_id = contig_id
        self.file_path = os.path.join(target_contig_path, contig_id)

        utils.log(self.logging_name, 'info',
                  'Setting up contig path %s' % self.file_path)
        if not os.path.exists(self.file_path):
            os.makedirs(self.file_path)
        self.write_cluster_file(cluster_fn)
        self.write_read_fq(os.path.join(self.file_path, contig_id + ".fq"))
        self.write_contig_fa()
Example 26
 def get_seq_complexity(self):
     """Get the 3-mer complexity of the shortest aligned blat sequence.
     """
     blatResult, nBasesAligned = self.blatResultsSorted[0]
     alignedSeq = self.contig.seq[blatResult.qstart():blatResult.qend()]
     merSize = 3
     utils.log(self.loggingName, 'debug', 'Checking sequence complexity of blat result segment %s using %d-mers' % (alignedSeq, merSize))
     nmers = {}
     totalMersPossible = len(alignedSeq) - (merSize - 1)  # Number of merSize-mers in the aligned sequence.
     for i in range(len(alignedSeq) - (merSize - 1)):
         nmers[str(alignedSeq[i:i + merSize]).upper()] = True
     complexity = round((float(len(nmers)) / float(totalMersPossible)) * 100, 4)
     utils.log(self.loggingName, 'debug', 'Complexity measure %f, based on %d unique %d-mers observed out of a total of %d %d-mers possible' % (complexity, len(nmers), merSize, totalMersPossible, merSize))
     return complexity
Example 27
    def write_output(self):

        '''Write the formatted SV results to the analysis output file.
        '''

        res_fn = os.path.join(self.params.paths['output'], self.params.opts['analysis_name'] + "_svs.out")
        utils.log(self.logging_name, 'info', 'Writing results to file: %s' % res_fn)
        result_file = open(res_fn, 'w')
        header = "\t".join(['genes', 'target_breakpoints', 'mismatches', 'strands', 'total_matching', 'sv_type', 'sv_subtype', 'split_read_count', 'disc_read_count', 'breakpoint_coverages', 'contig_id', 'contig_seq']) + "\n"
        result_file.write(header)

        for result_str in self.results:
            result_file.write(result_str)
        result_file.close()
Example 28
    def filter_indel(self, svEvent):
        """ """
        indelSizeThresh = int(self.params.get_param('indel_size'))
        utils.log(self.loggingName, 'info', 'Checking if blat result contains an indel variant')
        blatResult = svEvent.blatResults[0][1]
        keep_br = blatResult.valid and blatResult.alignFreq < 2 and blatResult.in_target and (blatResult.indel_maxevent_size[0] >= indelSizeThresh)
        utils.log(self.loggingName, 'debug', 'Keep blat result %r' % keep_br)

        # Determine the uniqueness of the realignment.
        svFilterValues = svEvent.resultValues.filterValues
        uniqRealignment = svFilterValues.realignFreq < 2
        indelSize = svFilterValues.maxEventSize >= indelSizeThresh
        brkptCoverages = svFilterValues.brkptCoverages[0] >= self.params.get_sr_thresh('indel')
        minFlankMatches = min(svFilterValues.flankMatchPercents) >= 10.0

        if uniqRealignment and indelSize and brkptCoverages and minFlankMatches:
            utils.log(self.loggingName, 'debug', 'Indel meets basic filtering requirements.')
        else:
            utils.log(self.loggingName, 'debug', 'Indel filtered due to non-unique realignment (%r), less than input size threshold (%r), low coverage at breakpoints (%r), or contig edge realignment not long enough (%r), filter status set to True.' % (uniqRealignment, indelSize, brkptCoverages, minFlankMatches))
            filterReasons = []
            if not uniqRealignment:
                filterReasons.append('Non-unique realignment (%d) > 2' % svFilterValues.realignFreq)
            if not indelSize:
                filterReasons.append('Max indel size (%d) is less than %d' % (svFilterValues.maxEventSize, indelSizeThresh))
            if not brkptCoverages:
                filterReasons.append('Minimum coverage at breakpoints (%d) less than input threshold %d' % (svFilterValues.brkptCoverages[0], self.params.get_sr_thresh('indel')))
            if not minFlankMatches:
                filterReasons.append('Minimum percentage of contig sequence that realigns to the reference to the left or right of the indel event less than 10.0 percent (%d)' % min(svFilterValues.flankMatchPercents))
            svEvent.set_filtered(','.join(filterReasons))
Example 29
    def align(self, alignParams, scope):
        """
        """
        self.alignParams = alignParams
        alignProgram, alignExt, alignBinary, binaryParams, alignRef = self.alignParams
        self.scope = scope

        self.resultFn = os.path.join(self.contig.get_path(), '%s_res.%s.%s' % (alignProgram, scope, alignExt))
        utils.log(self.loggingName, 'info', 'Running realignment with %s, storing results in %s' % (alignProgram, self.resultFn))

        cmd = ''
        if alignProgram == 'blast':
            cmd = "%s -task 'blastn-short' -db %s -query %s -evalue 0.01 -out %s -outfmt '7 qseqid sseqid pident qlen length mismatch gapopen qstart qend sstart send evalue bitscore gaps sstrand qseq sseq'" % (alignBinary, alignRef, self.contig.meta.fa_fn, self.resultFn)
        elif alignProgram == 'blat':
            if scope == 'genome':
                # Query the gfClient blat server against the whole genome.
                cmd = '%s -t=dna -q=dna -out=psl -minScore=20 -nohead %s %d %s %s %s' % (alignBinary, binaryParams['hostname'], binaryParams['port'], alignRef, self.contig.meta.fa_fn, self.resultFn)
            elif scope == 'target':
                # Standalone blat against the extracted target reference.
                cmd = '%s -t=dna -q=dna -out=psl -minScore=20 -stepSize=10 -minMatch=2 -repeats=lower -noHead %s %s %s' % (alignBinary, alignRef, self.contig.meta.fa_fn, self.resultFn)

        utils.log(self.loggingName, 'info', 'Realignment system command %s' % cmd)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        output, errors = p.communicate()
        utils.log(self.loggingName, 'info', 'Realignment output file %s' % self.resultFn)
        if errors != '':
            utils.log(self.loggingName, 'info', 'Realignment errors %s' % errors)

        if not os.path.isfile(self.resultFn):
            return False
        else:
            self.results = AlignResults(alignProgram, scope, self.resultFn, self.contig, alignRef)
            return True
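Stripped down, the pattern here is: build a command string, run it through a shell, surface stderr, and treat a missing output file as failure. A minimal sketch with a stand-in command (the echo command and file names are placeholders, not a real realignment):

    import os
    import subprocess

    def run_and_check(cmd, result_fn):
        # Run a shell command and report whether it produced result_fn.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, shell=True)
        output, errors = proc.communicate()
        if errors:
            print('command reported errors: %s' % errors)
        return os.path.isfile(result_fn)

    print(run_and_check('echo dummy_hit > result.psl', 'result.psl'))  # True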
Example 30
    def write_contig_values(self, contig_id, cluster_fn, target_contig_path):

        '''Write the contig cluster, fastq, and fasta files to the contig directory.
        '''

        self.contig_id = contig_id
        self.file_path = os.path.join(target_contig_path, contig_id)

        utils.log(self.logging_name, 'info', 'Setting up contig path %s' % self.file_path)
        if not os.path.exists(self.file_path):
            os.makedirs(self.file_path)
        self.write_cluster_file(cluster_fn)
        self.write_read_fq(os.path.join(self.file_path, contig_id + ".fq"))
        self.write_contig_fa()
Example 31
    def target_hit(self):
        """ """
        if self.hasResults:
            cond1 = self.results[0].spans_query() and (self.ngaps > 0)
            cond2 = (len(self.results) == 1) and self.get_query_coverage() >= 90.0 and (self.ngaps > 0)
            # cond3 = (len(self.results) > 1) and (self.get_query_coverage() >= 90.0)
            self.targetHit = cond1 or cond2  # or cond3
        utils.log(self.loggingName, 'debug', 'Checking if query is a target hit or not %r' % self.targetHit)

        if self.targetHit:
            if ((len(self.results) > 1) and (self.get_query_coverage() >= 90.0)) and (self.program == 'blast'):
                # Check for a gapped Blast result.
                segments = []
                for i, result in enumerate(self.results):
                    resultOverlap = 0
                    addSegment = True
                    for segment in segments:
                        overlapSeg = (result.qstart() >= segment[0] and result.qstart() <= segment[1]) or (result.qend() >= segment[0] and result.qend() <= segment[1])
                        containSeg = result.qstart() >= segment[0] and result.qend() >= segment[1]
                        withinSeg = result.qstart() >= segment[0] and result.qend() <= segment[1]
                        if containSeg or withinSeg:
                            addSegment = False
                        elif overlapSeg:
                            if (result.qstart() >= segment[0] and result.qstart() <= segment[1]):
                                overlapBp = segment[1] - result.qstart()
                                if overlapBp < 20:
                                    resultOverlap += overlapBp
                                    addSegment = True
                            elif (result.qend() >= segment[0] and result.qend() <= segment[1]):
                                overlapBp = result.qend() - segment[0]
                                if overlapBp < 20:
                                    resultOverlap += overlapBp
                                    addSegment = True
                    if addSegment and (result.get_query_span() - resultOverlap) > 20:
                        segments.append((result.qstart(), result.qend(), result))
                self.targetSegmentsSorted = sorted(segments, key=lambda x: x[0])
                for i in range(1, len(self.targetSegmentsSorted)):
                    lResult = self.targetSegmentsSorted[i - 1][2]
                    rResult = self.targetSegmentsSorted[i][2]
                    qgap = rResult.qstart() - lResult.qend()
                    tgap = rResult.tstart() - lResult.tend()
                    if (tgap < 0 and (abs(tgap) > abs(qgap))) or (lResult.strand != rResult.strand):
                        # Tandem dup or inversion
                        break
                    else:
                        self.mergedRecords.append((i - 1, i))
        return self.targetHit
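The segment bookkeeping above is dense. The core test per candidate alignment is whether an already-accepted segment contains it, or overlaps it by too many bases to count as new query coverage. A simplified toy version over plain (qstart, qend) tuples (a hypothetical helper; the real method also merges colinear segments afterwards):

    def accept_segment(candidate, accepted, max_overlap=20):
        # Accept a (qstart, qend) interval unless an accepted one contains it
        # or overlaps it by max_overlap or more base pairs.
        cs, ce = candidate
        for s, e in accepted:
            if cs >= s and ce <= e:       # contained within an accepted segment
                return False
            overlap = min(ce, e) - max(cs, s)
            if overlap >= max_overlap:    # too much overlap to add new coverage
                return False
        return True

    segments = [(0, 60)]
    print(accept_segment((50, 120), segments))  # True: only 10 bp of overlap
    print(accept_segment((10, 40), segments))   # False: fully contained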
Example 32
    def check_read_strands(self, sv_result):

        '''Check whether all reads supporting the contig come from the same strand.
        '''

        same_strand = False
        strands = []
        for read in sv_result.contig.reads:
            strand = read.id.split("/")[1]
            strands.append(strand)
        if len(set(strands)) == 1:
            same_strand = True
        utils.log(self.logging_name, 'debug', 'Checking read strands for contig reads %s' % (",".join([read.id for read in sv_result.contig.reads])))
        utils.log(self.logging_name, 'debug', 'Reads are on same strand: %r' % same_strand)
        return same_strand
Example 33
    def resolve_sv_calls(self, contigs, target_ref_fn, target_region_values, disc_reads):

        '''Realign each assembled contig and filter the resulting SV calls.
        '''
        sv_results = []
        for assembled_contig in contigs:
            utils.log(self.logging_name, 'info', 'Assessing contig %s' % assembled_contig.seq.value)
            realignment_set = self.align_manager.realignment(assembled_contig, target_ref_fn, target_region_values)
            sv_result = results.SVResult(self.make_call(assembled_contig, target_region_values, realignment_set), self.params.get_param('sample_bam_file'), assembled_contig, target_region_values, disc_reads)
            if not sv_result.filter:
                self.filter_manager.filter_result(sv_result)
            if not sv_result.filter:
                sv_results.append(sv_result)
        return sv_results
Example 34
    def make_call(self, contig, region_values, realignment_result_set):

        '''Determine the variant event (indel or SV) from the realignment results.
        '''

        sv_event = None
        if not realignment_result_set.has_results:
            utils.log(self.logging_name, 'info', 'No blat results file %s, no calls for %s.' % (self.align_manager.query_res_fn, contig.contig_id))
        else:
            utils.log(self.logging_name, 'info', 'Making variant calls from blat results %s' % self.align_manager.query_res_fn)
            if realignment_result_set.has_indel():
                sv_event = realignment_result_set.sv_event
            elif realignment_result_set.check_svs():
                sv_event = realignment_result_set.sv_event
        return sv_event
Example 35
    def run(self):
        """Create and analyze the target regions.
        The target objects are made and grouped for multiprocessing (if set)
        and these are all analyzed independently. This is where the analysis
        starts and ends.

        Args:
            None
        Returns:
            None
        """

        startTime = time.clock()  # Track the run time.

        self.params.start_blat_server()
        if self.params.fncCmd == 'start_blat_server':
            print 'Server started!'
            return

        targetAnalysisList = self.create_targets()

        aggResults = {'contigs': [], 'discreads': []}  # Buffer the formatted output strings for each target to write out in batch.
        nprocs = int(self.params.get_param('nprocs'))
        if nprocs > 1:  # Make use of multiprocessing by mapping targets to n jobs.
            utils.log(self.loggingName, 'info', 'Creating all reference data.')
            p = multiprocessing.Pool(nprocs)
            multiprocResults = []
            for targetList in targetAnalysisList:
                multiprocResults.append(p.apply_async(analyze_targets, (targetList, )))
            wait(multiprocResults)
            for multiprocResult in multiprocResults:
                a = multiprocResult.get()
                aggResults['contigs'].extend(a['contigs'])
                aggResults['discreads'].extend(a['discreads'])
        else:
            aggResults = analyze_targets(targetAnalysisList)

        if self.params.fncCmd == 'prepare_reference_data':
            print 'Reference data setup!'
            return

        self.write_aggregated_output(aggResults)
        utils.log(self.loggingName, 'info', 'Analysis complete in %s' % str(time.clock() - startTime))

        if not self.params.get_param('keep_blat_server'):  # Stop the blat server unless keep_blat_server is specified.
            cmd = '%s stop %s %d' % (self.params.get_param('gfserver'), self.params.get_param('blat_hostname'), int(self.params.get_param('blat_port')))
            os.system(cmd)
        print 'Analysis complete!'
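The multiprocessing dispatch is the standard Pool.apply_async pattern: submit one job per batch of targets, wait, then merge the per-batch result dictionaries. A self-contained sketch (the worker and target names are stand-ins for analyze_targets and real TargetManager batches):

    import multiprocessing

    def analyze_batch(target_names):
        # Stand-in worker returning results keyed like aggResults.
        return {'contigs': ['%s:contig_call' % name for name in target_names],
                'discreads': []}

    if __name__ == '__main__':
        batches = [['TP53', 'EGFR'], ['KRAS']]
        pool = multiprocessing.Pool(2)
        async_results = [pool.apply_async(analyze_batch, (batch,)) for batch in batches]
        pool.close()
        pool.join()
        agg = {'contigs': [], 'discreads': []}
        for res in async_results:
            out = res.get()
            agg['contigs'].extend(out['contigs'])
            agg['discreads'].extend(out['discreads'])
        print(agg['contigs'])  # calls from both batches, merged in order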
Example 36
    def write_results(self):

        '''Write the formatted SV results for this target region to its output file.
        '''

        res_fn = os.path.join(self.paths['output'], self.name + "_svs.out")
        result_file = open(res_fn, 'w')
        header = "\t".join(['genes', 'target_breakpoints', 'mismatches', 'strands', 'total_matching', 'sv_type', 'sv_subtype', 'split_read_count', 'disc_read_count', 'breakpoint_coverages', 'contig_id', 'contig_seq']) + "\n"
        result_file.write(header)

        utils.log(self.logging_name, 'info', 'Writing results to file: %s' % res_fn)
        for res in self.results:
            formatted_result_str = res.get_output_string()
            result_file.write(formatted_result_str)
            self.formatted_results.append(formatted_result_str)
        result_file.close()
Example 37
    def target_aligned(self):
        """
        """
        noAlignmentResults = False
        targetHit = False
        if self.results is None:
            noAlignmentResults = True
        else:
            self.results.modify_blat_result_file()
            if self.results.target_hit():
                self.targetHit = True
                utils.log(self.loggingName, 'debug', 'Top hit contains whole query sequence, indicating an indel variant within the target region.')

        # If there was a sufficient target hit or no alignment at all then return True
        # this effectively prevents a genome alignment.
        return self.targetHit or noAlignmentResults
Example 38
    def filter_rearr(self, svEvent):
        """Apply the basic rearrangement filters to a called SV event."""
        svFilterValues = svEvent.resultValues.filterValues
        missingQueryCoverage = svFilterValues.missingQueryCoverage < self.params.get_min_segment_length('rearr')
        brkptCoverages = svFilterValues.brkptCoverages[0] >= self.params.get_sr_thresh('rearrangement')
        minSegmentLen = svFilterValues.minSegmentLen >= self.params.get_min_segment_length('rearr')
        minBrkptKmers = svFilterValues.minBrkptKmers > 0

        # All four checks must pass for the event to avoid being filtered.
        if missingQueryCoverage and brkptCoverages and minSegmentLen and minBrkptKmers:
            utils.log(self.loggingName, 'info', 'Rearrangement meets basic filtering requirements.')
        else:
            filteredReasons = []
            if not missingQueryCoverage:
                logMsg = 'No realignment for %d bases in the contig sequence, more than threshold %d.' % (svFilterValues.missingQueryCoverage, self.params.get_min_segment_length('rearr'))
                utils.log(self.loggingName, 'info', logMsg)
                filteredReasons.append(logMsg)
            if not brkptCoverages:
                logMsg = 'Minimum coverage at breakpoints (%d) less than input threshold %d.' % (svFilterValues.brkptCoverages[0], self.params.get_sr_thresh('rearrangement'))
                utils.log(self.loggingName, 'info', logMsg)
                filteredReasons.append(logMsg)
            if not minSegmentLen:
                logMsg = 'The minimum realigned segment length (%d) is less than the input threshold %d.' % (svFilterValues.minSegmentLen, self.params.get_min_segment_length('rearr'))
                utils.log(self.loggingName, 'info', logMsg)
                filteredReasons.append(logMsg)
            if not minBrkptKmers:
                logMsg = 'There were no variant kmers at one or more of the breakpoint locations.'
                utils.log(self.loggingName, 'info', logMsg)
                filteredReasons.append(logMsg)
            svEvent.set_filtered(','.join(filteredReasons))
Example 39
 def which_rearr(self, varReads, tcoords, qcoords, strands, brkpts):
     rearrValues = {'discReadCount': None, 'svType': 'rearrangement', 'svSubType': None, 'hit': False}
     if not self.check_overlap(tcoords[0], tcoords[1]):
         utils.log(self.loggingName, 'debug', 'Checking rearrangement svType, strand1 %s, strand2 %s, breakpt1 %d, breakpt2 %d' % (strands[0], strands[1], brkpts[0], brkpts[1]))
         if strands[0] != strands[1]:
             # Inversion: count discordantly mapped read pairs.
             utils.log(self.loggingName, 'debug', 'Inversion event identified.')
             rearrValues['hit'] = True
             rearrValues['svSubType'] = 'inversion'
             rearrValues['discReadCount'] = varReads.check_inv_readcounts(brkpts)
         elif strands[0] == strands[1]:
             tgap = brkpts[1] - brkpts[0]
             qgap = qcoords[1][0] - qcoords[0][1]
             if tgap < 0:
                 utils.log(self.loggingName, 'debug', 'Tandem duplication event identified.')
                 rearrValues['hit'] = True
                 rearrValues['svSubType'] = 'tandem_dup'
                 rearrValues['discReadCount'] = varReads.check_td_readcounts(brkpts)
             elif tgap > qgap:
                 # Gapped deletion from the blast result.
                 utils.log(self.loggingName, 'debug', 'Deletion event identified.')
                 rearrValues['hit'] = True
                 rearrValues['svType'] = 'indel'
                 rearrValues['indelSize'] = 'D' + str(tgap)
             else:
                 # Gapped insertion from the blast result.
                 utils.log(self.loggingName, 'debug', 'Insertion event identified.')
                 rearrValues['hit'] = True
                 rearrValues['svType'] = 'indel'
                 rearrValues['indelSize'] = 'I' + str(qgap)
     return rearrValues
Example 40
    def set_ref_data(self):

        '''Extract and write the target reference sequence fasta files, if needed.
        '''

        # Write rmask bed file if needed.
        # if not self.params.opts['keep_repeat_regions'] and 'repeat_mask_file' in self.params.opts:
        #   self.logger.info('Extracting repeat mask regions for target gene %s.' % self.name)
        #   self.repeat_mask = setup_rmask(self.get_values(), self.paths['ref_data'], self.params.opts['repeat_mask_file'])

        # Write reference fasta file if needed.
        for target_refseq_fn in self.files['target_ref_fn']:
            direction = "forward"
            if target_refseq_fn.find("forward") == -1:
                direction = "reverse"
            utils.log(self.logging_name, 'info', 'Extracting refseq sequence and writing %s' % target_refseq_fn)
            utils.extract_refseq_fa(self.get_values(), self.paths['ref_data'], self.params.get_param('reference_fasta'), direction, target_refseq_fn, self.params.get_param('buffer_size'))
Example 41
    def add_path(self, key, path):

        '''Utility function to store a file path and create the directory if it does not exist.

        Args:
            key (str):  String value to store the file path value.
            path (str): File path value.
        Returns:
            None
        Raises:
            None
        '''

        utils.log(self.logging_name, 'info', 'Creating %s %s path (%s)' % (self.name, key, path))
        self.paths[key] = path
        if not os.path.exists(self.paths[key]):
            os.makedirs(self.paths[key])
Example 42
    def compare_kmers(self):

        '''Subtract reference (and optional normal) k-mers from the sample k-mers
        to isolate sample-only k-mers, then assemble contigs from the supporting reads.
        '''

        self.kmers['ref'] = {}
        jellyfish = self.params.get_param('jellyfish')
        kmer_size = int(self.params.get_param('kmer_size'))

        for i in range(len(self.files['target_ref_fn'])):
            utils.log(self.logging_name, 'info', 'Indexing kmers for reference sequence %s' % self.files['target_ref_fn'][i])
            self.kmers['ref'] = utils.load_kmers(utils.run_jellyfish(self.files['target_ref_fn'][i], jellyfish, kmer_size), self.kmers['ref'])

        # if 'target_altref_fn' in self.files:
        #     for i in range(len(self.files['target_altref_fn'])):
        #         for j in range(len(self.files['target_altref_fn'][i])):
        #             utils.log(self.logging_name, 'info', 'Indexing kmers for reference sequence %s' % self.files['target_altref_fn'][i])
        #             self.kmers['ref'] = utils.load_kmers(utils.run_jellyfish(self.files['target_altref_fn'][i][j], jellyfish, kmer_size), self.kmers['ref'])

        utils.log(self.logging_name, 'info', 'Indexing kmers for sample sequence %s' % self.files['sv_cleaned_fq'])
        self.kmers['case'] = {}
        self.kmers['case'] = utils.load_kmers(utils.run_jellyfish(self.files['sv_cleaned_fq'], jellyfish, kmer_size), self.kmers['case'])
        self.kmers['case_sc'] = {}
        self.kmers['case_sc'] = utils.load_kmers(utils.run_jellyfish(self.files['sv_sc_unmapped_fa'], jellyfish, kmer_size), self.kmers['case_sc'])
        sc_mers = set(self.kmers['case'].keys()) & set(self.kmers['case_sc'])
        sample_only_mers = list(sc_mers.difference(set(self.kmers['ref'].keys())))

        if 'normal_bam_file' in self.params.opts:
            norm_kmers = {}
            norm_kmers = utils.load_kmers(utils.run_jellyfish(self.files['norm_cleaned_fq'], jellyfish, kmer_size), norm_kmers)
            sample_only_mers = set(sample_only_mers).difference(set(norm_kmers.keys()))

        sample_only_mers = list(sample_only_mers)

        # Write case only kmers out to file.
        self.files['sample_kmers'] = os.path.join(self.paths['kmers'], self.name + "_sample_kmers.out")
        sample_kmer_fout = open(self.files['sample_kmers'], 'w')

        self.kmers['case_only'] = {}
        for mer in sample_only_mers:
            sample_kmer_fout.write("\t".join([str(x) for x in [mer, str(self.kmers['case'][mer])]]) + "\n")
            self.kmers['case_only'][mer] = self.kmers['case'][mer]
        sample_kmer_fout.close()

        self.kmers['ref'] = {}
        self.kmers['case'] = {}
        self.kmers['case_sc'] = {}

        utils.log(self.logging_name, 'info', 'Writing %d sample-only kmers to file %s' % (len(self.kmers['case_only']), self.files['sample_kmers']))
        self.files['kmer_clusters'] = os.path.join(self.paths['kmers'], self.name + "_sample_kmers_merged.out")
        utils.log(self.logging_name, 'info', 'Writing kmer clusters to file %s' % self.files['kmer_clusters'])

        self.contigs = assembler.init_assembly(self.kmers['case_only'], self.cleaned_read_recs['sv'], kmer_size, int(self.params.get_param('trl_sr_thresh')), self.params.get_param('read_len'))
        self.cleaned_read_recs = None
        self.kmers['case_only'] = {}
        self.finalize_contigs()
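The core of compare_kmers is plain set arithmetic over kmer-count dictionaries. A minimal sketch with toy dictionaries; the kmer strings and counts are fabricated for illustration:

    # kmer -> count maps, in the shape load_kmers would return them.
    case = {'ACGTA': 12, 'TTTTT': 3, 'GGGCC': 7}      # all cleaned sample reads
    case_sc = {'ACGTA': 5, 'GGGCC': 2}                # soft-clipped/unmapped sample reads
    ref = {'GGGCC': 40}                               # target reference sequence

    sc_mers = set(case.keys()) & set(case_sc.keys())  # kmers supported by both sample sets
    sample_only_mers = sc_mers - set(ref.keys())      # drop kmers present in the reference
    print(sample_only_mers)  # {'ACGTA'}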
Example no. 43
    def write_aggregated_output(self, aggregateResults):
        """Write the SV calls to a top level file in the specified output directory.
        Header is written at the top of the file if option to remove is not
        specified.

        The output files are:
            <output_dir>/<analysis_name>_svs.out
            <output_dir>/<analysis_name>_discreads.out

        Args:
            aggregateResults (dict): A dictionary containing the formatted output string values.
        Returns:
            None
        """

        # Write assembled contig-based SV calls.
        if len(aggregateResults['contigs']) > 0:
            allResultFn = os.path.join(self.params.paths['output'], self.params.get_param('analysis_name') + "_svs.all.out")
            filteredResultFn = os.path.join(self.params.paths['output'], self.params.get_param('analysis_name') + "_svs.out")
            utils.log(self.loggingName, 'info', 'Writing %s aggregated results files: all result - %s and filtered results - %s' % (self.params.get_param('analysis_name'), allResultFn, filteredResultFn))
            allResultFile = open(allResultFn, 'w')
            filteredResultFile = open(filteredResultFn, 'w')
            for i, formattedResultStr in enumerate(aggregateResults['contigs']):
                headerStr, formattedResultValuesStr = formattedResultStr
                if not self.params.get_param('no_output_header') and i == 0:
                    allResultFile.write(headerStr + '\n')
                    filteredResultFile.write(headerStr + '\n')
                allResultFile.write(formattedResultValuesStr + '\n')
                resultValues = formattedResultValuesStr.split('\t')
                if resultValues[-3] != "True":
                    filteredResultFile.write(formattedResultValuesStr + '\n')
            allResultFile.close()
            filteredResultFile.close()

        # Write discordant read pair clusters.
        if len(aggregateResults['discreads']) > 0:
            resultFn = os.path.join(self.params.paths['output'], self.params.get_param('analysis_name') + "_discreads.out")
            utils.log(self.loggingName, 'info', 'Writing %s aggregated results file %s' % (self.params.get_param('analysis_name'), resultFn))
            resultFile = open(resultFn, 'w')
            for i, formattedResultStr in enumerate(aggregateResults['discreads']):
                headerStr, formattedResultValuesStr = formattedResultStr
                if not self.params.get_param('no_output_header') and i == 0:
                    resultFile.write(headerStr + '\n')
                resultFile.write(formattedResultValuesStr + '\n')
            resultFile.close()
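Both output files use the same header-once-then-rows pattern, which reduces to a few lines. A minimal sketch with fabricated rows; the column names are illustrative only:

    rows = [('chrom\tstart\tfiltered', 'chr1\t100\tFalse'),
            ('chrom\tstart\tfiltered', 'chr2\t500\tTrue')]
    with open('svs.all.out', 'w') as out:
        for i, (header, values) in enumerate(rows):
            if i == 0:  # write the header only once, ahead of the first row
                out.write(header + '\n')
            out.write(values + '\n')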
Example no. 44
    def call_svs(self):
        """ """

        if not self.realignment.has_results():
            utils.log(
                self.loggingName, 'info',
                'No blat results file exists, no calls for %s.' %
                self.contig.get_id())
        else:
            utils.log(
                self.loggingName, 'info',
                'Making variant calls from blat results %s' %
                self.realignment.get_result_fn())
            if self.check_indels():
                self.svEvent.format_indel_values()
            elif self.check_svs():
                self.svEvent.format_rearr_values()
        return self.svEvent
Example no. 45
    def missing_query_coverage(self, sv_result):

        '''Calculate the percentage of the contig query sequence that has no
        realignment coverage at its beginning and end.
        '''

        missing_cov = 0
        for i in sv_result.query_cov:
            if i == 0:
                missing_cov += 1
            else:
                break

        for i in reversed(sv_result.query_cov):
            if i == 0:
                missing_cov += 1
            else:
                break

        perc_missing = round((float(missing_cov)/float(len(sv_result.contig.seq.value)))*100, 4)
        utils.log(self.logging_name, 'debug', 'Calculated %f missing coverage of blat query sequence at beginning and end' % perc_missing)
        return perc_missing
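The two loops simply count leading and trailing zeros of the coverage vector. A worked sketch on a toy vector:

    query_cov = [0, 0, 3, 5, 5, 4, 0]   # per-base realignment coverage of the contig
    missing_cov = 0
    for c in query_cov:                  # leading zero-coverage bases
        if c == 0:
            missing_cov += 1
        else:
            break
    for c in reversed(query_cov):        # trailing zero-coverage bases
        if c == 0:
            missing_cov += 1
        else:
            break
    perc_missing = round((float(missing_cov) / float(len(query_cov))) * 100, 4)
    print(perc_missing)  # 42.8571 -> 3 of the 7 bases are uncovered at the edges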
Example no. 46
 def get_seq_complexity(self):
     """Get the 3-mer complexity of the shortest aligned blat sequence.
     """
     blatResult, nBasesAligned = self.blatResultsSorted[0]
     alignedSeq = self.contig.seq[blatResult.qstart():blatResult.qend()]
     merSize = 3
     utils.log(
         self.loggingName, 'debug',
         'Checking sequence complexity of blat result segment %s using %d-mers'
         % (alignedSeq, merSize))
     nmers = {}
     totalMersPossible = len(alignedSeq) - 2  # number of 3-mer windows: len(alignedSeq) - (merSize - 1)
     for i in range(len(alignedSeq) - (merSize - 1)):
         nmers[str(alignedSeq[i:i + merSize]).upper()] = True
     complexity = round(
         (float(len(nmers)) / float(totalMersPossible)) * 100, 4)
     utils.log(
         self.loggingName, 'debug',
         'Complexity measure %f, based on %d unique %d-mers observed out of a total of %d %d-mers possible'
         % (complexity, len(nmers), merSize, totalMersPossible, merSize))
     return complexity
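The complexity measure is the fraction of distinct 3-mers among all overlapping 3-mer windows in the sequence. A worked sketch on a deliberately low-complexity string:

    aligned_seq = 'AAAAAAAAAA'                 # 10 bases -> 8 overlapping 3-mer windows
    mer_size = 3
    total = len(aligned_seq) - (mer_size - 1)
    uniq = set(aligned_seq[i:i + mer_size].upper() for i in range(total))
    complexity = round((float(len(uniq)) / float(total)) * 100, 4)
    print(complexity)  # 12.5 -> only 'AAA' observed out of 8 possible windows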
Example no. 47
    def add_genes(self, gene_fn):

        '''Parse a gene annotation file (UCSC refGene-style columns) and store the
        widest chrom, start, end interval observed for each gene ID.
        '''

        utils.log(self.logging_name, 'info', 'Adding gene annotations from %s' % gene_fn)
        gene_f = open(gene_fn, 'r')
        gene_flines = gene_f.readlines()
        for line in gene_flines[1:]:
            line = line.strip()
            linesplit = line.split()
            chrom = linesplit[2]
            start = int(linesplit[4])
            end = int(linesplit[5])
            geneid = linesplit[12]
            if geneid in self.genes:
                # Keep only the widest interval observed for this gene ID.
                if start <= self.genes[geneid][1] and end >= self.genes[geneid][2]:
                    self.genes[geneid] = [chrom, start, end]
            else:
                self.genes[geneid] = [chrom, start, end]
        gene_f.close()
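The update rule keeps, for each gene ID, only an interval that contains the one already stored. A sketch with two fabricated records for the same gene:

    genes = {}
    records = [('chr17', 7571720, 7590868, 'TP53'),
               ('chr17', 7565097, 7590868, 'TP53')]  # the second record is wider
    for chrom, start, end, geneid in records:
        if geneid in genes:
            if start <= genes[geneid][1] and end >= genes[geneid][2]:
                genes[geneid] = [chrom, start, end]
        else:
            genes[geneid] = [chrom, start, end]
    print(genes)  # {'TP53': ['chr17', 7565097, 7590868]}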
Example no. 48
    def check_svs(self):

        '''Check the realigned contig segments for structural variants by iterating
        over the unaligned gaps of the query sequence.
        '''

        utils.log(self.logging_name, 'info', 'Checking for SVs')
        gaps = [(0, self.qsize)]

        # if len(self.clipped_qs) > 1:
        utils.log(self.logging_name, 'debug', 'Iterating through %d clipped blat results.' % len(self.realignments))
        merged_clip = [0, None]

        for i, realigned_segment in enumerate(self.realignments):
            # qs, qe, blat_result, idx = clipped_qs
            utils.log(self.logging_name, 'debug', 'Blat result with start %d, end %d, chrom %s' % (realigned_segment.qstart(), realigned_segment.qend(), realigned_segment.get_name('hit')))
            gaps = self.iter_gaps(gaps, realigned_segment, i)
            if self.sv_event.qlen > merged_clip[0]:
                merged_clip = [self.sv_event.qlen, self.sv_event]
        self.sv_event = merged_clip[1]
        # else:
        #   utils.log(self.logging_name, 'info', 'There are no more than 1 clipped blat results, not continuing with SVs calling.')

        # print 'Check valid', self.se_valid()
        valid_result = self.sv_event is not None and (len(self.sv_event.realignments) > 1) and self.sv_event.in_target
        if valid_result:
            self.sv_event.set_event_type('rearrangement')
        return valid_result
Example no. 49
    def filter_rearr(self, sv_result): # query_region, params, brkpts, brkpt_counts, brkpt_kmers, rearr_type, disc_read_count):

        '''Apply the rearrangement filters to a candidate call. Returns True if the
        result should be filtered out.
        '''

        in_ff, span_ff = utils.filter_by_feature(sv_result.breakpoint_values['ref_pos'], sv_result.query_region, self.params.get_param('keep_intron_vars'))
        match_sorted_realignments = sorted(sv_result.sv_event.realignments, key=lambda x: x.get_nmatch_total())
        top_realigned_segment = match_sorted_realignments[0]

        check1 = (min(sv_result.breakpoint_values['counts']['n']) < self.params.get_param('rearr_sr_thresh'))
        check2 = top_realigned_segment.get_nmatch_total() < self.params.get_param('rearr_minseg_len')
        check3 = (in_ff and span_ff)
        check4 = (sv_result.values['disc_read_count'] < 1)
        check5 = (sv_result.values['sv_subtype'] == 'NA')
        check6 = (min(sv_result.breakpoint_values['kmers']) == 0)
        filter_result = check1 or check2 or check3 or check4 or check5 or check6
        utils.log(self.logging_name, 'info', 'Check filter for rearrangement')
        utils.log(self.logging_name, 'info', 'Filter by feature for being in exon (%r) or spanning exon (%r)' % (in_ff, span_ff))
        utils.log(self.logging_name, 'info', 'Split read threshold %d, minimum breakpoint read count %d' % (self.params.get_param('rearr_sr_thresh'), min(sv_result.breakpoint_values['counts']['n'])))
        utils.log(self.logging_name, 'info', 'Minimum segment length observed (%d) meets threshold (%d)' % (top_realigned_segment.get_nmatch_total(), self.params.get_param('rearr_minseg_len')))
        utils.log(self.logging_name, 'info', 'Minimum discordant read pairs for rearrangement (%d)' % (sv_result.values['disc_read_count']))
        return filter_result
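The final decision is just an OR over six independent checks; tripping any single one marks the call as filtered. A sketch with fabricated values standing in for the real thresholds and counts:

    checks = {
        'low_split_reads': min([2, 5]) < 3,   # breakpoint read count below rearr_sr_thresh
        'short_segment': 18 < 20,             # top segment shorter than rearr_minseg_len
        'intronic_feature': False,            # in_ff and span_ff
        'no_disc_reads': 0 < 1,               # disc_read_count below 1
        'subtype_na': False,                  # sv_subtype == 'NA'
        'zero_brkpt_kmers': False,            # min breakpoint kmer count == 0
    }
    filter_result = any(checks.values())
    print(filter_result)  # True -> this call would be filtered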
Example no. 50
    def iter_gaps(self, gaps, realigned_segment, segment_idx):

        '''Iterate over the unaligned gaps of the query sequence and determine whether
        the realigned segment fills one of them; split any partially covered gap into
        the remaining uncovered pieces.
        '''

        new_gaps = []
        # qs, qe, br, idx = cq
        segment_start = realigned_segment.qstart()
        segment_end = realigned_segment.qend()
        hit = False
        for gap in gaps:
            gap_start, gap_end = gap
            utils.log(self.logging_name, 'debug', 'Gap coords %d, %d' % (gap_start, gap_end))

            # Options to consider when to add segment to reconstruction
            start_within_gap = (segment_start >= gap_start and segment_start <= gap_end)
            end_within_gap = (segment_end <= gap_end and segment_end >= gap_start)
            gap_edge_dist_start = (segment_start <= gap_start) and ((gap_start - segment_start) < 15)
            gap_edge_dist_end = (segment_end >= gap_end) and ((segment_end - gap_end) < 15)
            opt1 = (gap_edge_dist_start and (end_within_gap or gap_edge_dist_end))
            opt2 = (gap_edge_dist_end and (start_within_gap or gap_edge_dist_start))

            if start_within_gap or end_within_gap or opt1 or opt2:
                ngap = []
                if segment_start > gap_start:
                    if (segment_start - 1 - gap_start) > 10:
                        ngap.append((gap_start, segment_start - 1))
                if segment_end < gap_end:
                    if (gap_end - segment_end + 1) > 10:
                        ngap.append((segment_end + 1, gap_end))
                if segment_idx == 0:
                    utils.log(self.logging_name, 'debug', 'Creating SV event from blat result with start %d, end %d'%(segment_start, segment_end))
                    self.sv_event = results.SVEvent(realigned_segment) #, self.meta_dict['query_region'], self.meta_dict['contig_vals'], self.meta_dict['sbam'])
                    new_gaps.extend(ngap)
                    hit = True
                elif self.check_add_br(segment_start, segment_end, gap_start, gap_end, realigned_segment):
                    utils.log(self.logging_name, 'debug', 'Adding blat result to event')
                    new_gaps.extend(ngap)
                    self.sv_event.add(realigned_segment)
                    hit = True
                else:
                    new_gaps.append(gap)
            else:
                new_gaps.append(gap)
            utils.log(self.logging_name, 'debug', 'New gap coords %s'%(",".join([str(x) for x in new_gaps])))

        if not hit:
            self.sv_event.check_previous_add(realigned_segment)
        return new_gaps
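The ngap bookkeeping can be isolated: when a segment lands in a gap, the gap is split into the flanking pieces that remain uncovered, discarding slivers of 10 bases or fewer. A standalone sketch of that logic; `split_gap` is an illustrative name, not part of the original code:

    def split_gap(gap, seg_start, seg_end, min_len=10):
        # Return the uncovered sub-gaps left after a segment covers part of the gap.
        gap_start, gap_end = gap
        pieces = []
        if seg_start > gap_start and (seg_start - 1 - gap_start) > min_len:
            pieces.append((gap_start, seg_start - 1))
        if seg_end < gap_end and (gap_end - seg_end + 1) > min_len:
            pieces.append((seg_end + 1, gap_end))
        return pieces

    # A segment covering 40-60 of the gap (0, 100) leaves (0, 39) and (61, 100).
    print(split_gap((0, 100), 40, 60))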
Example no. 51
 def iter_gaps(self, gaps, clippedQuerySeqVals, iterIdx):
     """ """
     new_gaps = []
     qs, qe, blatResult, idx = clippedQuerySeqVals
     hit = False
     for gap in gaps:
         gs, ge = gap
         utils.log(self.loggingName, 'debug',
                   'Gap coords %d, %d' % (gs, ge))
         startWithinGap = (qs >= gs and qs <= ge)
         endWithinGap = (qe <= ge and qe >= gs)
         gapEdgeDistStart = (qs <= gs) and ((gs - qs) < 15)
         gapEdgeDistEnd = (qe >= ge) and ((qe - ge) < 15)
         if startWithinGap or endWithinGap or (
                 gapEdgeDistStart and (endWithinGap or gapEdgeDistEnd)) or (
                     gapEdgeDistEnd and
                     (startWithinGap or gapEdgeDistStart)):
             ngap = []
             if qs > gs:
                 if (qs - 1 - gs) > 10:
                     ngap.append((gs, qs - 1))
             if qe < ge:
                 if (ge - qe + 1) > 10:
                     ngap.append((qe + 1, ge))
             if iterIdx == 0:
                 utils.log(
                     self.loggingName, 'debug',
                     'Creating SV event from blat result with start %d, end %d'
                     % (qs, qe))
                 self.svEvent = SVEvent(blatResult, self.contig,
                                        'rearrangement')
                 new_gaps.extend(ngap)
                 hit = True
             elif self.check_add_br(qs, qe, gs, ge, blatResult):
                 utils.log(self.loggingName, 'debug',
                           'Adding blat result to event')
                 new_gaps.extend(ngap)
                 self.svEvent.add(blatResult)
                 hit = True
             else:
                 new_gaps.append(gap)
         else:
             new_gaps.append(gap)
         utils.log(
             self.loggingName, 'debug',
             'New gap coords %s' % (",".join([str(x) for x in new_gaps])))
     if not hit:
         self.svEvent.check_previous_add(blatResult)
     return new_gaps
Example no. 52
 def get_startend_missing_query_coverage(self):
     """Calculate the percentage of the contig sequence that is not realigned to the reference, only examining the
     beginning and end of the contig sequence.
     """
     missingCov = 0
     for i in self.queryCoverage:
         if i == 0:
             missingCov += 1
         else:
             break
     for i in reversed(self.queryCoverage):
         if i == 0:
             missingCov += 1
         else:
             break
     percentMissing = round(
         (float(missingCov) / float(len(self.contig.seq))) * 100, 4)
     utils.log(
         self.loggingName, 'debug',
         'Calculated %f missing coverage of blat query sequence at beginning and end'
         % percentMissing)
     return percentMissing
Example no. 53
    def extract_bam_reads(self, sampleType):
        """Wrapper for Variation extract_bam_reads function.

        Args:
            sampleType (str): Indicates a tumor ('sv') or normal ('norm') sample being processed.
        Returns:
            None
        """

        # Create the file paths for the files that will be created from the read extraction.
        self.variation.setup_read_extraction_files(sampleType,
                                                   self.paths['data'],
                                                   self.name)
        bamType = 'sample'
        if sampleType == 'norm':
            bamType = 'normal'
        bamFile = self.params.get_param('%s_bam_file' % bamType)
        utils.log(
            self.loggingName, 'info', 'Extracting bam reads from %s to %s' %
            (bamFile, self.variation.files['%s_fq' % sampleType]))
        self.variation.set_var_reads(sampleType, bamFile, self.chrom,
                                     self.start, self.end, self.regionBuffer)
Example no. 54
    def check_add_br(self, qs, qe, gs, ge, br):

        '''Determine whether a realigned segment should be added to the current SV
        event, based on how much of it overlaps the gap and how much it overlaps
        segments already added.
        '''

        utils.log(self.logging_name, 'info', 'Checking to add blat result with start %d, end %d' % (qs, qe))
        add = False
        over_perc = round((float(min(qe, ge) - max(qs, gs)) / float(qe - qs)) * 100)  # Calc % of segment overlaps with gap
        ov_right = 0 # Check overlap with other aligned segments
        if qe > ge:
            ov_right = abs(qe - ge)
        ov_left = 0
        if qs < gs:
            ov_left = abs(qs - gs)
        br.set_segment_overlap(ov_left, ov_right)
        max_seg_overlap = max(ov_right, ov_left)
        utils.log(self.logging_name, 'debug', 'Blat query segment overlaps gap by %f' % over_perc)
        utils.log(self.logging_name, 'debug', 'Max segment overlap %f' % max_seg_overlap)
        utils.log(self.logging_name, 'debug', 'Event in target %r and blat result in target %r' % (self.sv_event.in_target, br.in_target))
        if over_perc >= 50 and (max_seg_overlap < 15 or (br.in_target and self.sv_event.in_target)): # and (self.se.in_target or br.in_target):
            add = True
        utils.log(self.logging_name, 'debug', 'Add blat result to SV event %r' % add)
        return add
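The overlap arithmetic in check_add_br can be traced with concrete coordinates. A worked sketch; the numbers are fabricated:

    qs, qe = 10, 60    # realigned segment on the query
    gs, ge = 20, 100   # current unaligned gap
    over_perc = round((float(min(qe, ge) - max(qs, gs)) / float(qe - qs)) * 100)
    ov_left = abs(qs - gs) if qs < gs else 0    # bases hanging off the left gap edge
    ov_right = abs(qe - ge) if qe > ge else 0   # bases hanging off the right gap edge
    print(over_perc, ov_left, ov_right)  # 80 10 0 -> 80% of the segment lies inside the gap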
Example no. 55
    def get_brkpt_counts_filt(self, brkpts, sv_type):

        '''Compute the read and kmer counts around each breakpoint and flag
        breakpoints that fall in low-complexity or repeat regions.
        '''

        avg_comp, comp_vec = utils.calc_contig_complexity(self.contig.seq.value)
        brkpt_rep_filt = False
        brkpt_counts = {'n': [], 'd': [], 'b': []}
        brkpt_kmers = []
        for qb in brkpts['q'][1]:
            left_idx = qb[0] - min(qb[1], 5)
            right_idx = qb[0] + min(qb[2], 5)
            bc = self.contig.get_contig_counts().get_counts(left_idx, right_idx, sv_type)
            brkpt_counts['n'].append(min(bc))
            brkpt_counts['d'].append(min(self.contig.get_contig_counts().get_counts((qb[0]-1), (qb[0]+1), sv_type)))
            brkpt_counts['b'].append(self.contig.get_contig_counts().get_counts(qb[0], qb[0], sv_type))
            brkpt_kmers.append(self.contig.get_kmer_locs()[qb[0]])
            brkpt_rep_filt = brkpt_rep_filt or (comp_vec[qb[0]] < (avg_comp/2))
            utils.log(self.logging_name, 'debug', 'Read count around breakpoint %d: %s'%(qb[0],",".join([str(x) for x in bc])))
        utils.log(self.logging_name, 'debug', 'Kmer count around breakpoints %s'%(",".join([str(x) for x in brkpt_kmers])))
        brkpt_rep_filt = brkpt_rep_filt or any(brkpts['f'])  # any() replaces len(filter(...)), which breaks under Python 3
        return brkpt_counts, brkpt_kmers, brkpt_rep_filt
Example no. 56
    def has_indel(self):

        '''Check whether the top realignment result contains an indel: a gapped
        alignment that spans the query or is the single in-target result.
        '''

        has_indel = False
        for i, realigned_segment in enumerate(self.realignments):
            # br.mean_cov = self.get_mean_cov(br.qstart(), br.qend())
            #keep_clipped = (mean_cov<4 and ((br.get_nmatch_total()<30 and not br.in_repeat) or br.get_nmatch_total()>=30))
            #keep_clipped = keep_clipped or (br.in_target and mean_cov<10)
            #print nmatch, ngaps, br.mean_cov
            # if i == 0 and self.check_segment_indel(realigned_segment):
            if i == 0:
                if realigned_segment.has_gaps() and (realigned_segment.spans_query() or (len(self.realignments) == 1 and realigned_segment.in_target)):
                    has_indel = True
                    utils.log(self.logging_name, 'info', 'Contig has indel, returning %r' % has_indel)
                    self.sv_event = results.SVEvent(realigned_segment)
                    self.sv_event.set_event_type('indel')
                    return has_indel
            # else:
            #     utils.log(self.logging_name, 'debug', 'Storing clipped blat result start %d, end %d' % (realigned_segment.qstart(), realigned_segment.qend()))
                # self.clipped_qs.append((br.qstart(), br.qend(), br, i))
        utils.log(self.logging_name, 'info', 'Contig does not have indel, return %r' % has_indel)
        return has_indel
Example no. 57
    def get_param(self, key, required=False):
        """Get the parameter value in the self.opts dictionary.
        If the parameter is required but not available, log an error and exit
        the program.

        Args:
            key (str):       The key in the opts dictionary to access the parameter value.
            required (bool): Indicates whether the key must be present in the dictionary.
        Returns:
            value (int, str, boolean): The value of the parameter if it is found. If the parameter is
                                       required and not found the program will exit with error. If the parameter is
                                       not required and not found, it will return None.
        Raises:
            None
        """

        value = None
        if key in self.opts:
            value = self.opts[key]
        elif required:
            utils.log(self.logging_name, 'error', 'Missing required parameter %s, exiting.' % key)
            sys.exit(1)
        return value
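Usage of get_param, as a sketch against a hand-built opts dictionary. The standalone function mirrors the method but raises instead of calling sys.exit so the example stays self-contained; the parameter names are illustrative:

    def get_param(opts, key, required=False):
        # Return the value for key, None when absent, or raise when required.
        if key in opts:
            return opts[key]
        if required:
            raise KeyError('Missing required parameter %s' % key)
        return None

    opts = {'kmer_size': 15}
    print(get_param(opts, 'kmer_size'))   # 15
    print(get_param(opts, 'jellyfish'))   # None
    # get_param(opts, 'reference_fasta', required=True) would raise KeyError.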
Example no. 58
    def filter_indel(self, svEvent):
        """ """
        indelSizeThresh = int(self.params.get_param('indel_size'))
        utils.log(self.loggingName, 'info',
                  'Checking if blat result contains an indel variant')
        blatResult = svEvent.blatResults[0][1]
        keep_br = blatResult.valid and blatResult.alignFreq < 2 and blatResult.in_target and (
            blatResult.indel_maxevent_size[0] >= indelSizeThresh)
        utils.log(self.loggingName, 'debug', 'Keep blat result %r' % keep_br)

        # Determine the uniqueness of the realignment.
        svFilterValues = svEvent.resultValues.filterValues
        uniqRealignment = svFilterValues.realignFreq < 2
        indelSize = svFilterValues.maxEventSize >= indelSizeThresh
        brkptCoverages = svFilterValues.brkptCoverages[
            0] >= self.params.get_sr_thresh('indel')
        minFlankMatches = min(svFilterValues.flankMatchPercents) >= 10.0

        if uniqRealignment and indelSize and brkptCoverages and minFlankMatches:
            utils.log(self.loggingName, 'debug',
                      'Indel meets basic filtering requirements.')
        else:
            utils.log(
                self.loggingName, 'debug',
                'Indel filtered due to non-unique realignment (%r), less than input size threshold (%r), low coverage at breakpoints (%r), or contig edge realignment not long enough (%r), filter status set to True.'
                %
                (uniqRealignment, indelSize, brkptCoverages, minFlankMatches))
            filterReasons = []
            if not uniqRealignment:
                filterReasons.append('Non-unique realignment (%d) > 2' %
                                     svFilterValues.realignFreq)
            if not indelSize:
                filterReasons.append(
                    'Max indel size (%d) is less than %d' %
                    (svFilterValues.maxEventSize, indelSizeThresh))
            if not brkptCoverages:
                filterReasons.append(
                    'Minimum coverage at breakpoints (%d) less than input threshold %d'
                    % (svFilterValues.brkptCoverages[0],
                       self.params.get_sr_thresh('indel')))
            if not minFlankMatches:
                filterReasons.append(
                    'Minimum percentage of contig sequence that realigns to the reference to the left or right of the indel event less than 10.0 percent (%d)'
                    % min(svFilterValues.flankMatchPercents))
            svEvent.set_filtered(','.join(filterReasons))
Example no. 59
    def multiple_genes(self, chrs, brkpts, anno_genes):

        '''Determine whether the SV breakpoints fall in multiple distinct genes.
        '''

        mult_genes = True
        if len(set(anno_genes)) == 1:
            utils.log(self.logging_name, 'debug', 'One annotated gene among SV breakpoints: %s' % ",".join(anno_genes))
            mult_genes = False
        elif self.dup_gene_names(anno_genes) and len(set(chrs)) == 1 and ((max(brkpts) - min(brkpts)) < 10000):
            utils.log(self.logging_name, 'debug', 'Annotated genes are not identical, but similar, and breakpoints are less than 10Kb apart: %s' % ",".join(anno_genes))
            mult_genes = False
        utils.log(self.logging_name, 'debug', 'Test whether SV breakpoints are in multiple genes: %r' % mult_genes)
        return mult_genes
Example no. 60
    def set_ref_data(self):
        """Write the reference sequence to a fasta file for this specific target if it does not
        exist.

        Args:
            None
        Returns:
            None
        Raises:
            None
        """

        # Write reference fasta file if needed.
        for i in range(len(self.files['target_ref_fn'])):
            fn = self.files['target_ref_fn'][i]
            direction = "forward" if fn.find("forward") != -1 else "reverse"
            utils.log(self.loggingName, 'info',
                      'Extracting refseq sequence and writing %s' % fn)
            utils.extract_refseq_fa(self.values, self.paths['ref_data'],
                                    self.params.get_param('reference_fasta'),
                                    direction, fn)

        # If using blastn for target realignment, the blast db files must be available.
        blastn = self.params.get_param('blast')
        if blastn is not None:
            # Check if blast db files are available for each target.
            if not os.path.isfile(self.files['target_ref_fn'][0] + '.nin'):
                makedb = os.path.join(os.path.split(blastn)[0],
                                      'makeblastdb')  # Create blast db
                cmd = "%s -in %s -dbtype 'nucl' -out %s" % (
                    makedb, self.files['target_ref_fn'][0],
                    self.files['target_ref_fn'][0])
                utils.log(
                    self.loggingName, 'info',
                    'Creating blast db files for target %s with reference file %s'
                    % (self.name, self.files['target_ref_fn'][0]))
                p = subprocess.Popen(cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     shell=True)
                output, errors = p.communicate()
                if errors != '':
                    utils.log(
                        self.loggingName, 'debug',
                        'Failed to make blast db files using reference file %s'
                        % self.files['target_ref_fn'][0])