Example #1
def pcr_sequences_for_amplicon(amplicon, padding_pos5=0, padding_pos3=0, include_snps=False):
    """
    Returns the PCRSequence objects representing the specified amplicon.
    If include_snps=True, each object will have a 'snps' attribute containing a
    list of SNP dictionaries (like the SNPDBCache object, but SQLAlchemy-agnostic
    and with 'class' in place of 'class_').

    :param padding_pos5: The amount of padding to prefix the amplicon.  Max MAX_CACHE_PADDING.
    :param padding_pos3: The amount of padding to suffix the amplicon.  Max MAX_CACHE_PADDING.
    :param include_snps: Whether to include the snps attribute on each returned sequence.
    """
    if padding_pos5 < 0 or padding_pos5 > MAX_CACHE_PADDING:
        raise ValueError, "Illegal padding value: %s" % padding_pos5
    
    if padding_pos3 < 0 or padding_pos3 > MAX_CACHE_PADDING:
        raise ValueError("Illegal padding value: %s" % padding_pos3)
    
    pseqs = []
    for seq in amplicon.cached_sequences:
        main     = SimpleGenomeSequence(seq.chromosome, seq.start_pos, seq.end_pos, '+', seq.positive_amplicon)

        # the padding returned below may be a different length than requested
        padding_pos5_seq = seq.padding_pos5(padding_pos5, '+')
        padding_pos3_seq = seq.padding_pos3(padding_pos3, '+')

        prefix   = SimpleGenomeSequence(seq.chromosome,
                                        seq.start_pos-len(padding_pos5_seq),
                                        seq.start_pos-1,
                                        '+', padding_pos5_seq)
        suffix   = SimpleGenomeSequence(seq.chromosome,
                                        seq.end_pos+1,
                                        seq.end_pos+len(padding_pos3_seq),
                                        '+', padding_pos3_seq)
        pseq     = PCRSequence(main, prefix, suffix)

        if include_snps:
            snps = seq.snps_in_range(padding_pos5=len(padding_pos5_seq), padding_pos3=len(padding_pos3_seq))
            pseq.snps = [dict([(k, v) for k, v in snp.__dict__.items() if not k.startswith('_')]) for snp in snps]
            for snp in pseq.snps:
                snp['class'] = snp['class_']
        
        pseqs.append(pseq)
    return pseqs
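
A minimal usage sketch (not from the source): the amplicon object and padding values below are placeholders for a cached amplicon record whose cached_sequences are already populated.

# Hypothetical usage -- `amplicon` stands in for a cached amplicon ORM object.
padded = pcr_sequences_for_amplicon(amplicon, padding_pos5=50, padding_pos3=50, include_snps=True)
for pseq in padded:
    # merged_positive_sequence stitches prefix + amplicon + suffix together
    print(pseq.merged_positive_sequence.sequence)
    for snp in pseq.snps:
        print("%s (%s)" % (snp['name'], snp['class']))
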
Example #2
def sequences_snps_for_assay(config, assay, seq_source, snp_source, left_padding=0, right_padding=0, cache=True):
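    """
    Return padded PCRSequence objects (with attached SNP dictionaries) for the
    given assay.  Uses the assay's cached sequences when they already cover the
    requested padding; otherwise the sequences are fetched from seq_source,
    SNPs are looked up via snp_source, and (if cache=True) the results are
    written back to the assay's cached_sequences.
    """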
    sequences = []
    if not assay:
        return sequences

    if assay.cached_sequences:
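        # use the cache only if every cached sequence already covers the requested padding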
        all_cached = True
        for seq in assay.cached_sequences:
            if not seq.cached(left_padding, right_padding):
                all_cached = False
                break
        if all_cached:
            for seq in assay.cached_sequences:
                amplicon = SimpleGenomeSequence(seq.chromosome, seq.start_pos, seq.end_pos, "+", seq.positive_amplicon)
                prefix = SimpleGenomeSequence(
                    seq.chromosome,
                    seq.start_pos - left_padding,
                    seq.start_pos - 1,
                    "+",
                    seq.padding_pos5(left_padding, "+"),
                )
                suffix = SimpleGenomeSequence(
                    seq.chromosome,
                    seq.end_pos + 1,
                    seq.end_pos + right_padding,
                    "+",
                    seq.padding_pos3(right_padding, "+"),
                )
                pseq = PCRSequence(amplicon, prefix, suffix)
                # TODO unify SNP object

                pseq.snps = [
                    dict([(k, v) for k, v in snp.__dict__.items() if not k.startswith("_")]) for snp in seq.snps
                ]
                for snp in pseq.snps:
                    snp["class"] = snp["class_"]

                sequences.append(pseq)

            return sequences

    if assay.assay_type == Assay.TYPE_PRIMER:
        sequences = seq_source.sequences_for_primers(assay.primer_fwd, assay.primer_rev, left_padding, right_padding)
    elif assay.assay_type == Assay.TYPE_LOCATION:
        sequence = seq_source.sequence_around_loc(
            assay.chromosome, assay.probe_pos, assay.amplicon_width, left_padding, right_padding
        )
        sequences.append(sequence)
    elif assay.assay_type == Assay.TYPE_SNP:
        snps = snp_source.snps_by_rsid(assay.snp_rsid)
        # TODO: make SNP object so that access style is same as assay
        for snp in snps:
            if snp["refUCSC"] == "-":  # deletion:
                sequences.append(
                    seq_source.sequence_around_region(
                        snp["chrom"][3:],
                        snp["chromEnd"],
                        snp["chromEnd"],
                        assay.amplicon_width,
                        left_padding,
                        right_padding,
                    )
                )
            else:
                sequences.append(
                    seq_source.sequence_around_region(
                        snp["chrom"][3:],
                        snp["chromStart"] + 1,
                        snp["chromEnd"],
                        assay.amplicon_width,
                        left_padding,
                        right_padding,
                    )
                )

    for seq in sequences:
        seq.snps = snp_source.snps_in_range(seq.chromosome, seq.start, seq.end)

    if cache:
        # TODO: library method? -- given PCR object, SNP dict?  or unify display objects
        # to and from their DB representation?
        assay.cached_sequences = []
        for seq in sequences:
            cached_seq = HG19AssayCache(
                chromosome=seq.chromosome,
                start_pos=seq.amplicon.start,
                end_pos=seq.amplicon.end,
                seq_padding_pos5=left_padding,
                seq_padding_pos3=right_padding,
                positive_sequence=seq.merged_positive_sequence.sequence,
            )

            cached_seq.amplicon_dg = dg_seq(config, cached_seq.positive_amplicon)
            cached_seq.amplicon_tm = tm_probe(config, cached_seq.positive_amplicon)
            for snp in seq.snps:
                cached_seq.snps.append(
                    SNP131AssayCache(
                        bin=snp["bin"],
                        chrom=snp["chrom"],
                        chromStart=snp["chromStart"],
                        chromEnd=snp["chromEnd"],
                        name=snp["name"],
                        score=snp["score"],
                        strand=snp["strand"],
                        refNCBI=snp["refNCBI"],
                        refUCSC=snp["refUCSC"],
                        observed=snp["observed"],
                        molType=snp["molType"],
                        class_=snp["class"],
                        valid=snp["valid"],
                        avHet=snp["avHet"],
                        avHetSE=snp["avHetSE"],
                        func=snp["func"],
                        locType=snp["locType"],
                        weight=snp["weight"],
                    )
                )
            assay.cached_sequences.append(cached_seq)

        Session.commit()

    return sequences
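
A hedged calling sketch: config, assay, seq_source, and snp_source are assumed to be supplied by the surrounding application, and the padding values are placeholders.

# Hypothetical call -- assumes seq_source/snp_source expose the methods used above
# (sequences_for_primers, sequence_around_loc, snps_by_rsid, snps_in_range).
sequences = sequences_snps_for_assay(config, assay, seq_source, snp_source,
                                     left_padding=50, right_padding=50, cache=False)
for pseq in sequences:
    print("%d SNPs attached" % len(pseq.snps))
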
Example #3
    def cut(self):
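        """
        Locate the selected enzyme's restriction sites on the sequence(s) for the
        submitted assay (or on a manually supplied positive sequence), adjust for
        SNPs that add or remove cut sites, and return a dict of cut positions and
        fragment statistics for display.
        """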
        left_padding = self.form_result['left_padding']
        right_padding = self.form_result['right_padding']
        enzyme = Session.query(Enzyme).get(self.form_result['enzyme'])
        cutseq = enzyme.cutseq
        
        # TODO change to single amplicon?
        if self.form_result['assay_id']:
            assay = Session.query(SequenceGroup).get(self.form_result['assay_id'])
            amplicon_tuples = pcr_sequences_snps_for_group(assay, padding_pos5=left_padding, padding_pos3=right_padding)
            sequences = []
            for amp, pseqs in amplicon_tuples:
                sequences.extend(pseqs)
        else:
            manual_seq = PCRSequence(SimpleGenomeSequence(0, 0, len(self.form_result['positive_sequence'])-1, '+',
                                                          full_sequence=self.form_result['positive_sequence']))
            manual_seq.snps = []
            sequences = [manual_seq]
        
        # TODO: this is arbitrary
        location_cut_data = self.__enzyme_cut_locations(sequences[0], [enzyme])
        
        # TODO support multiple sequences, somehow.
        
        pos_seq = sequences[0].merged_positive_sequence
        total_width = len(pos_seq)
        re_width_pct = 100*float(len(cutseq))/total_width
        
        enzyme_cut_data = location_cut_data[self.form_result['enzyme']]
        
        positive_matches = []
        negative_matches = []
        return_dict = {}
        snp_dict = dict([(s['name'], s) for s in sequences[0].snps])
        # keys here are going to be amplicon_cuts, left_cuts and right_cuts
        for k, v in sorted(enzyme_cut_data.items()):
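            # v[0] is the cut list for the reference sequence (no SNP applied); each
            # later entry is the cut list recomputed with one SNP's variant substituted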
            blank, original_positives, original_negatives = v[0]
            snp_positives = []
            snp_negatives = []
            cancel_positives = []
            cancel_negatives = []
            for cuts in v[1:]:
                snp_name, shifted_positives, shifted_negatives = cuts
                snp = snp_dict[snp_name]
                # TODO: this is an oversimplification but will probably only result
                # in a shift in a particular restriction site
                #
                # TODO: I think this is sketchy right at the edges, needs to be tested. (> vs >=, etc)
                # TODO: the code could stand to be more compact as well.
                if len(shifted_positives) > len(original_positives):
                    found = False
                    for start, end, strand in shifted_positives:
                        if (snp['chromEnd'] >= pos_seq.start+start-1 and snp['chromEnd'] <= pos_seq.start+end) or \
                           (snp['chromStart'] >= pos_seq.start+start-1 and snp['chromStart'] <= pos_seq.start+end):
                            snp_positives.append((start, end, strand))
                            found = True
                    if not found:
                        pass
                        #raise Exception, "ERROR: additional positive strand restriction site not found by analyzing SNPs"
                
                elif len(shifted_positives) < len(original_positives):
                    found = False
                    for start, end, strand in original_positives:
                        if (snp['chromEnd'] >= pos_seq.start+start-1 and snp['chromEnd'] <= pos_seq.start+end) or \
                           (snp['chromStart'] >= pos_seq.start+start-1 and snp['chromStart'] <= pos_seq.start+end) or \
                           (pos_seq.start+start-1 >= snp['chromStart'] and pos_seq.start+end <= snp['chromEnd']):
                            if (start, end, strand) not in cancel_positives:
                                cancel_positives.append((start, end, strand))
                            found = True
                    if not found:
                        pass
                        #raise Exception, "ERROR: cancelled positive strand restriction site not found by analyzing SNPs"
                           
                if len(shifted_negatives) > len(original_negatives):
                    # find where the new snp is
                    found = False
                    for start, end, strand in shifted_negatives:
                        if (snp['chromEnd'] >= pos_seq.end-(end+1) and snp['chromEnd'] <= pos_seq.end-start) or \
                           (snp['chromStart'] >= pos_seq.end-(end+1) and snp['chromStart'] <= pos_seq.end-start):
                            snp_negatives.append((start, end, strand))
                            found = True
                    if not found:
                        pass
                        # insertion screws you here.
                        #raise Exception, (snp['chromEnd'], snp['chromStart'], pos_seq.end, pos_seq.end-(shifted_negatives[0][1]+1), pos_seq.end-(shifted_negatives[0][0]))
                        #raise Exception, "ERROR: additional negative strand restriction site not found by analyzing SNPs"
                    
                elif len(shifted_negatives) < len(original_negatives):
                    found = False
                    for start, end, strand in original_negatives:
                        if (snp['chromEnd'] >= pos_seq.end-(end+1) and snp['chromEnd'] <= pos_seq.end-start) or \
                           (snp['chromStart'] >= pos_seq.end-(end+1) and snp['chromStart'] <= pos_seq.end-start) or \
                           (pos_seq.end-(end+1) >= snp['chromStart'] and pos_seq.end-start <= snp['chromEnd']):
                            if (start, end, strand) not in cancel_negatives:
                                cancel_negatives.append((start, end, strand))
                            found = True
                    
                    if not found:
                        pass
                        #raise Exception, "ERROR: cancelling negative strand restriction site not found by analyzing SNPs"
            
            for tup in cancel_positives:
                original_positives.remove(tup)
            for tup in cancel_negatives:
                original_negatives.remove(tup)
            
            return_dict[k] = len(original_positives) + len(cancel_positives) \
                             + len(original_negatives) + len(cancel_negatives) \
                             + len(snp_positives) + len(snp_negatives)
            
            for start, end, strand in original_positives:
                positive_matches.append({'offset': start, 'pos': '%s%%' % (start*100.0/total_width), 'class': 'stable_re_site'})
            
            for start, end, strand in original_negatives:
                negative_matches.append({'offset': start, 'pos': '%s%%' % (100-(start*100.0/total_width)-re_width_pct), 'class': 'stable_re_site'})
            
            for start, end, strand in snp_positives:
                positive_matches.append({'offset': start, 'pos': '%s%%' % (start*100.0/total_width), 'class': 'snp_re_site'})
            
            for start, end, strand in snp_negatives:
                negative_matches.append({'offset': start, 'pos': '%s%%' % (100-(start*100.0/total_width)-re_width_pct), 'class': 'snp_re_site'})
            
            for start, end, strand in cancel_positives:
                positive_matches.append({'offset': start, 'pos': '%s%%' % (start*100.0/total_width), 'class': 'snp_cancel_re_site'})
            
            for start, end, strand in cancel_negatives:
                negative_matches.append({'offset': start, 'pos': '%s%%' % (100-(start*100.0/total_width)-re_width_pct), 'class': 'snp_cancel_re_site'})
                
        return_dict['positive_cuts'] = positive_matches
        return_dict['negative_cuts'] = negative_matches
        return_dict['re_width_pct'] = "%s%%" % re_width_pct

        # figure out amplicon position
        amplicon_start = left_padding
        amplicon_end = len(pos_seq) - (right_padding+1)
        
        left_offsets = [match['offset'] for match in positive_matches if match['offset'] < (amplicon_start - len(cutseq))]
        if left_offsets:
            rightmost_left = max(left_offsets)+len(cutseq)
        else:
            rightmost_left = None
        
        right_offsets = [match['offset'] for match in positive_matches if match['offset'] > amplicon_end]
        if right_offsets:
            leftmost_right = min(right_offsets)
        else:
            leftmost_right = None
        
        amplicon_cuts = [match for match in positive_matches if match['offset'] >= amplicon_start and match['offset'] <= amplicon_end]
        
        # todo: bug if the cutters are asymmetric and the negative cutsite is shorter (redmine 669)
        if rightmost_left is not None and leftmost_right is not None and len(amplicon_cuts) == 0:
            inner_len = leftmost_right - rightmost_left
            inner_seq = pos_seq.sequence[rightmost_left:leftmost_right]
            inner_gc = gc_content(inner_seq)
            left_offset = amplicon_start - rightmost_left
            right_offset = leftmost_right - amplicon_end

            return_dict['fragment'] = {'len': inner_len,
                                       'loff': left_offset,
                                       'roff': right_offset,
                                       'gc': "%.2f%%" % (inner_gc*100)}
            
        return return_dict
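
For reference, a hedged sketch of the shape of the dict this method returns; the keys are taken from the code above, while the concrete numbers are illustrative only.

# Illustrative only -- the per-region keys (e.g. 'amplicon_cuts') come from
# __enzyme_cut_locations, and the counts/offsets depend on the actual sequence.
example_return = {
    'amplicon_cuts': 0,   # cut count for each region key returned by __enzyme_cut_locations
    'positive_cuts': [{'offset': 12, 'pos': '2.9%', 'class': 'stable_re_site'}],
    'negative_cuts': [{'offset': 287, 'pos': '28.6%', 'class': 'snp_re_site'}],
    're_width_pct': '1.4%',
    # 'fragment' is present only when cuts flank, but do not fall inside, the amplicon
    'fragment': {'len': 410, 'loff': 55, 'roff': 38, 'gc': '42.10%'},
}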