コード例 #1
0
    def products(self):
        """Return the PCR products for every forward/reverse primer pair.

        The list is built on first use and stored in ``self._products``;
        later calls hand back the stored list as long as it is non-empty.

        For each combination of a primer from ``self.fwd_primers`` and a
        primer from ``self.rev_primers`` the amplified region is sliced out
        of ``self.template``, the primer tails are attached on both sides and
        the result is wrapped in an ``Amplicon`` together with the primers.
        """
        cached = self._products
        if cached:
            return cached

        self._products = []

        for fwd in self.fwd_primers:
            for rev in self.rev_primers:

                template = self.template
                fwd_fp_len = len(fwd.footprint)
                rev_fp_len = len(rev.footprint)
                # slice bounds of the amplified region on a linear template
                start = fwd.position - fwd_fp_len
                stop = rev.position + rev_fp_len

                if not template.circular:
                    region = template[start:stop]
                elif fwd.position > rev.position:
                    # the product spans the origin: rotate the template so it
                    # starts at the forward footprint, double it and keep the
                    # leading part that covers both footprints
                    rotated = template.shifted(start)
                    doubled = rotated._multiply_circular(2)
                    span = (len(template)
                            - (fwd.position - rev.position)
                            + rev_fp_len
                            + fwd_fp_len)
                    region = doubled[:span]
                else:
                    # circular template, product does not span the origin:
                    # slice from the middle copy of a tripled template
                    offset = len(template)
                    tripled = template._multiply_circular(3)
                    region = tripled[start + offset:stop + offset]

                fwd_tail = Dseqrecord(fwd.tail)
                rev_tail = Dseqrecord(rev.tail).reverse_complement()
                product = fwd_tail + region + rev_tail

                # mark both primers as features on the product
                product.add_feature(0, len(fwd), label=fwd.id)
                size = len(product)
                product.add_feature(size - len(rev), size,
                                    label=rev.id, strand=-1)

                # the name is cut to 16 characters (the original comment
                # refers to the Genbank LOCUS limit)
                product.name = "{0}bp_PCR_prod".format(size)[:16]
                product.id = "{0}bp {1}".format(str(size)[:14],
                                                product.seguid())
                product.description = "Product_{0}_{1}".format(
                    fwd.description, rev.description)

                amplicon = Amplicon(product,
                                    template=region,
                                    forward_primer=fwd,
                                    reverse_primer=rev,
                                    saltc=50,
                                    forward_primer_concentration=1000,
                                    reverse_primer_concentration=1000)
                self._products.append(amplicon)

                # sanity check: neither strand may contain blanks
                assert " " not in str(product.seq.watson)
                assert " " not in str(product.seq.crick)

        return self._products
コード例 #2
0
ファイル: assembly.py プロジェクト: shunsunsun/pydna
    def _assemble(self):
        """Detect shared overlaps between the records and assemble them.

        The work is done in four steps:

        1. Overlap analysis. Every pair of records in ``self.dsrecs`` is
           compared (second record both as given and reverse complemented)
           with ``terminal_overlap`` or ``common_sub_strings``, selected by
           ``self.only_terminal_overlaps``; ``self.limit`` is passed as the
           third argument. Each hit is stored on both records as a feature
           of type "overlap" whose "chksum" qualifier is the ``seguid()`` of
           the shared sub-sequence.
        2. Graph construction. ``self.G`` is a ``networkx.MultiDiGraph`` with
           one node per overlap checksum plus the terminal nodes '5' and
           '3'. Each edge carries a ``Fragment`` spanning two overlaps of the
           same record.
        3. Linear assembly. Every path from '5' to '3' found by
           ``all_simple_paths_edges`` is joined into a ``Contig``.
        4. Circular assembly. Every path found by
           ``all_circular_paths_edges`` in the graph without the terminal
           nodes (``self.cG``) is joined into a circular ``Contig``.

        Attributes set: ``no_of_olaps``, ``analyzed_dsrecs``, ``G``, ``cG``,
        ``linear_products`` and ``circular_products`` (both sorted by
        decreasing length).

        Side effects: the records in ``self.dsrecs`` are modified in place
        (placeholder names replaced, old "overlap" features dropped, ``seq``
        rebuilt from ``seq.todata``, new "overlap" features added).
        """
        # give anonymous records a name derived from their length
        for dr in self.dsrecs:
            if dr.name in ("", ".", "<unknown name>", None):
                dr.name = "frag{}".format(len(dr))

        if self.only_terminal_overlaps:
            algorithm = terminal_overlap
        else:
            algorithm = common_sub_strings

        # ---- analyze overlaps -------------------------------------------
        cols = {}  # chksum -> (forward colour, reverse colour)
        for dsrec in self.dsrecs:
            dsrec.features = [f for f in dsrec.features
                              if f.type != "overlap"]
            dsrec.seq = Dseq(dsrec.seq.todata)
        rcs = {dsrec: dsrec.rc() for dsrec in self.dsrecs}
        matches = []
        dsset = OrderedSet()

        for a, b in itertools.combinations(self.dsrecs, 2):
            a_upper = str(a.seq).upper()

            match = algorithm(a_upper, str(b.seq).upper(), self.limit)
            if match:
                matches.append((a, b, match))
                dsset.add(a)
                dsset.add(b)

            match = algorithm(a_upper, str(rcs[b].seq).upper(), self.limit)
            if match:
                matches.append((a, rcs[b], match))
                dsset.add(a)
                dsset.add(rcs[b])
                # the same hits seen from the opposite strand: rc(a) against
                # b, with the start positions mirrored
                mirrored = [(len(a) - sa - le, len(b) - sb - le, le)
                            for sa, sb, le in match]
                matches.append((rcs[a], b, mirrored))
                dsset.add(b)
                dsset.add(rcs[a])

        self.no_of_olaps = 0

        for a, b, match in matches:
            for start_in_a, start_in_b, length in match:
                self.no_of_olaps += 1
                chksum = a[start_in_a:start_in_a + length].seguid()

                try:
                    # The original unpacked into an unused name ("revcol"),
                    # so on a cache hit a stale rcol from an earlier overlap
                    # ended up in the qualifiers.
                    fcol, rcol = cols[chksum]
                except KeyError:
                    fcol = '#%02X%02X%02X' % (random.randint(175, 255),
                                              random.randint(175, 255),
                                              random.randint(175, 255))
                    rcol = '#%02X%02X%02X' % (random.randint(175, 255),
                                              random.randint(175, 255),
                                              random.randint(175, 255))
                    cols[chksum] = fcol, rcol

                qual = {
                    "note": ["olp_{}".format(chksum)],
                    "chksum": [chksum],
                    "ApEinfo_fwdcolor": [fcol],
                    "ApEinfo_revcolor": [rcol],
                }

                # add the overlap to each record once per checksum
                for rec, start in ((a, start_in_a), (b, start_in_b)):
                    present = [f.qualifiers["chksum"][0]
                               for f in rec.features if f.type == "overlap"]
                    if chksum not in present:
                        rec.features.append(
                            SeqFeature(FeatureLocation(start, start + length),
                                       type="overlap",
                                       qualifiers=qual))

        for ds in dsset:
            ds.features = sorted(ds.features,
                                 key=operator.attrgetter("location.start"))

        self.analyzed_dsrecs = list(dsset)

        # ---- create graph -----------------------------------------------
        self.G = nx.MultiDiGraph(multiedges=True,
                                 name="original graph",
                                 selfloops=False)
        self.G.add_node('5')
        self.G.add_node('3')

        for i, dsrec in enumerate(self.analyzed_dsrecs):

            # one feature per checksum, ordered by start position
            unique = {f.qualifiers['chksum'][0]: f
                      for f in dsrec.features if f.type == 'overlap'}
            overlaps = sorted(unique.values(),
                              key=operator.attrgetter('location.start'))

            if not overlaps:
                continue

            # zero-length pseudo overlaps stand for the two record ends
            five_prime = SeqFeature(FeatureLocation(0, 0),
                                    type='overlap',
                                    qualifiers={'chksum': ['5']})
            three_prime = SeqFeature(FeatureLocation(len(dsrec), len(dsrec)),
                                     type='overlap',
                                     qualifiers={'chksum': ['3']})
            overlaps = [five_prime] + overlaps + [three_prime]

            for olp1, olp2 in itertools.combinations(overlaps, 2):

                n1 = olp1.qualifiers['chksum'][0]
                n2 = olp2.qualifiers['chksum'][0]

                # an edge from end to end would just be the record itself
                if n1 == '5' and n2 == '3':
                    continue

                s1 = olp1.location.start.position
                e1 = olp1.location.end.position
                s2 = olp2.location.start.position
                e2 = olp2.location.end.position

                source_fragment = Fragment(dsrec, s1, e1, s2, e2, i)

                self.G.add_edge(n1,
                                n2,
                                frag=source_fragment,
                                weight=s1 - e1,
                                i=i)

        # ---- linear assembly --------------------------------------------
        linear_products = defaultdict(list)

        for path in all_simple_paths_edges(self.G,
                                           '5',
                                           '3',
                                           data=True,
                                           cutoff=self.max_nodes):

            # list(...) so that pop() also works on Python 3, where
            # dict.values() returns a view without a pop() method
            pred_frag = copy(list(path[0][2].values()).pop()['frag'])
            source_fragments = [pred_frag]

            # The original chose between two slices depending on whether
            # start2 < end1, but start2 + (end1 - start2) == end1, so both
            # branches were this same slice.
            result = pred_frag[pred_frag.end1:pred_frag.end2]

            for _, _, edgedict in path[1:]:

                edgedict = list(edgedict.values()).pop()

                f = copy(edgedict['frag'])

                f.alignment = (pred_frag.alignment + pred_frag.start2
                               - f.start1)
                source_fragments.append(f)

                result += f[f.end1:f.end2]

                pred_frag = f

            # skip products equal (on either strand, ignoring case) to an
            # already collected product of the same length or to an input
            result_seq = str(result.seq).lower()
            candidates = itertools.chain(linear_products[len(result)],
                                         self.dsrecs)
            duplicate = any(
                result_seq == str(rec.seq).lower()
                or result_seq == str(rec.seq.reverse_complement()).lower()
                for rec in candidates)
            if not duplicate:
                linear_products[len(result)].append(
                    Contig(result, source_fragments))

        self.linear_products = list(
            itertools.chain.from_iterable(
                linear_products[size]
                for size in sorted(linear_products, reverse=True)))

        # ---- circular assembly ------------------------------------------
        self.cG = self.G.copy()
        self.cG.remove_nodes_from(('5', '3'))
        circular_products = {}  # cseguid -> Contig

        for pth in all_circular_paths_edges(self.cG):

            # rotate the cycle so it starts at the edge with the lowest
            # fragment index (presumably to give equivalent rotations a
            # common starting point -- TODO confirm)
            ns = min(enumerate(pth), key=lambda x: x[1][2]['i'])[0]
            path = pth[ns:] + pth[:ns]

            pred_frag = copy(path[0][2]['frag'])
            source_fragments = [pred_frag]

            # same simplification as in the linear case: both original
            # branches sliced [end1:end2]
            result = pred_frag[pred_frag.end1:pred_frag.end2]
            result.seq = Dseq(str(result.seq))

            for _, _, edgedict in path[1:]:

                f = copy(edgedict['frag'])

                f.alignment = (pred_frag.alignment + pred_frag.start2
                               - f.start1)
                source_fragments.append(f)

                nxt = f[f.end1:f.end2]
                nxt.seq = Dseq(str(nxt.seq))
                result += nxt

                pred_frag = f

            r = Dseqrecord(result, circular=True)
            # products sharing a cseguid replace each other; the last wins
            circular_products[r.cseguid()] = Contig(r, source_fragments)

        self.circular_products = sorted(circular_products.values(),
                                        key=len,
                                        reverse=True)
コード例 #3
0
ファイル: assembly.py プロジェクト: hgbrian/pydna
    def _assemble(self):
        """Detect shared overlaps between the records and assemble them.

        Steps performed:

        1. Overlap analysis: each pair of records in ``self.dsrecs`` is
           searched for shared sub-sequences, with the second record taken
           both as is and reverse complemented. The search function is
           ``terminal_overlap`` when ``self.only_terminal_overlaps`` is true,
           otherwise ``common_sub_strings``; ``self.limit`` is its third
           argument. Hits become features of type "overlap" on both records,
           identified by the ``seguid()`` of the shared sub-sequence.
        2. Graph construction: ``self.G`` (a ``networkx.MultiDiGraph``) gets
           a node per overlap checksum plus the terminal nodes '5' and '3';
           every edge holds the ``Fragment`` between two overlaps of one
           record.
        3. Linear assembly: each '5' -> '3' path returned by
           ``all_simple_paths_edges`` is concatenated into a ``Contig``.
        4. Circular assembly: each path returned by
           ``all_circular_paths_edges`` on ``self.cG`` (the graph without
           the terminal nodes) is concatenated into a circular ``Contig``.

        Sets ``no_of_olaps``, ``analyzed_dsrecs``, ``G``, ``cG``,
        ``linear_products`` and ``circular_products``; both product lists
        are ordered from longest to shortest.

        The records in ``self.dsrecs`` are changed in place: placeholder
        names are replaced, previous "overlap" features are removed, ``seq``
        is rebuilt from ``seq.todata`` and new "overlap" features are added.
        """
        # records without a usable name are named after their length
        for dr in self.dsrecs:
            if dr.name in ("", ".", "<unknown name>", None):
                dr.name = "frag{}".format(len(dr))

        if self.only_terminal_overlaps:
            algorithm = terminal_overlap
        else:
            algorithm = common_sub_strings

        # analyze_overlaps
        cols = {}  # chksum -> (forward colour, reverse colour)
        for dsrec in self.dsrecs:
            dsrec.features = [f for f in dsrec.features if f.type != "overlap"]
            dsrec.seq = Dseq(dsrec.seq.todata)
        rcs = {dsrec: dsrec.rc() for dsrec in self.dsrecs}
        matches = []
        dsset = OrderedSet()

        for a, b in itertools.combinations(self.dsrecs, 2):
            match = algorithm(str(a.seq).upper(),
                              str(b.seq).upper(),
                              self.limit)
            if match:
                matches.append((a, b, match))
                dsset.add(a)
                dsset.add(b)
            match = algorithm(str(a.seq).upper(),
                              str(rcs[b].seq).upper(),
                              self.limit)
            if match:
                matches.append((a, rcs[b], match))
                dsset.add(a)
                dsset.add(rcs[b])
                # identical hits expressed for rc(a) versus b: the start
                # coordinates are counted from the other end
                matches.append((rcs[a], b,
                                [(len(a) - sa - le, len(b) - sb - le, le)
                                 for sa, sb, le in match]))
                dsset.add(b)
                dsset.add(rcs[a])

        self.no_of_olaps = 0

        for a, b, match in matches:
            for start_in_a, start_in_b, length in match:
                self.no_of_olaps += 1
                chksum = a[start_in_a:start_in_a + length].seguid()

                try:
                    # Fixed: this used to unpack into "revcol", a name that
                    # was never read, so a cache hit silently reused the
                    # rcol left over from a previous overlap.
                    fcol, rcol = cols[chksum]
                except KeyError:
                    fcol = '#%02X%02X%02X' % (random.randint(175, 255),
                                              random.randint(175, 255),
                                              random.randint(175, 255))
                    rcol = '#%02X%02X%02X' % (random.randint(175, 255),
                                              random.randint(175, 255),
                                              random.randint(175, 255))
                    cols[chksum] = fcol, rcol

                qual = {"note":             ["olp_{}".format(chksum)],
                        "chksum":           [chksum],
                        "ApEinfo_fwdcolor": [fcol],
                        "ApEinfo_revcolor": [rcol]}

                # each record receives a given overlap only once
                if chksum not in [f.qualifiers["chksum"][0]
                                  for f in a.features if f.type == "overlap"]:
                    a.features.append(
                        SeqFeature(FeatureLocation(start_in_a,
                                                   start_in_a + length),
                                   type="overlap",
                                   qualifiers=qual))
                if chksum not in [f.qualifiers["chksum"][0]
                                  for f in b.features if f.type == "overlap"]:
                    b.features.append(
                        SeqFeature(FeatureLocation(start_in_b,
                                                   start_in_b + length),
                                   type="overlap",
                                   qualifiers=qual))

        for ds in dsset:
            ds.features = sorted(ds.features,
                                 key=operator.attrgetter("location.start"))

        self.analyzed_dsrecs = list(dsset)

        # Create graph

        self.G = nx.MultiDiGraph(multiedges=True,
                                 name="original graph",
                                 selfloops=False)
        self.G.add_node('5')
        self.G.add_node('3')

        for i, dsrec in enumerate(self.analyzed_dsrecs):

            # keep a single feature per checksum, ordered along the record
            overlaps = sorted({f.qualifiers['chksum'][0]: f
                               for f in dsrec.features
                               if f.type == 'overlap'}.values(),
                              key=operator.attrgetter('location.start'))

            if overlaps:
                # empty features at both ends represent the record termini
                overlaps = ([SeqFeature(FeatureLocation(0, 0),
                                        type='overlap',
                                        qualifiers={'chksum': ['5']})]
                            + overlaps
                            + [SeqFeature(FeatureLocation(len(dsrec),
                                                          len(dsrec)),
                                          type='overlap',
                                          qualifiers={'chksum': ['3']})])

                for olp1, olp2 in itertools.combinations(overlaps, 2):

                    n1 = olp1.qualifiers['chksum'][0]
                    n2 = olp2.qualifiers['chksum'][0]

                    # the whole record from end to end is not an edge
                    if n1 == '5' and n2 == '3':
                        continue

                    s1, e1, s2, e2 = (olp1.location.start.position,
                                      olp1.location.end.position,
                                      olp2.location.start.position,
                                      olp2.location.end.position)

                    source_fragment = Fragment(dsrec, s1, e1, s2, e2, i)

                    self.G.add_edge(n1, n2,
                                    frag=source_fragment,
                                    weight=s1 - e1,
                                    i=i)

        # linear assembly

        linear_products = defaultdict(list)

        for path in all_simple_paths_edges(self.G, '5', '3',
                                           data=True,
                                           cutoff=self.max_nodes):

            pred_frag = copy(list(path[0][2].values()).pop()['frag'])
            source_fragments = [pred_frag]

            # The original branched on start2 < end1, yet both branches
            # reduce to this slice: start2 + (end1 - start2) == end1.
            result = pred_frag[pred_frag.end1:pred_frag.end2]

            for _, _, edgedict in path[1:]:

                edgedict = list(edgedict.values()).pop()

                f = copy(edgedict['frag'])

                f.alignment = pred_frag.alignment + pred_frag.start2 - f.start1
                source_fragments.append(f)

                result += f[f.end1:f.end2]

                pred_frag = f

            # drop products that equal, on either strand and ignoring case,
            # a same-length product already collected or one of the inputs
            result_seq = str(result.seq).lower()
            add = True
            for rec in itertools.chain(linear_products[len(result)],
                                       self.dsrecs):
                if (result_seq == str(rec.seq).lower()
                        or result_seq
                        == str(rec.seq.reverse_complement()).lower()):
                    add = False
                    break
            if add:
                linear_products[len(result)].append(
                    Contig(result, source_fragments))

        self.linear_products = list(
            itertools.chain.from_iterable(
                linear_products[size]
                for size in sorted(linear_products, reverse=True)))

        # circular assembly

        self.cG = self.G.copy()
        self.cG.remove_nodes_from(('5', '3'))
        circular_products = {}  # cseguid -> Contig

        for pth in all_circular_paths_edges(self.cG):

            # start the cycle at the edge with the smallest fragment index
            # (presumably to normalise rotations of one cycle -- TODO confirm)
            ns = min(enumerate(pth), key=lambda x: x[1][2]['i'])[0]

            path = pth[ns:] + pth[:ns]

            pred_frag = copy(path[0][2]['frag'])

            source_fragments = [pred_frag]

            # as above, both original branches were the slice [end1:end2]
            result = pred_frag[pred_frag.end1:pred_frag.end2]

            result.seq = Dseq(str(result.seq))

            for _, _, edgedict in path[1:]:

                f = copy(edgedict['frag'])

                f.alignment = pred_frag.alignment + pred_frag.start2 - f.start1
                source_fragments.append(f)

                nxt = f[f.end1:f.end2]
                nxt.seq = Dseq(str(nxt.seq))
                result += nxt

                pred_frag = f

            r = Dseqrecord(result, circular=True)
            # keyed by cseguid: a later product with the same key replaces
            # the earlier one
            circular_products[r.cseguid()] = Contig(r, source_fragments)

        self.circular_products = sorted(circular_products.values(),
                                        key=len,
                                        reverse=True)