예제 #1
0
# Exercise 5: load a FASTA record, relabel it with its accession number,
# attach a "gene" feature and save the result as a GenBank file.
if 5 in _RunExercise:
    print('\n---Exercise 5---')
    # NOTE: ``id`` and ``format`` shadow builtins; the names are kept because
    # they are module-level variables that code outside this block may read.
    id = "PAX-6.5"
    format = "fasta"
    # Context manager so the input handle is closed right after parsing.
    with open(id + "." + format) as in_handle:
        record = SeqIO.read(in_handle, format)
    print("Record:\n", record)
    record.seq.alphabet = IUPAC.unambiguous_dna
    print("\nAlphabet altered to IUPAC.unambiguous_dna !!!")
    # The accession number is taken from the 4th "|"-separated field of the id.
    accNb = record.id.split("|")[3]
    print("Access number: ", accNb)
    record.name = accNb
    record.id = accNb
    print("record.name and record.id have been altered !!!")

    # Annotate positions 18..200 as a gene with strand set to -1.
    feature = SeqFeature()
    feature.type = "gene"
    feature.location = FeatureLocation(18, 200)
    feature.strand = -1
    record.features.append(feature)
    print("record.features: ", record.features)
    print("\nRecord:\n", record)

    # Closing the handle explicitly guarantees the GenBank file is flushed to
    # disk before the record count is reported.
    with open(id + ".gb", "w") as out_handle:
        count = SeqIO.write(record, out_handle, "genbank")
    print("Converted %i records" % count)

if 6 in _RunExercise:
    print('\n---ORF---')

    mail = ''
    id = "NC_009926"
예제 #2
0
def _redigest_choice(value, choices, what):
    """Translate a GUI selection into its internal code.

    Raises ValueError for a selection that is not in ``choices`` instead of
    letting an unbound local variable surface later as a NameError.
    """
    try:
        return choices[value]
    except KeyError:
        raise ValueError("Unsupported %s: %r" % (what, value))


def _redigest_description(record, informat):
    """Return the comma-joined taxonomy of a GenBank record ('' for FASTA input)."""
    if informat == 'genbank':
        return ', '.join(list(record.annotations["taxonomy"]))
    return ''


def _redigest_gene_mode(records, informat, out_file, outformat, report,
                        forward, reverse, orientation, enzyme_batch, verbose):
    """Write the terminal restriction fragment of every gene record.

    ``forward`` is prepended and ``reverse`` appended to each sequence as
    given; the caller is responsible for reverse-complementing ``reverse``.
    One line per record (header + restriction map) is written to ``report``.
    """
    # Sequences written as FASTA use the "fasta-2line" SeqIO format.
    write_format = 'genbank' if outformat == 'genbank' else 'fasta-2line'
    for count, record in enumerate(records, start=1):
        header = record.id
        name = 'RED' + TIME + str(count)
        desc = _redigest_description(record, informat)

        # forward primer + gene sequence + reverse primer
        amplicon = (forward + str(record.seq)).strip('\n') + reverse
        if orientation == 'R':
            amplicon = str(Seq(amplicon).reverse_complement())
            sub_feat = "TRF_RevComp"
        else:
            sub_feat = "TRF"

        sites = enzyme_batch.search(Seq(amplicon))
        cuts = list(sites.values())[0]
        # A sequence without a restriction site is reported uncut, at full length.
        # NOTE(review): Bio.Restriction documents search() positions as the
        # 1-based index of the first base after the cut, so cuts[0] may be one
        # larger than the true fragment length -- confirm before changing.
        fragment = cuts[0] if cuts else len(amplicon)

        fasta_header = name + "|" + str(fragment) + "_bp" + "|" + header
        feat = SeqFeature(FeatureLocation(start=0, end=fragment),
                          type="REDigest", ref=sub_feat)
        if verbose:
            print(" ", fasta_header, '\t', sites)

        fasta_record = SeqRecord(Seq(amplicon[:fragment], IUPAC.IUPACAmbiguousDNA()),
                                 fasta_header, description=desc, name=name)
        fasta_record.features.append(feat)
        SeqIO.write(fasta_record, out_file, write_format)
        # writing progress to the report file too
        print(fasta_header, '\t', sites, file=report)


def _redigest_genome_mode(records, informat, out_file, outformat, report,
                          trf_report, trf_report_name, enzyme_batch, verbose):
    """Write every restriction fragment of each genome record.

    Fragments go to ``out_file`` and their headers to ``report``; the headers
    of the terminal fragments (nucleotide 1 to each cut) go to ``trf_report``.
    """
    for record in records:
        gen_header = record.id
        gen_array = str(record.seq)
        desc = _redigest_description(record, informat)
        cuts = list(enzyme_batch.search(Seq(gen_array)).values())[0]

        if cuts:
            # first nt -> first cut, cut -> next cut, last cut -> last nt
            bounds = [(0, min(cuts))]
            bounds.extend(zip(cuts, cuts[1:]))
            bounds.append((max(cuts), len(gen_array)))
        else:
            # The enzyme does not cut: the whole sequence is the only fragment.
            bounds = [(0, len(gen_array))]

        for start, end in bounds:
            fragment = gen_array[start:end]
            fragment_header = gen_header + "|" + str(len(fragment)) + "_bp|" + gen_header
            if verbose:
                print(" ", fragment_header)
            fragment_record = SeqRecord(Seq(fragment, IUPAC.IUPACAmbiguousDNA()),
                                        fragment_header, description=desc)
            SeqIO.write(fragment_record, out_file, outformat)
            print(fragment_header, file=report)

        # Terminal fragments: only the headers are reported, no sequences.
        print("[WRITING:] Terminal restriction fragments, from nucleotide 1 to respective cuts to file:",
              trf_report_name)
        for cut in cuts:
            trf_header = gen_header + "|" + str(cut) + "_bp|" + gen_header
            if verbose:
                print(" ", trf_header)
            print(trf_header, file=trf_report)


def redigest_code():
    """Run the restriction digest configured in the GUI entry widgets.

    Widgets read: entry1 (input file), entry2 (output file, default
    ``redigest.<TIME>.out``), entry3 (enzyme name), entry4 (orientation),
    entry5/entry6 (forward/reverse primer), entry7 (gene vs. genome mode),
    entry8/entry9 (input/output format).

    Gene mode writes the terminal fragment of every record plus
    ``<outfile>.csv``. Genome mode writes all fragments plus
    ``<outfile>_RF.csv`` and ``<outfile>_TRF.csv``.

    Changes compared with the previous implementation:
      * the reverse primer is appended reverse-complemented (the computed
        value used to be overwritten by the raw primer inside the loop);
      * an enzyme that does not cut a genome no longer raises ValueError;
      * the TRF report is opened once, so it covers all records rather than
        only the last one;
      * every file is closed, also when an error occurs.

    Raises:
        ValueError: if a mode, format or orientation selection is unknown.
    """
    argscheck()
    formats = {"Fasta": "fasta", "Genbank": "genbank"}
    verbose = True  # progress is always echoed to the terminal

    # output file
    outfile = entry2_input.get()
    if outfile == "":
        outfile = 'redigest.' + TIME + '.out'

    genome_mode = _redigest_choice(
        entry7_input.get(),
        {"Multifasta gene file": False, "Single genome sequence": True},
        "input type")
    informat = _redigest_choice(entry8_input.get(), formats, "input format")
    outformat = _redigest_choice(entry9_input.get(), formats, "output format")
    input_file = entry1_input.get()
    # The batch is identical for every record, so it is built once.
    enzyme_batch = RestrictionBatch([entry3_input.get()])

    if not genome_mode:
        orientation = _redigest_choice(
            entry4_input.get(), {"Forward": "F", "Reverse": "R"}, "orientation")
        forward = entry5_input.get()
        # reverse complement the reverse primer
        reverse = str(Seq(entry6_input.get()).reverse_complement())
        with open(outfile, 'wt+') as out_file, \
                open(outfile + '.csv', 'wt+') as RF, \
                open(input_file, 'r') as infile:
            _redigest_gene_mode(SeqIO.parse(infile, informat), informat,
                                out_file, outformat, RF,
                                forward, reverse, orientation,
                                enzyme_batch, verbose)
        return

    report_file = outfile + '_RF.csv'
    report_file2 = outfile + '_TRF.csv'
    with open(outfile, 'wt+') as out_file, \
            open(report_file, 'wt+') as RF, \
            open(report_file2, 'wt+') as TRF2, \
            open(input_file, 'r') as infile:
        print("Individual restriction fragments", file=RF)
        print("[WRITING:] Individual restriction fragments to file:", report_file)
        print("Terminal restriction fragments: from nucleotide 1 to respective cuts", file=TRF2)
        _redigest_genome_mode(SeqIO.parse(infile, informat), informat,
                              out_file, outformat, RF, TRF2, report_file2,
                              enzyme_batch, verbose)
예제 #3
0
    def run(self, record):
        """Detect BGCs in ``record`` and add them as 'cluster' features.

        The record is modified in place:
          * cluster features previously added under the same detector label
            are removed;
          * every Pfam feature and every protein that has Pfam domains gets a
            ``self.score_column`` qualifier (5-decimal string);
          * one 'cluster' feature is appended per detected cluster;
          * detector metadata is stored in
            ``record.annotations['structured_comment']``.

        ``self.num_detected`` is incremented for every cluster added.
        Returns None; returns early (leaving the record untouched) when the
        record has no Pfam features.
        """
        logging.info('Detecting BGCs using %s model in %s', self.detector_label, record.id)

        protein_features = util.get_protein_features(record)
        proteins_by_id = util.get_proteins_by_id(protein_features)
        pfam_features = util.get_pfam_features(record)

        if not len(pfam_features):
            logging.warning('Warning: No Pfam domains in record %s, skipping BGC detection', record.id)
            return

        # Filter out previous clusters detected with the same detector label
        num_prev_features = len(record.features)
        record.features = [f for f in record.features if
                           not(f.type == 'cluster' and f.qualifiers.get('detector_label') == [self.detector_label])]
        num_removed_features = num_prev_features - len(record.features)
        if num_removed_features:
            logging.warning('Warning: Removed %s previously detected clusters with same label "%s". '
                  'Use --label DeepBGCMyLabel to preserve original clusters and add second set of clusters detected '
                  'with same model but different parameters.', num_removed_features, self.detector_label)

        # Create DataFrame with Pfam sequence
        pfam_sequence = util.create_pfam_dataframe_from_features(pfam_features, proteins_by_id)

        # Predict BGC score of each Pfam
        pfam_sequence[self.score_column] = self.model.predict(pfam_sequence)

        # Get average BGC score for each protein
        protein_scores = pfam_sequence.groupby('protein_id', sort=False)[self.score_column].mean()

        # Add score to all Pfam features
        # (relies on row i of the DataFrame describing pfam_features[i])
        for i, feature in enumerate(pfam_features):
            feature.qualifiers[self.score_column] = ['{:.5f}'.format(pfam_sequence[self.score_column].iloc[i])]

        # Add score to all protein features
        for protein_id, score in protein_scores.items():
            proteins_by_id[protein_id].qualifiers[self.score_column] = ['{:.5f}'.format(score)]

        clusters = []         # finished clusters, each a list of protein features
        active_proteins = []  # proteins of the cluster currently being built
        gap_proteins = []     # inactive proteins seen since the last active one

        # Create a list of cluster features by merging consecutive proteins with score satisfying given threshold
        # Neighboring clusters within given number of nucleotides/proteins are merged
        for protein in protein_features:
            if self.score_column not in protein.qualifiers:
                # TODO: Should proteins with no Pfam domains also be considered?
                # Current protein did not have any Pfam domains, therefore it has no BGC score, ignore it
                continue
            # The score is read back from the 5-decimal string stored above
            score = float(protein.qualifiers[self.score_column][0])
            # Inactive protein, add to gap
            if score < self.score_threshold:
                gap_proteins.append(protein)
                # We just changed from active to inactive, add current list of active proteins as a cluster
                if active_proteins:
                    clusters.append(active_proteins)
                    active_proteins = []
            # Active protein
            else:
                # If no cluster is open, check if we should merge with the previous cluster
                if not active_proteins and clusters:
                    prev_cluster_proteins = clusters[-1]
                    prev_end = prev_cluster_proteins[-1].location.end
                    # Either criterion alone is enough to merge
                    if len(gap_proteins) <= self.merge_max_protein_gap or \
                            (protein.location.start - prev_end) <= self.merge_max_nucl_gap:
                        # Remove previous candidate and continue where it left off
                        clusters = clusters[:-1]
                        active_proteins = prev_cluster_proteins + gap_proteins

                # Add current protein to cluster
                active_proteins.append(protein)
                gap_proteins = []

        # Last protein was active, add list of active proteins as a cluster
        if active_proteins:
            clusters.append(active_proteins)

        # Add detected clusters as features
        record_num_detected = 0
        for cluster_proteins in clusters:
            start = cluster_proteins[0].location.start
            end = cluster_proteins[-1].location.end
            candidate_id = '{}_{}-{}.1'.format(record.id, int(start), int(end))

            if self.min_nucl > 1:
                nucl_length = end - start
                if nucl_length < self.min_nucl:
                    logging.debug('Skipping cluster %s with %s < %s nucleotides', candidate_id, nucl_length, self.min_nucl)
                    continue

            if self.min_proteins > 1:
                num_proteins = len(cluster_proteins)
                if num_proteins < self.min_proteins:
                    logging.debug('Skipping cluster %s with %s < %s proteins', candidate_id, num_proteins, self.min_proteins)
                    continue

            if self.min_domains > 1 or self.min_bio_domains > 0:
                # NOTE(review): both counts below are taken over the whole record
                # (pfam_features / Pfam IDs of ``record``), not over the proteins
                # of this cluster -- confirm whether a per-cluster count was intended.
                pfam_ids = util.get_pfam_feature_ids(record)
                num_domains = len(pfam_features)
                if num_domains < self.min_domains:
                    logging.debug('Skipping cluster %s with %s < %s protein domains', candidate_id, num_domains, self.min_domains)
                    continue
                num_bio_domains = len(util.filter_biosynthetic_pfam_ids(pfam_ids))
                if num_bio_domains < self.min_bio_domains:
                    logging.debug('Skipping cluster %s with %s < %s known biosynthetic protein domains', candidate_id, num_bio_domains, self.min_bio_domains)
                    continue

            # Cluster score is the mean of its proteins' scores, gap proteins included
            scores = [float(feature.qualifiers[self.score_column][0]) for feature in cluster_proteins]
            location = FeatureLocation(start, end)
            qualifiers = {
                self.score_column: ['{:.5f}'.format(np.mean(scores))],
                'detector': [self.detector_name],
                'detector_label': [self.detector_label],
                'detector_version': [self.model.version],
                'detector_version_timestamp': [self.model.timestamp],
                'product': ['{}_putative'.format(self.detector_name)],
                'bgc_candidate_id': [candidate_id]
            }
            record.features.append(SeqFeature(
                location=location,
                type="cluster",
                qualifiers=qualifiers
            ))
            record_num_detected += 1
            self.num_detected += 1

        # Sort all features by location
        util.sort_record_features(record)

        # Add detector metadata to the record as a structured comment
        if 'structured_comment' not in record.annotations:
            record.annotations['structured_comment'] = {}
        comment_key = util.format_detector_meta_key(self.detector_label)
        record.annotations['structured_comment'][comment_key] = collections.OrderedDict(
            name=self.detector_name,
            label=self.detector_label,
            version=self.model.version,
            version_timestamp=self.model.timestamp,
            detection_timestamp_utc=datetime.utcnow().isoformat(),
            score_threshold=self.score_threshold,
            merge_max_nucl_gap=self.merge_max_nucl_gap,
            merge_max_protein_gap=self.merge_max_protein_gap,
            min_proteins=self.min_proteins,
            min_domains=self.min_domains,
            min_bio_domains=self.min_bio_domains
        )
        logging.info('Detected %s BGCs using %s model in %s', record_num_detected, self.detector_label, record.id)
예제 #4
0
                sys.stderr.write(repr(parts) + "\n")
                raise
            flip = False
            if q_start > q_end:
                flip = not flip
                q_start, q_end = q_end, q_start
            if s_start > s_end:
                flip = not flip
                s_start, s_end = s_end, s_start
            if flip:
                c = colors.Color(0, 0, 1, alpha=0.25)
                b = False
            else:
                c = colors.Color(1, 0, 0, alpha=0.25)
                b = False
            q_feature = q_set.add_feature(SeqFeature(
                FeatureLocation(q_start - 1, q_end)),
                                          color=c,
                                          border=b)
            s_feature = s_set.add_feature(SeqFeature(
                FeatureLocation(s_start - 1, s_end)),
                                          color=c,
                                          border=b)
            gd_diagram.cross_track_links.append(
                CrossLink(q_feature, s_feature, c, b))
            # NOTE: We use one fixed colour per orientation (blue for flipped
            # matches, red otherwise), with transparency. This means overlaid
            # matches will appear darker.
            # It also means the drawing order is not very important.
            # Note that ACT puts long hits at the back, and colours by hit score.

print("Drawing CDS features...")
for f, format in genomes:
예제 #5
0
def convert_gbk(gb_dir,
                gb_out_dir,
                rodeo_output,
                bg_domains,
                max_intergenic_distance=100,
                product_class='thiopeptide'):
    """Convert a common genbank file to the genbank that mimics antiSMASH output.

    Adds a feature 'cluster' with information about the class of the product.
    The coordinates of this feature are boundaries of the group of adjacent genes on the same strand that includes RODEO query.
    Marks genes with given domains as biosynthetic.

    The input is ``<gb_dir>/<query>.gbk`` and the output
    ``<gb_out_dir>/<query>.gbk``, where ``<query>`` is ``rodeo_output.query``.
    Only the first record of the input file is converted.

    Parameters
    ----------
    gb_dir : str
        Directory with input genbank files.
    gb_out_dir : str
        Directory to store the output.
    rodeo_output: RodeoOutput
        RODEO output to use as a reference.
    bg_domains : list
        List of Pfam or TIGRFAMs IDs for domains that are important for your product biosynthesis.
    max_intergenic_distance : int, optional
        Maximum distance (nt) between genes within the biosynthetic gene cluster (default: 100).
    product_class : string, optional
        A putative class of the final product (default: thiopeptide).

    Returns
    -------
    bool
        True if successful, False otherwise (including when the input file
        cannot be read or contains no record).

    """
    import os

    rodeo_output.table_proccessing(bg_domains, max_intergenic_distance)
    operon_border_accs = (rodeo_output.operon_accs[0],
                          rodeo_output.operon_accs[-1])
    biosynthetic_genes = rodeo_output.biosynthetic_genes

    contig_edge = False
    prot_id = rodeo_output.query
    file_name = '%s.gbk' % prot_id
    try:
        # os.path.join gives the same path as plain concatenation when the
        # directory ends with a separator, and a correct one when it does not.
        with open(os.path.join(gb_dir, file_name)) as handle:
            # Every file is expected to contain only one record
            for record in SeqIO.parse(handle, 'genbank'):

                # Default cluster span; overwritten below when a border gene is found.
                cluster_coords = OrderedDict([('start', 1),
                                              ('end', len(record))])

                for feature in record.features:
                    if feature.type != 'CDS':
                        continue

                    # presumably returns a (key, coordinate) pair with key
                    # 'start' or 'end', or None -- verify against check_if_border
                    border_check = check_if_border(feature, operon_border_accs)
                    if border_check is not None:
                        cluster_coords[border_check[0]] = border_check[1]

                    if 'protein_id' in feature.qualifiers:
                        if feature.qualifiers['protein_id'][
                                0] in biosynthetic_genes:
                            feature.qualifiers['sec_met'] = [
                                'Kind: biosynthetic'
                            ]

                start, end = cluster_coords.values()
                cluster_location = FeatureLocation(start, end)
                cluster_qualifiers = OrderedDict([('contig_edge',
                                                   str(contig_edge)),
                                                  ('product', product_class)])
                cluster = SeqFeature(location=cluster_location,
                                     type='cluster',
                                     qualifiers=cluster_qualifiers)
                # The cluster feature is placed first in the feature table.
                record.features = [cluster] + record.features

                SeqIO.write(record, os.path.join(gb_out_dir, file_name),
                            'genbank')
                return True

    except Exception as e:
        # Best-effort contract: report the problem and signal failure.
        print(e)
        return False

    # The input file contained no record at all.
    return False
예제 #6
0
def mockup(features_list,
           write=False,
           pagesize="A4",
           scale_fontsize=3,
           label_size=2,
           greytrack_fontsize=7,
           x=0.05,
           y=0.01,
           track_size=0.2,
           track_names="",
           scale_ticks=False,
           format="linear",
           total_len=3000,
           output_path="/home/chymera/src/AutoTransGeno/output/test.pdf"):
    """Draw a GenomeDiagram with one track per entry of ``features_list``.

    Each track is a sequence of feature tuples laid out end to end:
    ``(name, length, strand, sigil)``. A tuple whose name is ``"skip"``
    only advances the position by ``length`` and draws nothing (its
    strand/sigil items are not read). "LoxP", "STOP" and "Restriction"
    features get fixed colours; every other feature takes the next colour
    of a repeating palette.

    ``track_names`` is indexed by track number; tracks without a name get an
    empty one. When ``write`` is true the diagram is saved as PDF to
    ``output_path``.

    Returns the GenomeDiagram.Diagram object.
    """
    palette = cycle([
        colors.orchid, colors.cornflower, colors.lightseagreen,
        colors.cornflower, colors.salmon
    ])
    fixed_colors = {
        "LoxP": colors.yellow,
        "STOP": colors.red,
        "Restriction": colors.chartreuse,
    }

    gdd = GenomeDiagram.Diagram('Construct Diagram',
                                x=x,
                                y=y,
                                track_size=track_size)

    for ix, track_info in enumerate(features_list):
        track_len = sum(item[1] for item in track_info)
        # The default track_names ("") has no entries; fall back to no name.
        track_name = track_names[ix] if ix < len(track_names) else ""
        track, features = new_track(gdd,
                                    " " + track_name,
                                    smalltick=10,
                                    scale_fontsize=scale_fontsize,
                                    greytrack_fontsize=greytrack_fontsize,
                                    scale_ticks=scale_ticks,
                                    end=track_len)
        feature_start = 0
        for feature_info in track_info:
            feature_name, feature_len = feature_info[0], feature_info[1]
            feature_end = feature_start + feature_len
            if feature_name != "skip":
                feature = SeqFeature(FeatureLocation(feature_start, feature_end),
                                     strand=feature_info[2])
                if feature_name in fixed_colors:
                    feature_color = fixed_colors[feature_name]
                else:
                    # next() works on Python 2.6+ and 3; .next() is Python 2 only.
                    feature_color = next(palette)
                features.add_feature(feature,
                                     name=feature_name,
                                     label=True,
                                     color=feature_color,
                                     label_size=label_size,
                                     label_color=feature_color,
                                     label_angle=30,
                                     sigil=feature_info[3],
                                     arrowshaft_height=1)
            feature_start = feature_end

    gdd.draw(format=format,
             pagesize=pagesize,
             fragments=1,
             start=0,
             end=total_len)
    if write:
        gdd.write(output_path, "PDF")

    return gdd
예제 #7
0
    def test_reverse_complement_seq(self):
        """Check each optional argument of SeqRecord.reverse_complement.

        Every attribute is checked twice: once with ``True`` passed for it
        and once with an explicit replacement value.
        """
        site_repr = (
            "[SeqFeature(FeatureLocation(ExactPosition(1), ExactPosition(4)),"
            " type='Site')]"
        )
        record = SeqRecord(
            Seq("ACTG"),
            id="TestID",
            name="TestName",
            description="TestDescription",
            dbxrefs=["TestDbxrefs"],
            features=[SeqFeature(FeatureLocation(0, 3), type="Site")],
            annotations={"organism": "bombyx"},
            letter_annotations={"test": "abcd"},
        )
        keep_everything = dict.fromkeys(
            (
                "id",
                "name",
                "description",
                "dbxrefs",
                "features",
                "annotations",
                "letter_annotations",
            ),
            True,
        )
        flipped = record.reverse_complement(**keep_everything)

        self.assertEqual("CAGT", str(flipped.seq))

        # Plain string attributes: kept with True, replaced when given explicitly.
        for attr, value in (
            ("id", "TestID"),
            ("name", "TestName"),
            ("description", "TestDescription"),
        ):
            self.assertEqual(value, getattr(flipped, attr))
            replaced = record.reverse_complement(**{attr: value})
            self.assertEqual(value, getattr(replaced, attr))

        self.assertEqual(["TestDbxrefs"], flipped.dbxrefs)
        with_dbxrefs = record.reverse_complement(dbxrefs=["TestDbxrefs"])
        self.assertEqual(["TestDbxrefs"], with_dbxrefs.dbxrefs)

        # The feature at 0..3 is expected at 1..4 on the reverse complement.
        self.assertEqual(site_repr, repr(flipped.features))
        with_features = record.reverse_complement(
            features=[SeqFeature(FeatureLocation(1, 4), type="Site")]
        )
        self.assertEqual(site_repr, repr(with_features.features))

        self.assertEqual({"organism": "bombyx"}, flipped.annotations)
        with_annotations = record.reverse_complement(
            annotations={"organism": "bombyx"}
        )
        self.assertEqual({"organism": "bombyx"}, with_annotations.annotations)

        # Per-letter annotations are reversed with True, taken as-is when supplied.
        self.assertEqual({"test": "dcba"}, flipped.letter_annotations)
        with_letters = record.reverse_complement(
            letter_annotations={"test": "abcd"}
        )
        self.assertEqual({"test": "abcd"}, with_letters.letter_annotations)
예제 #8
0
    i += 1

# Track holding the two gene feature sets built earlier in the script;
# drawn without a grey background and without a scale.
genes_track = GenomeDiagram.Track('genes', greytrack=False, scale=False)

genes_track.add_set(feature_set1)
genes_track.add_set(feature_set2)

#%%
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Load the SNV table and take its third column as the list of positions.
# NOTE(review): presumably this column holds genomic coordinates -- verify
# against the CSV header.
snv_df = pd.read_csv(tab_dir + '/SNV_HUMAN_YFV_RESULTS.csv')
snv_series = snv_df.iloc[:, 2]
feature_set_SNV = GenomeDiagram.FeatureSet()

# One red feature per SNV position.
# NOTE(review): start == end gives a zero-length location, and the feature is
# created with strand=+1 but added with strand=None -- confirm both are intended.
for position in snv_series:
    snv = SeqFeature(FeatureLocation(position, position), strand=+1)
    feature_set_SNV.add_feature(snv, color='red', strand=None)

# Track for the SNV features; unlike the genes track it draws a scale, with
# large ticks every 5000 and small ticks every 1000 positions.
SNV_track = GenomeDiagram.Track('SNV',
                                greytrack=False,
                                scale=True,
                                scale_format='SInt',
                                scale_fontsize=10,
                                scale_fontangle=90,
                                scale_largetick_interval=5000,
                                scale_smalltick_interval=1000,
                                scale_largeticks=0.5,
                                scale_smallticks=0.2)

SNV_track.add_set(feature_set_SNV)
예제 #9
0
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
    """Convert BLAST XML output into SeqRecords carrying match features.

    Every BLAST record becomes one ``SeqRecord`` (with a placeholder
    sequence) whose id is the query name.  Every HSP of every hit becomes a
    top-level ``match`` feature; the gapped pieces reported by
    ``generate_parts`` are attached to it as ``match_part`` entries in
    ``sub_features``.

    Args:
        blastxml: BLAST XML input, passed unchanged to ``NCBIXML.parse``.
        min_gap: Forwarded to ``generate_parts`` as ``ignore_under``.
        trim: When true, clamp a parent start below 1 to 0 and clamp the
            parent end to just past the query end; match parts are then
            positioned relative to the parent start.
        trim_end: When true, clamp only the parent end.

    Returns:
        A list of ``SeqRecord`` objects, one per BLAST record.
    """
    # ``copy`` is imported locally, like the Biopython names below, so the
    # function does not depend on a module-level import being present.
    import copy

    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP; anything else is a
    # generic ``match``.
    match_types = {
        'BLASTN': 'nucleotide_match',
        'BLASTP': 'protein_match',
    }

    records = []
    for record in NCBIXML.parse(blastxml):
        match_type = match_types.get(record.application, 'match')

        rec = SeqRecord(Seq("ACTG"), id=record.query)
        for hit in record.alignments:
            for hsp in hit.hsps:
                # A fresh list per HSP so features never share one object.
                hit_titles = hit.title.split(' >')
                qualifiers = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit_titles,
                }
                # The description is everything from the first space of the
                # first title onwards (leading space kept).  A title without
                # any space yields an empty description instead of raising
                # ValueError.
                _, sep, remainder = hit_titles[0].partition(' ')
                qualifiers['description'] = sep + remainder

                # This required a fair bit of sketching out/math to figure
                # out the first time.
                #
                # The match_start location must account for queries and
                # subjects that start at locations other than 1.
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the
                # supplied subject/hit length to calculate the real ending of
                # the target protein.
                parent_match_end = (hsp.query_start + hit.length +
                                    hsp.query.count('-'))

                # However, if the user requests that we trim the feature,
                # then we need to cut the ``match`` start to 0 to match the
                # parent feature.  We'll also need to cut the end to match
                # the query's end.  It (maybe) should be the feature end?
                # But we don't have access to that data, so we settle for
                # this.
                if trim and parent_match_start < 1:
                    parent_match_start = 0

                if (trim or trim_end) and parent_match_end > hsp.query_end:
                    parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(parent_match_start, parent_match_end),
                    type=match_type,
                    strand=0,
                    qualifiers=qualifiers)

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {
                    "source": "blast",
                }
                top_feature.sub_features = []
                for start, end, cigar in generate_parts(hsp.query,
                                                        hsp.match,
                                                        hsp.sbjct,
                                                        ignore_under=min_gap):
                    part_qualifiers['Gap'] = cigar
                    part_qualifiers['ID'] = hit.hit_id

                    if trim:
                        # If trimming, then we start relative to the
                        # match's start
                        match_part_start = parent_match_start + start
                    else:
                        # Otherwise, we have to account for the subject
                        # start's location
                        match_part_start = (parent_match_start +
                                            hsp.sbjct_start + start - 1)

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence.
                    #
                    # Furthermore align_length will give calculation errors
                    # in weird places, so we just use (end - start) for
                    # simplicity.
                    match_part_end = match_part_start + (end - start)

                    # Each part gets its own copy because part_qualifiers is
                    # reused and overwritten on the next iteration.
                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=copy.deepcopy(part_qualifiers)))

                rec.features.append(top_feature)
        rec.annotations = {}
        records.append(rec)
    return records
예제 #10
0
파일: TermGen.py 프로젝트: kaizu/SigmoID
                        feature].location.start and strand != gene_list[
                            feature].strand:
                    break

        if gene != True:
            qualifiers['gene'] = gene
            qualifiers['note'] = 'TransTerm HP conf=%s, tail_score=%s' % (
                conf, tail_score)
        else:
            qualifiers['note'] = 'TransTerm HP conf=%s, tail_score=%s' % (
                conf, tail_score)
        qualifiers['regulatory_class'] = 'terminator'
        qualifiers['TermGen_check'] = ['Checked!']
        feature_location = FeatureLocation(start, end)
        my_feature = SeqFeature(location=feature_location,
                                type='regulatory',
                                strand=strand,
                                qualifiers=qualifiers)

        # adding terminators to genbank features list
        for i in reversed(xrange(len(record.features))):
            if record.features[i].location.start < start:
                record.features.insert(i + 1, my_feature)
                break

    # editing features to add U-tail
    new_features_list = []
    for feature in record.features:
        if feature.type == 'regulatory' and feature.qualifiers.has_key(
                'TermGen_check'):
            u_tail = 0
            tail = ''
예제 #11
0
# For each recognised extradata storage key: the option flag that is switched
# on, and the attributes copied (in this order) from the stored result object
# onto the sequence record.
_BLAST_STORAGE_ATTRS = {
    'ClusterBlastData': ('clusterblast', (
        'internalhomologygroupsdict',
        'known_compound_dict',
        'nrhitgeneclusters',
        'qgeneclusterdata',
        'queryclusterdata',
        'pubchem_dict',
        'pubmed_dict',
    )),
    'SubClusterBlastData': ('subclusterblast', (
        'internalhomologygroupsdict',
        'sc_nrhitgeneclusters',
        'sc_queryclusterdata',
        'pubchem_dict',
        'pubmed_dict',
    )),
    'KnownClusterBlastData': ('knownclusterblast', (
        'internalhomologygroupsdict',
        'kc_nrhitgeneclusters',
        'kc_queryclusterdata',
        'pubchem_dict',
        'pubmed_dict',
    )),
}


def prepare_visualization(options, seq_record):
    """Attach stored (Sub/Known)ClusterBlast results to *seq_record*.

    A ``source`` feature spanning the whole sequence is appended when the
    record has none.  If ``options`` carries an ``extrarecord`` entry for this
    record id, every recognised storage object in its ``extradata`` switches
    on the matching option flag and has its result attributes copied onto the
    record.  ``load_genecluster_info`` is then called, but only when
    ``options`` contains ``extrarecord`` at all.

    Args:
        options: Run options; must support ``in`` tests and attribute access.
        seq_record: The record to update in place.
    """
    # Check whether (Sub)ClusterBlast data is encoded in a source feature;
    # create a full-length source feature when the record has none.
    if not utils.get_all_features_of_type(seq_record, 'source'):
        loc = FeatureLocation(0, len(seq_record.seq))
        seq_record.features.append(SeqFeature(loc, type="source"))

    if 'extrarecord' in options:
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if seq_record.id in options.extrarecord:
            # As there is only one source feature per record we just can take
            # the first one without cycling through all features
            extradata = options.extrarecord[seq_record.id].extradata
            for key in extradata.keys():
                if key not in _BLAST_STORAGE_ATTRS:
                    # Other storage objects are ignored here.
                    continue
                flag_name, attr_names = _BLAST_STORAGE_ATTRS[key]
                logging.debug(
                    "prepare_visualization: Found %s storage object", key)
                setattr(options, flag_name, True)

                results = extradata[key]
                for attr_name in attr_names:
                    setattr(seq_record, attr_name,
                            getattr(results, attr_name))

        # NOTE(review): earlier revisions also called
        # load_pubmed_pubchem_links and the load_*clusterblast_outputdata
        # loaders at this point; those calls were already commented out.
        # This call sits inside the 'extrarecord' check -- confirm that is
        # intended.
        load_genecluster_info(seq_record, options)
예제 #12
0
                    if label in f.qualifiers.get("label", []))

        def get_features_from_note(note):
            """Return a generator over the features whose "note" qualifier
            (an empty list when absent) contains *note*."""
            matching = (
                feat
                for feat in features
                if note in feat.qualifiers.get("note", [])
            )
            return matching

        # Add direct BsaI site
        features.append(
            SeqFeature(
                type="protein_bind",
                qualifiers={
                    # "label": ["BsaI"],
                    "bound_moiety": ["BsaI"],
                    "note": ["color: #ff0000; direction: RIGHT"],
                    # "note": ["This forward directional feature has 2 segments:\n"
                    #          "1: 3 ..  8 / #ff0000\n"
                    #          "2: 10 .. 13 / #ff0000\n"]
                },
                location=CompoundLocation([
                    FeatureLocation(2, 8, strand=1),
                    FeatureLocation(9, 13, strand=1)
                ]),
            ))

        # Add reversed BsaI site
        BsaI_site_r = Seq(BsaI.site).reverse_complement()
        pos = (gba.seq + gba.seq).find(BsaI_site_r)
        features.append(
            SeqFeature(
                type="protein_bind",
                qualifiers={
예제 #13
0
        for record in SeqIO.parse(fh, "fasta"):
            sdf = pd.DataFrame({
                'length': len(record),
                'ID': record.id
            },
                               index=[record.id])
            df = df.append(sdf)

    df.length = df.length.astype(int)
    max_len = df.length.max()

    # Draw the ladder chromosome used as the scale

    lad_rec = SeqRecord(RandomSeq(scale_max),
                        "ladder_" + str(scale_max) + " bp")
    lad_feature = SeqFeature()
    lad_feature_s = []

    for i in range(1, scale_split, 2):
        region_unit = int(scale_max / scale_split)
        tmp_featur = SeqFeature(FeatureLocation(region_unit * i,
                                                region_unit * (i + 1)),
                                type="gene",
                                strand=1)
        tmp_start = region_unit * i
        tmp_featur.qualifiers["locus_tag"] = str(tmp_start)
        lad_feature_s.append(tmp_featur)

    lad_rec.features = lad_feature_s

    cur_chromosome = BasicChromosome.Chromosome(lad_rec.id)
예제 #14
0
# We add dummy features to the tracks for each cross-link BEFORE we add the
# arrow features for the genes. This ensures the genes appear on top:
#
# Each tuple names two records (X, Y) and the list of links between them.
for X, Y, X_vs_Y in [("NC_002703", "AF323668", A_vs_B),
                     ("AF323668", "NC_003212", B_vs_C)]:
    features_X = records[X].features
    features_Y = records[Y].features
    set_X = feature_sets[X]
    set_Y = feature_sets[Y]
    # Every link is (score, identifier in X, identifier in Y).
    for score, x, y in X_vs_Y:
        # Shade by score on a 0-100 scale running from white to firebrick.
        color = colors.linearlyInterpolatedColor(colors.white,
                                                 colors.firebrick, 0, 100,
                                                 score)
        border = colors.lightgrey
        # Look up the real feature, then add a strandless dummy feature with
        # the same start/end to the track's feature set.
        f_x = get_feature(features_X, x)
        F_x = set_X.add_feature(SeqFeature(
            FeatureLocation(f_x.location.start, f_x.location.end, strand=0)),
                                color=color,
                                border=border)
        f_y = get_feature(features_Y, y)
        F_y = set_Y.add_feature(SeqFeature(
            FeatureLocation(f_y.location.start, f_y.location.end, strand=0)),
                                color=color,
                                border=border)
        # Link the two dummy features across the tracks.
        gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))

for record, gene_colors in zip([A_rec, B_rec, C_rec],
                               [A_colors, B_colors, C_colors]):
    gd_feature_set = feature_sets[record.name]

    i = 0
    for feature in record.features:
예제 #15
0
    def json2seqrecord(self, json_record):
        """Build a SeqRecord from one UniProtKB entry given as parsed JSON.

        The record id is the primary accession and the description is the
        recommended (or, failing that, first submitted) full name.  Gene
        names and synonyms, alternative short names, cross references,
        accessions and EC numbers are collected as de-duplicated ``dbxrefs``;
        entry features become ``SeqFeature`` objects.

        Side effect: ``self.dbx_dict`` gains one entry per cross reference
        (a dict of its properties) and one per EC number.

        Args:
            json_record: dict for a single entry; must contain
                ``primaryAccession``, ``uniProtkbId``, ``proteinDescription``,
                ``sequence`` and ``uniProtKBCrossReferences``.

        Returns:
            The populated ``SeqRecord``.
        """
        # Expansion of the one-letter GO aspect prefix of a "GoTerm" value.
        # NOTE(review): UniProt appears to emit upper-case prefixes (P/F/C);
        # if so these lower-case keys never match -- confirm against real
        # data before widening the mapping.
        go_aspects = {
            "b": "biological_process",
            "c": "cellular_component",
            "f": "molecular_function",
        }

        uid = json_record["primaryAccession"]
        protein = json_record["proteinDescription"]

        if "recommendedName" in protein:
            recommended = protein["recommendedName"]
            desc = recommended["fullName"]["value"]
            ecs = [x["value"] for x in recommended.get("ecNumbers", [])]
        else:
            desc = protein["submissionNames"][0]["fullName"]["value"]
            ecs = []

        # EC numbers of the components listed under "contains"; a component
        # without a recommended name simply contributes none.
        for component in protein.get("contains", []):
            component_name = component.get("recommendedName", {})
            ecs += [x["value"] for x in component_name.get("ecNumbers", [])]

        r = SeqRecord(id=uid, name="", description=desc,
                      seq=Seq(json_record["sequence"]["value"]))

        for gene in json_record.get("genes", []):
            if "geneName" in gene:
                r.dbxrefs.append("UnipGene:" + gene["geneName"]["value"])
            # Synonyms belong to the individual gene entry, not to the
            # enclosing "genes" list.
            for syn in gene.get("synonyms", []):
                r.dbxrefs.append("UnipGene:" + syn["value"])

        for alt_name in protein.get("alternativeNames", []):
            for short_name in alt_name.get("shortNames", []):
                r.dbxrefs.append("UnipName:" + short_name["value"])

        for ref in json_record["uniProtKBCrossReferences"]:
            # Prefix the id with its database unless it already carries it.
            prefix = ref["database"] + ":"
            dbx = ref["id"] if prefix in ref["id"] else prefix + ref["id"]
            r.dbxrefs.append(dbx)
            props = {x["key"]: x["value"] for x in ref["properties"]}
            self.dbx_dict[dbx] = props
            if ref["database"] == "GO":
                # "GoTerm" is "<aspect>:<term>"; store the term alone and the
                # aspect (expanded when recognised) under "database".
                aspect, _, term = props["GoTerm"].partition(":")
                props["GoTerm"] = term
                props["database"] = go_aspects.get(aspect, aspect)

        accessions = list(json_record.get("secondaryAccessions", []))
        accessions += [json_record["uniProtkbId"],
                       json_record["primaryAccession"]]
        for acc in accessions:
            r.dbxrefs.append("UnipAcc:" + acc.replace(" ", "_"))

        for ec_number in ecs:
            dbx = "EC:" + ec_number
            r.dbxrefs.append(dbx)
            self.dbx_dict[dbx] = [["description", desc]]

        # Drop duplicates; the resulting order is not defined.
        r.dbxrefs = list(set(r.dbxrefs))

        for feat in json_record.get("features", []):
            # NOTE(review): the JSON start/end values are passed through
            # unchanged -- confirm whether a 1-based to 0-based shift of the
            # start is needed.
            location = FeatureLocation(start=feat["location"]["start"]["value"],
                                       end=feat["location"]["end"]["value"])

            # Keep the description only when it is more than dashes/blanks.
            if feat["description"].replace("-", "").strip():
                qual = {"description": feat["description"]}
            else:
                qual = {}
            if "featureId" in feat:
                qual["featureId"] = feat["featureId"]
            r.features.append(
                SeqFeature(type=feat["type"], location=location,
                           qualifiers=qual))
        return r
예제 #16
0
    def check_simple_tRNA(self, filename, use_seqfeatures=False):
        f1 = [(111889, 111961, -1, 'G01270'), (306383, 306456, 1, 'G01870'), (309274, 309347, -1, 'G01890'), (515493, 515566, 1, 'G02480'), (552639, 552711, 1, 'G02600'), (604401, 604474, 1, 'G02760'), (877648, 877720, 1, 'G03515'), (892513, 892585, 1, 'G03570'), (909809, 909882, -1, 'G03640'), (1159021, 1159092, 1, 'G04320'), (1324921, 1324959, 1, 'G04720'), (1583770, 1583844, -1, 'G05390'), (1817398, 1817470, 1, 'G05980'), (1978082, 1978156, 1, 'G06480'), (2025354, 2025427, 1, 'G06610'), (2107396, 2107467, -1, 'G06860'), (2111146, 2111217, -1, 'G06880'), (2177883, 2177957, 1, 'G07100'), (2334818, 2334891, 1, 'G07580'), (2406830, 2406902, -1, 'G07760'), (2588521, 2588593, 1, 'G08240'), (2846538, 2846611, -1, 'G08870'), (2879305, 2879377, 1, 'G08950'), (2939418, 2939490, 1, 'G09110'), (3431185, 3431257, -1, 'G10440'), (3676606, 3676644, 1, 'G11010'), (3678774, 3678848, -1, 'G11030'), (3881528, 3881608, 1, 'G11550'), (3914628, 3914700, -1, 'G11640'), (4266985, 4267059, -1, 'G12510'), (4285884, 4285956, -1, 'G12590'), (4440211, 4440284, 1, 'G13010'), (4522705, 4522779, -1, 'G13240'), (4709631, 4709703, 1, 'G13720'), (4741995, 4742068, 1, 'G13840'), (4743091, 4743164, 1, 'G13850'), (5189681, 5189755, -1, 'G15090'), (5309641, 5309713, -1, 'G15450'), (5380901, 5380983, 1, 'G15650'), (5518055, 5518128, -1, 'G16100'), (5619464, 5619537, -1, 'G16450'), (6038749, 6038831, 1, 'G17570'), (6075812, 6075884, 1, 'G17660'), (6075937, 6076011, -1, 'G17670'), (6345756, 6345828, 1, 'G18430'), (6488645, 6488726, 1, 'G18820'), (6948850, 6948934, -1, 'G20040'), (6995272, 6995344, -1, 'G20170'), (7004504, 7004576, 1, 'G20210'), (7016506, 7016579, 1, 'G20250'), (7082657, 7082729, 1, 'G20420'), (7242749, 7242821, -1, 'G20820'), (7499721, 7499793, -1, 'G21420'), (7656108, 7656180, -1, 'G21800'), (7884405, 7884443, -1, 'G22320'), (8520278, 8520352, -1, 'G24080'), (9143796, 9143870, 1, 'G26430'), (9158169, 9158242, 1, 'G26490'), (10089422, 10089494, 1, 'G28720'), (10089883, 10089955, 1, 
'G28730'), (10090353, 10090425, 1, 'G28740'), (10090754, 10090826, 1, 'G28750'), (10092310, 10092382, 1, 'G28770'), (10092786, 10092858, 1, 'G28780'), (10093294, 10093366, 1, 'G28790'), (10093731, 10093803, 1, 'G28800'), (10094158, 10094230, 1, 'G28810'), (10096936, 10097008, 1, 'G28820'), (10097099, 10097171, 1, 'G28830'), (10097703, 10097775, 1, 'G28840'), (10098638, 10098710, 1, 'G28850'), (10099064, 10099136, 1, 'G28860'), (10099410, 10099482, 1, 'G28870'), (10099812, 10099884, 1, 'G28880'), (10100258, 10100330, 1, 'G28890'), (10101013, 10101085, 1, 'G28900'), (10101585, 10101657, 1, 'G28910'), (10101978, 10102050, 1, 'G28920'), (10106075, 10106147, 1, 'G28930'), (10106513, 10106585, 1, 'G28940'), (10106883, 10106955, 1, 'G28950'), (10107634, 10107706, 1, 'G28970'), (10108374, 10108446, 1, 'G28980'), (10108695, 10108767, 1, 'G28990'), (10207291, 10207364, -1, 'G29210'), (10756703, 10756776, 1, 'G30430'), (10963553, 10963627, -1, 'G30830'), (11104093, 11104167, 1, 'G31110'), (11797227, 11797265, -1, 'G32620'), (12097258, 12097327, -1, 'G33370'), (13687637, 13687710, 1, 'G36350'), (15733055, 15733127, -1, 'G42120'), (16588144, 16588216, -1, 'G43820'), (17159046, 17159118, 1, 'G45234'), (17159799, 17159871, 1, 'G45236'), (17160970, 17161042, 1, 'G45238'), (17161418, 17161490, 1, 'G45240'), (17162967, 17163039, 1, 'G45242'), (17163408, 17163480, 1, 'G45244'), (17164461, 17164533, 1, 'G45246'), (17735509, 17735582, 1, 'G48080'), (18139265, 18139337, -1, 'G49020'), (18234146, 18234220, -1, 'G49280'), (18312570, 18312607, 1, 'G49460'), (18391469, 18391542, 1, 'G49690'), (18556666, 18556746, 1, 'G50070'), (18561567, 18561647, 1, 'G50100'), (19428223, 19428297, 1, 'G52170'), (19502087, 19502161, -1, 'G52350'), (19688850, 19688887, -1, 'G52860'), (19851640, 19851714, 1, 'G53220'), (19929506, 19929578, -1, 'G53410'), (20416594, 20416667, -1, 'G54670'), (20794976, 20795058, 1, 'G55625'), (21272451, 21272533, 1, 'G56730'), (21272786, 21272823, 1, 'G56740'), (21273216, 
21273253, 1, 'G56750'), (21273960, 21274042, 1, 'G56760'), (21274295, 21274332, 1, 'G56770'), (21274725, 21274762, 1, 'G56780'), (21275469, 21275551, 1, 'G56790'), (21275804, 21275841, 1, 'G56800'), (21276234, 21276271, 1, 'G56810'), (21276978, 21277060, 1, 'G56820'), (21277313, 21277350, 1, 'G56830'), (21277743, 21277780, 1, 'G56840'), (21278487, 21278569, 1, 'G56850'), (21278822, 21278859, 1, 'G56860'), (21279273, 21279310, 1, 'G56870'), (21280016, 21280098, 1, 'G56880'), (21280351, 21280388, 1, 'G56890'), (21280781, 21280818, 1, 'G56900'), (21281525, 21281607, 1, 'G56910'), (21281860, 21281897, 1, 'G56920'), (21282311, 21282348, 1, 'G56930'), (21283054, 21283136, 1, 'G56940'), (21283384, 21283421, 1, 'G56950'), (21283842, 21283879, 1, 'G56960'), (21284586, 21284668, 1, 'G56970'), (21284916, 21284953, 1, 'G56980'), (21285374, 21285411, 1, 'G56990'), (21286118, 21286200, 1, 'G57000'), (21286448, 21286485, 1, 'G57010'), (21286906, 21286943, 1, 'G57020'), (21287650, 21287732, 1, 'G57030'), (21287980, 21288017, 1, 'G57040'), (21288438, 21288475, 1, 'G57050'), (21289183, 21289265, 1, 'G57060'), (21289513, 21289550, 1, 'G57070'), (21289970, 21290007, 1, 'G57080'), (21290714, 21290796, 1, 'G57090'), (21291044, 21291081, 1, 'G57100'), (21291501, 21291538, 1, 'G57110'), (21292245, 21292327, 1, 'G57120'), (21292574, 21292611, 1, 'G57130'), (21293032, 21293069, 1, 'G57140'), (21293776, 21293858, 1, 'G57150'), (21294109, 21294146, 1, 'G57160'), (21294567, 21294604, 1, 'G57170'), (21295125, 21295207, 1, 'G57180'), (21295455, 21295492, 1, 'G57190'), (21295912, 21295949, 1, 'G57200'), (21296656, 21296738, 1, 'G57210'), (21296989, 21297026, 1, 'G57220'), (21297447, 21297484, 1, 'G57230'), (21298005, 21298087, 1, 'G57240'), (21298335, 21298372, 1, 'G57250'), (21298792, 21298829, 1, 'G57260'), (21299536, 21299618, 1, 'G57270'), (21299869, 21299906, 1, 'G57280'), (21300327, 21300364, 1, 'G57290'), (21300885, 21300967, 1, 'G57300'), (21301215, 21301252, 1, 'G57310'), (21301673, 
21301710, 1, 'G57320'), (21302417, 21302499, 1, 'G57330'), (21302750, 21302787, 1, 'G57340'), (21303208, 21303245, 1, 'G57350'), (21303766, 21303848, 1, 'G57360'), (21304096, 21304133, 1, 'G57370'), (21304554, 21304591, 1, 'G57380'), (21305298, 21305380, 1, 'G57390'), (21305631, 21305668, 1, 'G57400'), (21306089, 21306126, 1, 'G57410'), (21306647, 21306729, 1, 'G57420'), (21306981, 21307018, 1, 'G57430'), (21307441, 21307478, 1, 'G57440'), (21308184, 21308268, 1, 'G57450'), (21308520, 21308557, 1, 'G57460'), (21308975, 21309012, 1, 'G57470'), (21309719, 21309801, 1, 'G57480'), (21310053, 21310090, 1, 'G57490'), (21310513, 21310550, 1, 'G57500'), (21311256, 21311340, 1, 'G57510'), (21311592, 21311629, 1, 'G57520'), (21312051, 21312088, 1, 'G57530'), (21377983, 21378054, -1, 'G57710'), (21887507, 21887589, -1, 'G59570'), (22044276, 22044348, -1, 'G59880'), (22317078, 22317149, -1, 'G60580'), (22398301, 22398372, -1, 'G60820'), (22401256, 22401327, -1, 'G60840'), (22431831, 22431902, 1, 'G60910'), (22481437, 22481511, -1, 'G61020'), (22870422, 22870494, -1, 'G61880'), (22890754, 22890834, 1, 'G61910'), (23562849, 23562921, -1, 'G63510'), (23671147, 23671219, -1, 'G63790'), (23806215, 23806299, 1, 'G64120'), (23936799, 23936872, 1, 'G64420'), (24490654, 24490736, -1, 'G65830'), (25833316, 25833388, 1, 'G68770'), (25890198, 25890272, 1, 'G68860'), (25931858, 25931931, 1, 'G68950'), (25935739, 25935812, -1, 'G68970'), (25944826, 25944898, 1, 'G69000'), (25993392, 25993466, 1, 'G69130'), (26053140, 26053214, 1, 'G69300'), (26385816, 26385888, -1, 'G70050'), (26977050, 26977121, 1, 'G71700'), (27397046, 27397128, 1, 'G72780'), (27792643, 27792715, 1, 'G73900'), (28024043, 28024124, -1, 'G74570'), (28031620, 28031701, 1, 'G74610'), (28188192, 28188264, 1, 'G75070'), (28377149, 28377222, -1, 'G75570'), (28411644, 28411717, 1, 'G75650'), (28444549, 28444621, 1, 'G75740'), (28523645, 28523717, -1, 'G75970'), (28531427, 28531499, 1, 'G76000'), (28639585, 28639667, 1, 'G76330'), 
(28952447, 28952519, -1, 'G77040'), (29007098, 29007180, -1, 'G77190'), (29147983, 29148055, -1, 'G77560'), (29448865, 29448903, -1, 'G78250'), (29809015, 29809088, 1, 'G79240'), (29838009, 29838081, 1, 'G79290'), (29838610, 29838682, 1, 'G79300'), (30088888, 30088962, -1, 'G79980'), (30178905, 30178977, -1, 'G80250'), (30242675, 30242757, 1, 'G80430')]
        f2 = [(102063, 102137, 1, 'G01160'), (706794, 706867, 1, 'G02600'), (846853, 846926, -1, 'G02900'), (1054714, 1054787, -1, 'G03490'), (1113980, 1114052, -1, 'G03660'), (1123386, 1123458, -1, 'G03700'), (1154381, 1154454, 1, 'G03790'), (3239653, 3239725, -1, 'G07742'), (3255828, 3255902, -1, 'G07743'), (3268803, 3268883, 1, 'G07745'), (3276436, 3276508, 1, 'G07746'), (3280859, 3280933, 1, 'G07748'), (3290962, 3291034, 1, 'G07778'), (3303240, 3303312, -1, 'G07752'), (3303350, 3303425, -1, 'G07753'), (3303781, 3303819, -1, 'G07754'), (3328666, 3328739, -1, 'G07755'), (3332674, 3332756, 1, 'G07792'), (3369350, 3369437, 1, 'G07793'), (3383400, 3383474, -1, 'G07794'), (3444359, 3444431, -1, 'G07756'), (3452973, 3453060, 1, 'G07757'), (3462074, 3462148, 1, 'G07758'), (3494378, 3494416, 1, 'G07759'), (3494772, 3494847, 1, 'G07761'), (3495008, 3495083, 1, 'G07762'), (3495438, 3495509, 1, 'G07763'), (3496436, 3496508, 1, 'G07764'), (3497354, 3497437, 1, 'G07765'), (3503518, 3503605, 1, 'G07766'), (6953924, 6953961, -1, 'G15950'), (7046175, 7046247, 1, 'G16240'), (7749793, 7749867, 1, 'G17810'), (7962758, 7962832, -1, 'G18310'), (9144435, 9144507, 1, 'G21360'), (9241319, 9241356, -1, 'G21570'), (9273888, 9273969, -1, 'G21670'), (9277742, 9277814, -1, 'G21700'), (9291113, 9291185, 1, 'G21760'), (9400749, 9400823, 1, 'G22110'), (9456888, 9456962, -1, 'G22220'), (9472660, 9472733, -1, 'G22280'), (9509359, 9509433, 1, 'G22380'), (9598106, 9598179, 1, 'G22580'), (9810296, 9810368, -1, 'G23020'), (10066525, 10066597, -1, 'G23650'), (10380655, 10380728, 1, 'G24380'), (10820917, 10820990, 1, 'G25400'), (11122756, 11122837, -1, 'G26090'), (11781928, 11782000, -1, 'G27560'), (11871230, 11871302, -1, 'G27850'), (12336079, 12336151, 1, 'G28730'), (12346827, 12346899, 1, 'G28770'), (12478849, 12478921, -1, 'G29030'), (12645232, 12645305, -1, 'G29520'), (12888667, 12888738, 1, 'G30180'), (12889810, 12889881, 1, 'G30190'), (12983024, 12983095, -1, 'G30450'), (13144312, 13144385, -1, 
'G30850'), (13658350, 13658425, 1, 'G32110'), (14054465, 14054503, -1, 'G33140'), (14250206, 14250278, 1, 'G33650'), (14251774, 14251846, 1, 'G33660'), (14357464, 14357536, 1, 'G33890'), (14358437, 14358509, 1, 'G33900'), (14359269, 14359341, 1, 'G33910'), (14360221, 14360293, 1, 'G33920'), (14360734, 14360806, 1, 'G33930'), (14361176, 14361248, 1, 'G33940'), (14362215, 14362287, 1, 'G33950'), (14363133, 14363205, 1, 'G33960'), (14363599, 14363671, 1, 'G33970'), (14750553, 14750627, -1, 'G34950'), (14757142, 14757213, 1, 'G34985'), (14847685, 14847723, 1, 'G35220'), (15175940, 15176014, 1, 'G36140'), (15176656, 15176736, 1, 'G36150'), (15215480, 15215517, -1, 'G36280'), (15327312, 15327395, 1, 'G36510'), (15327463, 15327546, -1, 'G36520'), (15353238, 15353311, 1, 'G36600'), (15477287, 15477324, -1, 'G36860'), (15923894, 15923967, 1, 'G38030'), (16525641, 16525713, -1, 'G39600'), (16525846, 16525918, 1, 'G39610'), (16646857, 16646929, -1, 'G39860'), (17545780, 17545862, -1, 'G42020'), (17667855, 17667926, 1, 'G42420'), (17880766, 17880839, 1, 'G42970'), (18002649, 18002721, -1, 'G43300'), (18317052, 18317134, -1, 'G44320'), (18576985, 18577058, 1, 'G45020'), (18710751, 18710824, 1, 'G45390'), (18963713, 18963786, 1, 'G46120'), (19351496, 19351569, 1, 'G47100'), (19566924, 19566995, -1, 'G47740')]
        f3 = [(259640, 259712, 1, 'G01705'), (469666, 469740, 1, 'G02315'), (476808, 476880, 1, 'G02335'), (586092, 586174, 1, 'G02715'), (981975, 982047, 1, 'G03845'), (984105, 984177, 1, 'G03852'), (1220234, 1220307, 1, 'G04525'), (1601343, 1601415, -1, 'G05525'), (1707743, 1707815, -1, 'G05755'), (1738796, 1738870, 1, 'G05835'), (1843329, 1843400, -1, 'G06105'), (1920038, 1920110, -1, 'G06335'), (2104961, 2105033, -1, 'G06665'), (2222251, 2222324, 1, 'G07025'), (2232470, 2232506, -1, 'G07055'), (2253680, 2253762, -1, 'G07115'), (2285607, 2285679, 1, 'G07185'), (2918418, 2918492, -1, 'G09505'), (2944616, 2944698, 1, 'G09585'), (2945700, 2945782, -1, 'G09595'), (3090548, 3090631, 1, 'G10015'), (3096220, 3096293, 1, 'G10035'), (3238371, 3238407, -1, 'G10415'), (3535151, 3535224, 1, 'G11285'), (3575849, 3575923, 1, 'G11395'), (3622697, 3622769, -1, 'G11505'), (3942012, 3942084, 1, 'G12385'), (3995103, 3995176, -1, 'G12585'), (4254534, 4254615, 1, 'G13223'), (4330778, 4330850, 1, 'G13335'), (4998147, 4998219, 1, 'G14855'), (5068300, 5068374, -1, 'G15055'), (5275155, 5275228, 1, 'G15585'), (5632857, 5632930, 1, 'G16552'), (6483945, 6484019, -1, 'G18815'), (6540636, 6540673, 1, 'G18952'), (6663713, 6663786, 1, 'G19235'), (7104314, 7104398, 1, 'G20365'), (7224223, 7224296, -1, 'G20655'), (7319582, 7319664, -1, 'G20885'), (7567399, 7567471, -1, 'G21475'), (9373610, 9373684, -1, 'G25715'), (9840420, 9840494, 1, 'G26747'), (10211564, 10211636, 1, 'G27555'), (10319498, 10319570, 1, 'G27825'), (10325875, 10325947, 1, 'G27845'), (10753667, 10753740, 1, 'G28685'), (10760629, 10760702, -1, 'G28695'), (11076814, 11076886, 1, 'G29095'), (11961645, 11961718, 1, 'G30345'), (16438025, 16438097, -1, 'G44955'), (16896875, 16896949, 1, 'G45935'), (16902623, 16902697, 1, 'G45955'), (16905147, 16905221, 1, 'G45965'), (17160736, 17160808, 1, 'G46585'), (17275564, 17275646, 1, 'G46875'), (17905395, 17905467, 1, 'G48275'), (17985575, 17985611, -1, 'G48515'), (18080062, 18080134, 1, 
'G48745'), (18518796, 18518870, 1, 'G49925'), (18755788, 18755860, -1, 'G50505'), (18837020, 18837092, 1, 'G50665'), (18907851, 18907924, 1, 'G50835'), (18928413, 18928487, 1, 'G50895'), (19008621, 19008694, -1, 'G51135'), (19044371, 19044443, -1, 'G51265'), (19403651, 19403723, -1, 'G52285'), (19420345, 19420417, -1, 'G52345'), (19511965, 19512045, 1, 'G52565'), (19566013, 19566085, 1, 'G52765'), (19648105, 19648188, 1, 'G52955'), (19935354, 19935426, 1, 'G53775'), (19995918, 19995989, 1, 'G53965'), (20704664, 20704736, 1, 'G55735'), (20720151, 20720223, 1, 'G55795'), (20824495, 20824568, -1, 'G56085'), (21498293, 21498375, 1, 'G58035'), (21553258, 21553329, 1, 'G58165'), (21970486, 21970557, 1, 'G59415'), (22149699, 22149773, 1, 'G59923'), (22149823, 22149895, -1, 'G59926'), (22197810, 22197892, -1, 'G60075'), (22481215, 22481288, -1, 'G60805'), (22622384, 22622465, 1, 'G61105'), (22786896, 22786969, 1, 'G61545'), (22853496, 22853567, 1, 'G61715'), (22871101, 22871174, 1, 'G61755'), (22892781, 22892853, 1, 'G61825'), (23047854, 23047927, 1, 'G62245'), (23062444, 23062517, -1, 'G62285'), (23221682, 23221753, 1, 'G62735'), (23296567, 23296640, -1, 'G63003'), (23296728, 23296801, -1, 'G63006')]
        f4 = [(33799, 33872, 1, 'G00085'), (424716, 424788, -1, 'G00985'), (562560, 562634, -1, 'G01355'), (611865, 611932, -1, 'G01455'), (808269, 808342, -1, 'G01865'), (901175, 901247, 1, 'G02055'), (1390894, 1390966, 1, 'G03135'), (1442004, 1442076, 1, 'G03285'), (1501605, 1501677, 1, 'G03405'), (1520781, 1520854, -1, 'G03435'), (5268124, 5268210, -1, 'G08345'), (6646425, 6646496, 1, 'G10815'), (6819287, 6819324, 1, 'G11177'), (6837555, 6837639, -1, 'G11213'), (6837769, 6837853, -1, 'G11216'), (6905479, 6905552, -1, 'G11355'), (6944721, 6944793, 1, 'G11405'), (7185697, 7185771, 1, 'G11985'), (7232792, 7232865, -1, 'G12065'), (7256408, 7256481, 1, 'G12115'), (7341420, 7341494, -1, 'G12405'), (7730956, 7731037, 1, 'G13265'), (7814197, 7814270, 1, 'G13445'), (8255695, 8255767, 1, 'G14345'), (8301720, 8301794, -1, 'G14415'), (8979656, 8979729, 1, 'G15775'), (9108317, 9108391, 1, 'G16105'), (9191590, 9191663, 1, 'G16235'), (9287230, 9287304, 1, 'G16465'), (9289706, 9289787, 1, 'G16475'), (9815215, 9815287, -1, 'G17612'), (9873524, 9873596, -1, 'G17765'), (9978117, 9978189, -1, 'G17975'), (10093077, 10093157, -1, 'G18255'), (10302011, 10302084, 1, 'G18725'), (10325975, 10326047, -1, 'G18815'), (10878733, 10878807, -1, 'G20115'), (11774472, 11774508, -1, 'G22265'), (11910299, 11910373, 1, 'G22635'), (11954751, 11954824, -1, 'G22754'), (11974951, 11975032, 1, 'G22785'), (12320119, 12320203, 1, 'G23635'), (12429608, 12429681, 1, 'G23915'), (12486211, 12486282, -1, 'G24025'), (12686148, 12686230, 1, 'G24565'), (13006243, 13006316, -1, 'G25435'), (13058840, 13058922, -1, 'G25585'), (13076582, 13076666, -1, 'G25635'), (13285431, 13285503, -1, 'G26225'), (13336345, 13336419, -1, 'G26375'), (13341501, 13341575, -1, 'G26385'), (13454562, 13454635, 1, 'G26675'), (13704787, 13704860, 1, 'G27395'), (13882922, 13882994, -1, 'G27875'), (13885196, 13885269, -1, 'G27885'), (14032495, 14032567, 1, 'G28362'), (14267286, 14267368, 1, 'G28915'), (14470283, 14470355, 1, 'G29415'), 
(15120655, 15120728, 1, 'G31075'), (15183089, 15183162, 1, 'G31265'), (15345717, 15345753, -1, 'G31695'), (15430229, 15430303, -1, 'G31895'), (15576655, 15576728, 1, 'G32265'), (15671398, 15671469, 1, 'G32475'), (15804553, 15804635, 1, 'G32765'), (16304128, 16304201, 1, 'G34035'), (16454700, 16454773, -1, 'G34415'), (16556627, 16556700, 1, 'G34695'), (16655290, 16655364, 1, 'G34975'), (17130054, 17130127, 1, 'G36197'), (17149473, 17149545, 1, 'G36245'), (17276705, 17276779, -1, 'G36635'), (17500800, 17500872, -1, 'G37175'), (18254982, 18255018, -1, 'G39195'), (18293773, 18293845, 1, 'G39345'), (18395021, 18395093, 1, 'G39615'), (18411258, 18411332, 1, 'G39672'), (18501705, 18501778, -1, 'G39865'), (18542164, 18542238, 1, 'G39985')]
        f5 = [(150353, 150426, -1, 'G01365'), (389889, 389960, -1, 'G02025'), (508427, 508500, -1, 'G02385'), (530819, 530893, 1, 'G02435'), (559327, 559399, -1, 'G02505'), (588890, 588964, -1, 'G02615'), (614641, 614723, 1, 'G02725'), (642397, 642479, -1, 'G02815'), (858534, 858571, 1, 'G03445'), (862395, 862468, -1, 'G03452'), (970797, 970878, -1, 'G03705'), (984365, 984448, 1, 'G03745'), (998940, 999013, 1, 'G03775'), (1742692, 1742765, 1, 'G05795'), (1788651, 1788723, 1, 'G05945'), (1804616, 1804690, 1, 'G05985'), (1853302, 1853382, -1, 'G06125'), (2060153, 2060235, -1, 'G06685'), (2212678, 2212749, -1, 'G07135'), (2309512, 2309549, -1, 'G07315'), (2411148, 2411232, 1, 'G07625'), (2432263, 2432336, -1, 'G07675'), (2587826, 2587899, -1, 'G08075'), (2898867, 2898951, -1, 'G09345'), (2993327, 2993401, 1, 'G09655'), (3030817, 3030890, -1, 'G09755'), (3118377, 3118458, 1, 'G09975'), (3212351, 3212424, -1, 'G10235'), (3287553, 3287635, -1, 'G10455'), (3324702, 3324775, 1, 'G10525'), (3578295, 3578367, -1, 'G11225'), (3617058, 3617130, 1, 'G11325'), (3669000, 3669073, -1, 'G11475'), (4471050, 4471122, 1, 'G13845'), (4530475, 4530548, 1, 'G14035'), (4673902, 4673974, 1, 'G14495'), (4929562, 4929636, 1, 'G15175'), (5157641, 5157715, 1, 'G15805'), (5161514, 5161586, 1, 'G15815'), (5358918, 5359000, 1, 'G16375'), (5962699, 5962771, -1, 'G18005'), (5965972, 5966044, -1, 'G18015'), (5984378, 5984450, 1, 'G18085'), (6258146, 6258218, 1, 'G18755'), (6401240, 6401311, 1, 'G19095'), (7073531, 7073603, -1, 'G20852'), (7073944, 7074016, -1, 'G20854'), (7074357, 7074429, -1, 'G20856'), (7074773, 7074845, -1, 'G20858'), (7222059, 7222131, -1, 'G21378'), (7387890, 7387962, 1, 'G22315'), (7981400, 7981472, 1, 'G23665'), (8906418, 8906502, 1, 'G25585'), (8946826, 8946899, -1, 'G25625'), (9815405, 9815477, -1, 'G27715'), (11802284, 11802356, 1, 'G32017'), (13823211, 13823284, -1, 'G35605'), (15049737, 15049811, -1, 'G37795'), (15242547, 15242621, 1, 'G38155'), (15593086, 15593160, 1, 
'G38905'), (15844253, 15844325, -1, 'G39535'), (15993514, 15993587, 1, 'G39895'), (16256865, 16256937, -1, 'G40545'), (16427812, 16427893, 1, 'G40945'), (16524760, 16524832, -1, 'G41265'), (16655393, 16655477, 1, 'G41605'), (16684663, 16684735, -1, 'G41675'), (17476402, 17476475, -1, 'G43455'), (17512768, 17512839, -1, 'G43535'), (17856811, 17856883, -1, 'G44283'), (17894906, 17894979, -1, 'G44375'), (18058014, 18058088, 1, 'G44705'), (18560206, 18560278, -1, 'G45715'), (18576071, 18576143, 1, 'G45745'), (18715888, 18715960, -1, 'G46105'), (18807534, 18807614, 1, 'G46325'), (18924749, 18924821, 1, 'G46595'), (19658828, 19658900, 1, 'G48465'), (19761400, 19761472, -1, 'G48675'), (19820360, 19820398, 1, 'G48835'), (20064048, 20064120, 1, 'G49435'), (20692447, 20692519, 1, 'G50805'), (20758903, 20758940, -1, 'G50995'), (20773555, 20773637, 1, 'G51055'), (21275059, 21275141, -1, 'G52355'), (21318105, 21318189, -1, 'G52495'), (21418369, 21418441, 1, 'G52815'), (21740339, 21740410, -1, 'G53487'), (22091631, 22091704, 1, 'G54365'), (22094087, 22094160, 1, 'G54375'), (22304851, 22304923, -1, 'G54865'), (22355897, 22355970, -1, 'G55045'), (22357726, 22357799, -1, 'G55055'), (22501995, 22502068, -1, 'G55505'), (22845356, 22845430, 1, 'G56365'), (22973066, 22973138, 1, 'G56745'), (23071996, 23072070, -1, 'G56975'), (23463219, 23463291, 1, 'G57885'), (23661936, 23662018, 1, 'G58495'), (23861431, 23861503, 1, 'G59055'), (23971167, 23971239, 1, 'G59385'), (23974655, 23974727, 1, 'G59395'), (24157171, 24157245, -1, 'G59945'), (24279805, 24279886, 1, 'G60285'), (24547401, 24547474, 1, 'G60963'), (24548892, 24548964, 1, 'G60966'), (24684507, 24684579, 1, 'G61345'), (24726891, 24726964, 1, 'G61445'), (24856205, 24856242, 1, 'G61835'), (25347261, 25347333, 1, 'G63145'), (25801340, 25801414, 1, 'G64505'), (25892619, 25892691, -1, 'G64735'), (25942291, 25942372, 1, 'G64855'), (25989903, 25989976, 1, 'G65015'), (26114755, 26114793, -1, 'G65305'), (26174414, 26174496, -1, 'G65445'), 
(26212684, 26212757, 1, 'G65535'), (26238859, 26238933, -1, 'G65615'), (26573248, 26573322, -1, 'G66535'), (26585622, 26585696, 1, 'G66568'), (26670495, 26670567, -1, 'G66755'), (26699933, 26700004, -1, 'G66817'), (26938897, 26938969, 1, 'G67455')]
        entries = [("Chr I", "NC_003070", 30432563, f1, colors.red),
                   ("Chr II", "NC_003071", 19705359, f2, colors.green),
                   ("Chr III", "NC_003074", 23470805, f3, colors.blue),
                   ("Chr IV", "NC_003075", 18585042, f4, colors.orange),
                   ("Chr V", "NC_003076", 26992728, f5, colors.purple)]
        max_length = max([row[2] for row in entries])

        chr_diagram = BasicChromosome.Organism()
        for name, acc, length, features, color in entries:
            if False:
                # How I generated the values above... and tested passing in SeqFeatures
                filename = "/Users/pjcock/Documents/comp_genomics/seed/%s.gbk" % acc
                import os
                if not os.path.isfile(filename):
                    continue
                from Bio import SeqIO
                record = SeqIO.read(filename, "gb")
                assert length == len(record)
                features = [f for f in record.features if f.type=="tRNA"]
                print(name)
                # Strip of the first three chars, AT# where # is the chr
                print([(int(f.location.start), int(f.location.end),
                        f.strand, f.qualifiers['locus_tag'][0][3:])
                       for f in features])
                # Output was copy and pasted to the script, see above.
                # Continue test using SeqFeature objects!
                # To test colours from the qualifiers,
                for i, f in enumerate(features):
                    f.qualifiers['color'] = [str(i % 16)]
            elif use_seqfeatures:
                # Features as SeqFeatures
                features = [SeqFeature(FeatureLocation(start, end, strand),
                                       qualifiers={"name": [label],
                                                   "color": [color]})
                            for (start, end, strand, label) in features]
            else:
                # Features as 5-tuples
                features = [(start, end, strand, label, color)
                            for (start, end, strand, label) in features]

            # I haven't found a nice source of data for real Arabidopsis
            # cytobands, so these three are made up at random!
            cytobands = []
            for color in [colors.gray, colors.darkgray, colors.slategray]:
                start = (length - 1000000) * random.random()
                end = min(length, start + 1000000)
                # Draw these with black borders, and a grey fill
                cytobands.append((start, end, 0, None, colors.black, color))
            # Draw these with black borders, and a brown fill:
            cytobands.append((0, 1000000, 0, "First 1 Mbp", colors.black, colors.brown))
            cytobands.append((length-1000000, length, 0, "Last 1 Mbp", colors.black, colors.brown))
            # Additional dummy entry to check fill colour on both strands,
            if name == "Chr III":
                cytobands.append((11000000, 13000000, -1, "Reverse", "red", "yellow"))
            elif name == "Chr V":
                cytobands.append((9500000, 11000000, +1, "Forward", colors.red, colors.yellow))
            # Create the drawing object for the chromosome
            cur_chromosome = BasicChromosome.Chromosome(name)
            # Set the length, adding an extra 20 percent for the tolomeres etc:
            cur_chromosome.scale_num = max_length * 1.2
            cur_chromosome.label_sep_percent = 0.15
            # Add a dummy segment for allocating vertical space
            # which can be used for feature label placement
            spacer = BasicChromosome.SpacerSegment()
            spacer.scale = 0.03 * max_length
            cur_chromosome.add(spacer)
            # Add an opening telomere
            start = BasicChromosome.TelomereSegment()
            start.scale = 0.02 * max_length
            start.fill_color = colors.lightgrey
            cur_chromosome.add(start)
            # Add a body - using bp as the scale length here.
            # Note we put the cytobands a start of combined list,
            # as want them drawn underneath the tRNA markers.
            body = BasicChromosome.AnnotatedChromosomeSegment(length, cytobands + features)
            body.scale = length
            cur_chromosome.add(body)
            # Add a closing telomere
            end = BasicChromosome.TelomereSegment(inverted=True)
            end.scale = 0.02 * max_length
            end.fill_color = colors.lightgrey
            cur_chromosome.add(end)
            # Another spacer
            spacer = BasicChromosome.SpacerSegment()
            spacer.scale = 0.03 * max_length
            cur_chromosome.add(spacer)
            # This chromosome is done
            chr_diagram.add(cur_chromosome)
        with warnings.catch_warnings():
            # BiopythonWarning: Too many labels to avoid overlap
            warnings.simplefilter("ignore", BiopythonWarning)
            chr_diagram.draw(filename, "Arabidopsis thaliana tRNA")
예제 #17
0
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Turn a FASTA file plus a gene-coordinate (.sco) file into ffn/faa/gbk.

    Three files are written into ``output_folder``: ``<prefix>.ffn``
    (nucleotide sequence of every CDS), ``<prefix>.faa`` (its translation)
    and ``<prefix>.gbk`` (one GenBank record per input sequence, carrying a
    CDS feature per gene).  Locus tags are ``<prefix>_NNNNN`` with a counter
    that runs across all sequences.

    Args:
        seq_file: FASTA file with the nucleotide sequences.
        sco_file: coordinate file.  Lines starting with ``# Sequence Data``
            open a new sequence, lines starting with ``# Model Data`` carry
            the translation table, and every other line is read as
            ``<something>_<start>_<end>_<strand>``.  Coordinates are treated
            as 1-based and inclusive (the slicing below uses ``start - 1``).
            Presumably this is Prodigal's ``-f sco`` output; confirm against
            the caller.
        prefix: basename of the output files and prefix of the locus tags.
        output_folder: directory that receives the three output files.
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)
    pwd_bin_gbk_file = '%s/%s.gbk' % (output_folder, prefix)

    # get sequence id list (kept separately to preserve the FASTA order)
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    current_seq_id = ''
    current_transl_table = ''
    current_seq_cds_list = []
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    # The context manager closes the coordinate file, which the previous
    # bare open() never did.
    with open(sco_file) as sco_handle:
        for each_cds in sco_handle:
            if each_cds.startswith('# Sequence Data'):
                # store what was collected for the previous sequence
                if current_seq_id != '':
                    seq_to_cds_dict[current_seq_id] = current_seq_cds_list
                    seq_to_transl_table_dict[current_seq_id] = \
                        current_transl_table

                # start collecting for the new sequence: the id is the first
                # word of the quoted value after the last '='
                quoted_header = each_cds.strip().split('=')[-1]
                current_seq_id = quoted_header[1:-1].split(' ')[0]
                current_transl_table = ''
                current_seq_cds_list = []

            elif each_cds.startswith('# Model Data'):
                # translation table is the value of the second-to-last
                # ';'-separated field
                table_field = each_cds.strip().split(';')[-2]
                current_transl_table = table_field.split('=')[-1]

            else:
                # drop the leading field, keep "start_end_strand"
                current_seq_cds_list.append(
                    '_'.join(each_cds.strip().split('_')[1:]))

    # the last sequence has no following header to trigger the store
    seq_to_cds_dict[current_seq_id] = current_seq_cds_list
    seq_to_transl_table_dict[current_seq_id] = current_transl_table

    strand_to_int = {'+': 1, '-': -1}
    gene_index = 1
    # All three outputs are closed together, also when an exception is
    # raised part-way through.
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:

        for seq_id in sequence_id_list:

            # create SeqRecord
            full_sequence = id_to_sequence_dict[seq_id]
            current_SeqRecord = SeqRecord(Seq(full_sequence), id=seq_id)
            current_SeqRecord.seq.alphabet = generic_dna
            transl_table = seq_to_transl_table_dict[seq_id]

            # add SeqFeature to SeqRecord
            for cds in seq_to_cds_dict[seq_id]:

                # define locus_tag id
                locus_tag_id = '%s_%05d' % (prefix, gene_index)

                # define FeatureLocation
                cds_split = cds.split('_')
                cds_start = int(cds_split[0])  # 1-based, inclusive
                cds_end = int(cds_split[1])    # 1-based, inclusive
                cds_strand = cds_split[2]

                # FeatureLocation is 0-based and end-exclusive, so the start
                # is shifted by one.  Passing the 1-based start unchanged
                # (as before) moved every feature one base to the right
                # relative to the sequence exported below.
                current_feature_location = FeatureLocation(
                    SF.ExactPosition(cds_start - 1),
                    SF.ExactPosition(cds_end),
                    strand=strand_to_int.get(cds_strand))

                # get nc sequence (empty when the strand is not '+' or '-')
                cds_region = full_sequence[cds_start - 1:cds_end]
                if cds_strand == '+':
                    sequence_nc = cds_region
                elif cds_strand == '-':
                    sequence_nc = str(
                        Seq(cds_region, generic_dna).reverse_complement())
                else:
                    sequence_nc = ''

                # translate to aa sequence
                sequence_aa = str(
                    Seq(sequence_nc).translate(table=transl_table))

                # Remove the terminal stop symbol only when there is one; a
                # gene without a stop codon must keep its last residue.
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '',
                                  bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '',
                                 bin_faa_file_handle)

                # Create a CDS SeqFeature and attach it to the record
                current_feature = SeqFeature(
                    current_feature_location,
                    type='CDS',
                    qualifiers={
                        'locus_tag': locus_tag_id,
                        'transl_table': transl_table,
                        'translation': sequence_aa,
                    })
                current_SeqRecord.features.append(current_feature)
                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
예제 #18
0
    def merge_overlapping_feature_in_simple_format(
            self,
            input_file_file_list,
            scaffold_id_column,
            feature_start_column,
            feature_end_column,
            output_file=None,
            output_separator="\t",
            comments_prefix="#",
            input_separator="\t",
            coordinates_type="1-based",
            return_seqfeature_dict=False,
            feature_type=None):
        """Merge overlapping or touching intervals read from tabular files.

        Rows are read through ``self.file_line_as_list_generator``.  Intervals
        are handled internally as half-open ``[start, end)`` pairs and merged
        per scaffold; two intervals are joined when the second starts at or
        before the end of the first.

        Scaffolds are reported in the order in which they are first met in
        the input.  (They used to be collected through a ``set``, which made
        the order of the result and of the output file arbitrary.)

        Args:
            input_file_file_list: one filename (``str``) or an iterable of
                filenames.
            scaffold_id_column: index of the scaffold id column.
            feature_start_column: index of the start coordinate column.
            feature_end_column: index of the end coordinate column.
            output_file: if set, merged intervals are written there through
                ``self.metaopen``, one ``scaffold, start, end`` row per line.
            output_separator: column separator for ``output_file``.
            comments_prefix: passed on to the line generator.
            input_separator: passed on to the line generator.
            coordinates_type: ``"1-based"`` means the input start is
                decremented on reading and incremented again on writing;
                any other value leaves the start untouched.
            return_seqfeature_dict: return ``SeqFeature`` objects instead of
                ``[start, end]`` lists.
            feature_type: ``type`` given to the created ``SeqFeature``
                objects; required when ``return_seqfeature_dict`` is true.

        Returns:
            ``OrderedDict`` mapping scaffold id to a sorted list of merged
            ``[start, end]`` lists, or to a list of ``SeqFeature`` objects
            when ``return_seqfeature_dict`` is true.

        Raises:
            ValueError: ``return_seqfeature_dict`` is true but
                ``feature_type`` is not set.
        """
        if isinstance(input_file_file_list, str):
            file_list = [input_file_file_list]
        else:
            file_list = input_file_file_list

        one_based = coordinates_type == "1-based"

        # scaffold id -> every interval seen for it, over all input files
        unified_dict = OrderedDict()
        for filename in file_list:
            for line_list in self.file_line_as_list_generator(
                    filename,
                    comments_prefix=comments_prefix,
                    separator=input_separator):
                start = int(line_list[feature_start_column])
                if one_based:
                    start -= 1
                end = int(line_list[feature_end_column])
                unified_dict.setdefault(
                    line_list[scaffold_id_column], []).append([start, end])

        merged_dict = OrderedDict()
        for scaffold, intervals in unified_dict.items():
            # Every scaffold has at least one interval, since it is only
            # registered together with its first record.
            intervals.sort()

            # [a, b) [c, d), a < b, c < d; after sorting c >= a
            merged = []
            current = list(intervals[0])
            for start, end in intervals[1:]:
                if start > current[1]:  # c > b: gap, close current interval
                    merged.append(current)
                    current = [start, end]
                elif end > current[1]:  # c <= b < d: extend to the right
                    current[1] = end
                # else d <= b: fully contained, nothing to do
            merged.append(current)
            merged_dict[scaffold] = merged

        if output_file:
            with self.metaopen(output_file, "w") as out_fd:
                for scaffold, features in merged_dict.items():
                    for start, end in features:
                        # convert the start back to the input convention
                        out_start = start + 1 if one_based else start
                        out_fd.write(
                            output_separator.join(
                                map(str, [scaffold, out_start, end])) + "\n")

        if not return_seqfeature_dict:
            return merged_dict

        if not feature_type:
            raise ValueError(
                "ERROR!!! Feature type for seqfeature records was not set!")

        feature_dict = OrderedDict()
        for region, features in merged_dict.items():
            # The (absent) strand is set on the location: SeqFeature's own
            # ``strand`` argument is not accepted by newer Biopython.
            feature_dict[region] = [
                SeqFeature(FeatureLocation(start, stop, strand=None),
                           type=feature_type)
                for (start, stop) in features
            ]
        return feature_dict
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
# Amino-acid sequence used as the demonstration target below.
protein_letters = "MVAQAPHDDHQDDEKLAAARQKEIEDWLPITSSRNAKWWYSAFHNVTAMVGAGVLGLPYAMSQLGWGPGIAVLVLSWVITLYTLWQMVEMHEMVPGKRFDRYHELGQHAFGEKLGLYIVVPQQLIVEIGVCIVYMVTGGKSLKKFHELVCDDCKPIKLTYFIMIFASVHFVLSHLPNFNSISGVSLAAAVMSLSYSTIAWASSASKGVQEDVQYGYKAKTTAGTVFNFFSGLGDVAFAYAGHNVVLEIQATIPSTPEKPSKGPMWRGVIVAYIVVALCYFPVALVGYYIFGNGVEDNILMSLKKPAWLIATANIFVVIHVIGSYQIYAMPVFDMMETLLVKKLNFRPTTTLRFFVRNFYVAATMFVGMTFPFFGGLLAFFGGFAFAPTTYFLPCVIWLAIYKPKKYSLSWWANWVCIVFGLFLMVLSPIGGLRTIVIQAKGYKFYS"
at5g40780 = Seq(protein_letters)
# (Why is this protein sequence so long?)

# A feature covering positions 0 (inclusive) to 30 (exclusive).
feature = SeqFeature(FeatureLocation(0, 30), type="protein", strand=1)

# Way 1: slice the sequence by hand with the bounds stored in the location.
location = feature.location
feature_seq = at5g40780[location.start:location.end]
print(feature_seq)

# Way 2: let the feature do the extraction; this appears to work the same
# way as the manual slice above.
feature_seq2 = feature.extract(at5g40780)
print(feature_seq2)
예제 #20
0
        # Only rows with a positive amplicon length get a feature.
        # NOTE(review): rw, ti, indexa, plate, well, gene_name, amplicon,
        # amplicons, col, data and key come from the enclosing loop, which is
        # not visible here; presumably rw is a table row and ti maps column
        # names to indices -- confirm.
        if int(rw[ti['Amplicon_length']]) > 0:
            start = int(rw[ti['Primer1_target_start']])
            end = int(rw[ti['Primer2_target_end']])
            # Orient the feature: end > start is recorded as strand 1;
            # otherwise the bounds are swapped so that truestart <= trueend
            # and the strand is recorded as -1.
            if end > start:
                strd = 1
                truestart = start
                trueend = end
            else:
                strd = -1
                truestart = end
                trueend = start
            #print '{} Start:{}, End:{}'.format(indexa,start,end)
            # Build the "Amplicon" feature and attach it to the genome record.
            seq_feature=SeqFeature(FeatureLocation(truestart,trueend, strand=strd), type="Amplicon", \
                                   id=indexa,\
                                   qualifiers={'Plate':plate,'Well':well,'Gene':gene_name,\
                                               'Amplicon':amplicon,\
                                               'Amplicons':amplicons,\
                                               'Strand':strd,'colour':col,\
			                                   #'note':'{} {}'.format(indexa,data[key]['Description']),\
                                               'Strand_annotation':data[key]['Strand_annotation']})
            genome.features.append(seq_feature)

# Write the annotated genome as GenBank.  The context manager closes the
# file even when SeqIO.write raises, which the manual open()/close() pair
# did not.
with open(amplicons_features, 'w') as faf:
    SeqIO.write(genome, faf, "gb")

# Map TF binding sites

TF_file = "../Transcription_Factors/TF_binding_clean.csv"

# Load the transcription-factor binding table through the project's readcsv
# helper.
TF_data = readcsv(TF_file, delim=',')
예제 #21
0
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    ClusterFinder clusters (from ``find_cf_clusters``) are combined with the
    cluster features already present on ``seq_record``:

    * A ClusterFinder cluster that overlaps an existing cluster either adds a
      ``cluster_border`` feature (when ``options.borderpredict`` is set and
      the midpoint of the existing cluster lies inside the ClusterFinder
      cluster) or widens the existing cluster so that it also covers the
      ClusterFinder cluster.  In both cases the existing cluster gets a
      ``probability`` qualifier.
    * A ClusterFinder cluster that overlaps nothing becomes a new ``cluster``
      feature, unless ``options.borderpredict_only`` is set.  Its product is
      ``cf_putative``, ``cf_fatty_acid``, ``cf_saccharide`` or
      ``cf_fatty_acid-saccharide``, depending on the ``sec_met`` qualifiers
      of the CDS features it contains.

    New clusters are appended to ``seq_record.features`` and the clusters are
    then renumbered with ``renumber_clusters``.

    Args:
        seq_record: record whose features are read and modified in place.
        options: option namespace; ``borderpredict`` is read as an
            attribute, ``borderpredict_only`` only if present.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue

            overlaps = True

            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer.  With true
                # division (Python 3) it becomes a float, and the location
                # membership test only supports integer positions.
                midpoint = (cluster.location.end +
                            cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still
                    # included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            # Without border prediction: widen the existing cluster so that
            # it covers the ClusterFinder cluster on whichever side(s) the
            # latter sticks out.
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            # Derive the product type from the "Type: " entries of the
            # sec_met qualifiers; seeing both kinds yields the combined type.
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [
                        feat for feat in CDS.qualifiers['sec_met']
                        if "Type: " in feat
                    ]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)
예제 #22
0
                ],
                "EC_number":
                "3.5.2.6",
            }
        ampr_prom = next(get_features("AmpR promoter"), None)
        ampr_prom = ampr_prom or next(get_features("AmpR Promoter"), None)
        if ampr_prom is not None:
            ampr_prom.qualifiers["label"] = ["AmpR Promoter"]
            ampr_prom.qualifiers["note"] = ["color: #ff6666"]
        ampr_term_start = gb.seq.find(AMPR_TERM)
        if ampr is not None and ampr_term_start >= 0:
            ampr_term = SeqFeature(
                location=FeatureLocation(ampr_term_start, ampr_term_start + 94,
                                         -1),
                type="terminator",
                qualifiers={
                    "label": "AmpR Terminator",
                    "note": ["color: #ff6666"]
                },
            )
            gb.features.append(ampr_term)

        # KanR recolor and annotations
        kanr = next(get_features("KanR"), None)
        if kanr is not None:
            kanr.qualifiers.update({
                "gene":
                "aphA1",
                "product":
                "aminoglycoside phosphotransferase",
                "EC_number":
예제 #23
0
        defline = protein.description
        pattern = re.compile('.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
        match = pattern.match(defline)
        start_pos = int(match.group(1))
        end_pos = int(match.group(2))
        strand_pos = int(match.group(3))
        feat_loc = FeatureLocation(start_pos - 1,
                                   end_pos)  # adjust for 0-index
        l_tag = protein.id
        # consolidate feature annotations
        quals = {
            'note': defline,
            'locus_tag': l_tag,
            'translation': protein.seq
        }
        feature = SeqFeature(location=feat_loc,
                             strand=strand_pos,
                             id=protein.id,
                             type='CDS',
                             qualifiers=quals)
        record.features.append(feature)

    # save record with annotations
    record.description = rec_name + "_with_ORFs"
    record.name = rec_name
    record.dbxrefs = ["Project: " + argv[1] + "/" + rec_name]
    record.seq.alphabet = generic_dna
    write_genbank(annot_gbk, record)

    print "OK"
예제 #24
0
# Start from an empty record; one coloured 10-mer is appended per palette
# entry further down.
seq = Dseqrecord("")

# Build the palette as "#rrggbb" strings: ten hues (36 degree steps) at
# three saturation levels each.
colors = []
for hue in range(0, 360, 36):
    for saturation in (0.2, 0.4, 0.6):
        channels = colorsys.hsv_to_rgb(hue / 360, saturation, 255)
        r, g, b = (int(channel) for channel in channels)
        colors.append("#{0:02x}{1:02x}{2:02x}".format(r, g, b))

# Regroup by saturation level (every third entry), then reverse the order.
by_saturation = colors[::3] + colors[1::3] + colors[2::3]
colors = list(reversed(by_saturation))

for color_code in colors:
    # A feature over positions 1..9 that is labelled and coloured with the
    # palette entry itself.
    sf = SeqFeature(FeatureLocation(1, 9, strand=1), type="misc_feature")
    sf.qualifiers["label"] = [color_code]
    sf.qualifiers["ApEinfo_fwdcolor"] = [color_code]
    se = Dseqrecord("agtagtcgta")
    se.features.append(sf)
    seq += se

from pydna.editor import ape

# Hand the assembled record to the editor helper, then show the palette.
ape(seq)

print(colors)
예제 #25
0
def printClusterPics(cluNet, data_box, annot_def_dict, output_dir, ncutoff=8):
    """Draw one PDF per network node showing that cluster and its closest relatives.

    For every node of ``cluNet`` a GenomeDiagram is built with one track per
    related cluster (as returned by ``getTopClusters``) plus a legend track
    with one coloured box per pfam-domain tuple, and is handed to ``drawMap``.

    Known limitations (carried over from the original notes):
      * related proteins are left black when they have no pfam annotation;
      * multidomain proteins are handled crudely (looked up as one tuple).

    :param cluNet: network whose ``nodes()`` are "|"-separated identifiers
        with five fields: species|accession|wpID|annotation|start-stop.
    :param data_box: mapping cluster_ID -> mapping keyed by cluster-member IDs
        (same five-field layout as the nodes).
    :param annot_def_dict: mapping cluster-member ID -> pfam annotation, fed
        to ``domainHash``.
    :param output_dir: directory where "<species>-<wpID>.pdf" is written.
    :param ncutoff: unused by this function; kept for caller compatibility.
    :return: None
    """
    # Stray symbols stripped from the start-stop field before it is parsed
    # into integers. Compiled once instead of once per cluster member.
    coord_noise = re.compile(r'[<>$%^&*#|\[\]]')

    for node in cluNet.nodes():
        # Only species and protein ID are needed, for the output file name;
        # the five-way unpack still enforces the node layout.
        parentSpecies, _, parentWpID, _, _ = node.split("|")
        gd_diagram = GenomeDiagram.Diagram()
        # The most closely related clusters for this node.
        sortedClusters = getTopClusters(cluNet, node)[0]
        # The largest nucleotide span among the clusters under consideration.
        maxSpan = getMaxSpan(sortedClusters, data_box)
        # pfam-domain statistics across those clusters drive the colouring.
        id_list = makeIDList(sortedClusters, data_box)
        domainCounts = getPfamCounts(annot_def_dict, id_list)
        colorDict = getColorDict(domainCounts)

        for e in sortedClusters:
            # scale=0 turns off the scale; greytrack adds a background.
            gd_track_for_features = gd_diagram.new_track(1,
                                                         name="",
                                                         scale=0,
                                                         greytrack=True)
            gd_feature_set = gd_track_for_features.new_set()
            # TODO: may be replaced with a clusterID:clusterMemberID dict
            # based on the BLAST2 db.
            clusterMembers = data_box[e].keys()
            # (start, stop, span) of the cluster; the offset is meant to
            # roughly centre it within maxSpan.
            sss = getClustSS(clusterMembers)
            offset = int((maxSpan - sss[2]) / 2 - sss[0])
            for cm in clusterMembers:
                _, _, wpID, _, startstop = cm.split("|")
                startstop = coord_noise.sub("", startstop)
                q_start = int(startstop.split("-")[0])
                q_stop = int(startstop.split("-")[1])
                # Encode the direction. The zero-length case used to leave
                # q_strand/label_angle unbound (or stale from the previous
                # member); it is now drawn strandless with the default angle.
                if q_start < q_stop:
                    q_strand, label_angle = 1, 15
                elif q_start > q_stop:
                    q_strand, label_angle = -1, 179.9
                else:
                    q_strand, label_angle = None, 15
                # NOTE(review): SeqFeature(ref=..., strand=...) and a
                # FeatureLocation with start > end depend on the installed
                # Biopython version -- confirm against the pinned release.
                feature = SeqFeature(FeatureLocation(q_start + offset,
                                                     q_stop + offset),
                                     ref=wpID,
                                     strand=q_strand)
                # pfam is a tuple of domain names, usually just one.
                pfam = domainHash(annot_def_dict[cm])
                if len(pfam) > 0 and pfam in colorDict:
                    color = colorDict[pfam]
                else:
                    color = "0x000000"  # no known domain: black
                gd_feature_set.add_feature(feature,
                                           sigil="ARROW",
                                           color=color,
                                           label=True,
                                           label_position="start",
                                           name=feature.ref,
                                           label_size=7,
                                           label_angle=label_angle)

        # Legend track: one coloured box per pfam-domain tuple, labelled with
        # the first domain name of the tuple.
        gd_track_for_features = gd_diagram.new_track(0,
                                                     name="",
                                                     scale=0,
                                                     greytrack=False)
        gd_feature_set = gd_track_for_features.new_set()
        # With no coloured domains the legend stays empty; guarding here
        # avoids the ZeroDivisionError the unguarded division raised.
        increment = maxSpan / len(colorDict) if colorDict else 0
        for counter, domain_tup in enumerate(colorDict):
            q_start = round(counter * increment)
            q_stop = round((counter + .33) * increment)
            feature = SeqFeature(FeatureLocation(q_start, q_stop),
                                 ref=domain_tup[0],
                                 strand=None)
            gd_feature_set.add_feature(feature,
                                       sigil="BOX",
                                       color=colorDict[domain_tup],
                                       label=True,
                                       name=feature.ref,
                                       label_size=7,
                                       label_angle=15)

        # All tracks are populated; write the plot.
        name = abspath(output_dir + "/" + parentSpecies + "-" + parentWpID +
                       ".pdf")
        drawMap(gd_diagram, name, plotWidth=maxSpan)
예제 #26
0
    def __rshift__(self, index):
        """Rotate the sequence clockwise, preserving annotations.

        The last ``index`` letters are moved to the front of the sequence.
        Per-letter annotations are rotated the same way, and feature
        locations are shifted by ``index`` and wrapped around the origin.

        Arguments:
            index (int): number of positions to rotate by. It is reduced
                modulo the sequence length, so negative values amount to a
                counter-clockwise rotation.

        Returns:
            ``self`` when the rotation is a no-op (empty sequence or a
            multiple of the length); otherwise a new instance of
            ``type(self)``. The new record shares ``annotations``,
            ``dbxrefs`` and each feature's ``qualifiers`` with the original
            (they are not copied).
        """
        length = len(self.seq)
        if length == 0:
            # Nothing to rotate; also avoids a modulo by zero below.
            return self

        # Avoid unnecessary cycles. With a positive modulus the result is
        # always in [0, length), so no separate negative-index branch is
        # needed.
        index %= length
        if index == 0:
            return self

        newseq = self.seq[-index:] + self.seq[:-index]
        # Per-letter annotations must follow their letters, so they are
        # rotated in the same direction as the sequence itself.
        newletan = {
            k: v[-index:] + v[:-index]
            for k, v in six.iteritems(self.letter_annotations)
        }

        newfeats = []
        for feature in self.features:
            loc = feature.location
            if loc is None:
                newloc = None
            elif feature.type == "source" and loc.start == 0 and loc.end == len(
                    self):
                # A source feature spanning the whole record is unaffected
                # by the rotation.
                newloc = loc
            else:
                _newloc = []
                for part in (loc + index).parts:
                    if part.end >= length and part.start >= length:
                        # The part lies entirely past the end: bring it back
                        # by whole sequence lengths, so that part.end stays
                        # after part.start even with an additional overlap
                        # at the end.
                        r = part.start // length
                        _newloc.append(
                            FeatureLocation(
                                start=part.start - r * length,
                                end=part.end - r * length,
                                strand=part.strand,
                                ref=part.ref,
                                ref_db=part.ref_db,
                            ))
                    else:
                        _newloc.append(part)
                newloc = _newloc[0] if len(_newloc) == 1 else CompoundLocation(
                    _newloc)
            newfeats.append(
                SeqFeature(
                    location=newloc,
                    type=feature.type,
                    id=feature.id,
                    qualifiers=feature.qualifiers,
                ))

        return type(self)(
            seq=newseq,
            id=self.id,
            name=self.name,
            description=self.description,
            dbxrefs=self.dbxrefs,
            features=newfeats,
            annotations=self.annotations,
            letter_annotations=newletan,
        )
예제 #27
0
def merge_gbk(gbk_records, filter_size=0, gi=False):

    '''
    Merge multiple contigs into a single DNA molecule.

    Contigs are joined in input order with 200 "N" between consecutive kept
    contigs; every spacer is annotated with an "assembly_gap" feature.
    Contigs whose length is not greater than <filter_size> are dropped
    (a single input record is returned as is, without filtering).
    id, name, description and annotations are taken from the first input
    record; its leading source feature, when kept, is stretched over the
    whole merged sequence.

    Side effects: the leading source feature is removed from every kept
    record except the first kept one, and when only one record is used the
    returned object is that input record itself (modified in place).

    :param gbk_records: list of SeqRecord objects (one per contig)
    :param filter_size: contigs must be longer than this to be kept
    :param gi: optional GI number stored in annotations["gi"] when truthy
    :return: the merged SeqRecord
    :raises ValueError: if no record is given or all of them are filtered out
    '''

    from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition

    spacer_length = 200

    if not gbk_records:
        raise ValueError("merge_gbk: no record to merge")

    if len(gbk_records) == 1:
        merged_rec = gbk_records[0]
    else:
        # Filter first, so that spacers are only inserted *between* kept
        # contigs (no trailing spacer when the last contigs are too small).
        kept = [rec for rec in gbk_records if len(rec) > filter_size]
        if not kept:
            raise ValueError(
                "merge_gbk: no contig longer than %s bp" % filter_size)

        merged_rec = kept[0]
        for rec in kept[1:]:
            # Adding a string to a SeqRecord returns a new record, so the
            # first input record is not extended in place.
            merged_rec += "N" * spacer_length
            gap_end = len(merged_rec)
            gap_location = FeatureLocation(
                ExactPosition(gap_end - spacer_length),
                ExactPosition(gap_end))
            merged_rec.features.append(
                SeqFeature(gap_location, type="assembly_gap"))

            # Only the first kept contig retains its source feature.
            if rec.features and rec.features[0].type == 'source':
                rec.features.pop(0)
            merged_rec += rec

    first_rec = gbk_records[0]

    # Prefer the last accession as identifier, fall back on the record id.
    try:
        identifier = first_rec.annotations["accessions"][-1]
    except (KeyError, IndexError):
        identifier = first_rec.id
    merged_rec.id = identifier
    merged_rec.name = identifier

    merged_rec.description = "%s" % first_rec.annotations["organism"]

    # Copy the annotations *before* adding the gi, so the gi is not
    # discarded by the reassignment and the first input record's
    # annotations are not modified through a shared dict.
    merged_rec.annotations = dict(first_rec.annotations)
    if gi:
        merged_rec.annotations["gi"] = gi

    # Stretch the source feature over the whole merged molecule. Any other
    # leading feature is left untouched.
    if merged_rec.features and merged_rec.features[0].type == 'source':
        merged_rec.features[0].location = FeatureLocation(
            ExactPosition(0), ExactPosition(len(merged_rec)))

    return merged_rec