Python SeqFeature.CompoundLocation Beispiele

Programmiersprache: Python

Namespace / Paketname: Bio

Klasse / Typ: SeqFeature

Methode / Funktion: CompoundLocation

Beispiele auf hotexamples.com: 2

Python SeqFeature.CompoundLocation – 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für Bio.SeqFeature.CompoundLocation, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

FeatureLocation(30)

SeqFeature(30)

ExactPosition(19)

AfterPosition(8)

Reference(8)

BeforePosition(7)

UnknownPosition(4)

BetweenPosition(3)

UncertainPosition(3)

extract(3)

CompoundLocation(2)

WithinPosition(2)

qualifiers(2)

Beispiel #1

Datei anzeigen

def _retrieve_features(adaptor, primary_id):
    """Load all sequence features of one bioentry from a BioSQL database.

    Arguments:
     - adaptor - database adaptor providing ``execute_and_fetchall(sql, args)``.
     - primary_id - the ``bioentry_id`` whose features should be fetched.

    Returns a list of ``SeqFeature`` objects in ``rank`` order. Each feature
    carries its qualifiers (``db_xref`` entries included), its location and
    the database key in the private ``_seqfeature_id`` attribute.

    Raises ValueError if a stored strand is anything other than +1, -1,
    0 or NULL.
    """
    sql = "SELECT seqfeature_id, type.name, rank" \
          " FROM seqfeature join term type on (type_term_id = type.term_id)" \
          " WHERE bioentry_id = %s" \
          " ORDER BY rank"
    results = adaptor.execute_and_fetchall(sql, (primary_id,))
    seq_feature_list = []
    for seqfeature_id, seqfeature_type, _rank in results:
        # Get qualifiers [except for db_xref which is stored separately]
        qvs = adaptor.execute_and_fetchall(
            "SELECT name, value"
            " FROM seqfeature_qualifier_value  join term using (term_id)"
            " WHERE seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        qualifiers = {}
        for qv_name, qv_value in qvs:
            qualifiers.setdefault(qv_name, []).append(qv_value)

        # Get db_xrefs [special case of qualifiers]
        qvs = adaptor.execute_and_fetchall(
            "SELECT dbxref.dbname, dbxref.accession"
            " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
            " WHERE seqfeature_dbxref.seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        for qv_name, qv_value in qvs:
            value = "%s:%s" % (qv_name, qv_value)
            qualifiers.setdefault("db_xref", []).append(value)

        # Get locations. A separate name is used so the outer ``results``
        # being iterated is not rebound inside its own loop.
        location_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, start_pos, end_pos, strand"
            " FROM location"
            " WHERE seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        locations = []
        # Convert to Python standard form.
        # Convert strand = 0 to strand = None
        # re: comment in Loader.py:
        # Biopython uses None when we don't know strand information but
        # BioSQL requires something (non null) and sets this as zero
        # So we'll use the strand or 0 if Biopython spits out None
        for location_id, start, end, strand in location_rows:
            # A NULL (or zero) start is left untouched.
            if start:
                start -= 1
            if strand == 0:
                strand = None
            if strand not in (+1, -1, None):
                raise ValueError("Invalid strand %s found in database for "
                                 "seqfeature_id %s" % (strand, seqfeature_id))
            # Only compare when both ends are known: ordering None against
            # an int raises TypeError on Python 3.
            if start is not None and end is not None and end < start:
                import warnings
                from Bio import BiopythonWarning
                warnings.warn("Inverted location start/end (%i and %i) for "
                              "seqfeature_id %s" % (start, end, seqfeature_id),
                              BiopythonWarning)
            locations.append((location_id, start, end, strand))

        # Get possible remote reference information
        remote_results = adaptor.execute_and_fetchall(
            "SELECT location_id, dbname, accession, version"
            " FROM location join dbxref using (dbxref_id)"
            " WHERE seqfeature_id = %s", (seqfeature_id,))
        lookup = {}
        for location_id, dbname, accession, version in remote_results:
            if version and version != "0":
                v = "%s.%s" % (accession, version)
            else:
                v = accession
            # subfeature remote location db_ref are stored as an empty string
            # when not present
            if dbname == "":
                dbname = None
            lookup[location_id] = (dbname, v)

        feature = SeqFeature.SeqFeature(type=seqfeature_type)
        # Store the key as a private property
        feature._seqfeature_id = seqfeature_id
        feature.qualifiers = qualifiers
        if len(locations) == 1:
            location_id, start, end, strand = locations[0]
            # See Bug 2677, we currently don't record the location_operator
            # For consistency with older versions Biopython, default to "".
            feature.location_operator = \
                _retrieve_location_qualifier_value(adaptor, location_id)
            dbname, version = lookup.get(location_id, (None, None))
            feature.location = SeqFeature.FeatureLocation(start, end)
            feature.strand = strand
            feature.ref_db = dbname
            feature.ref = version
        elif locations:
            # Several locations: build one sub-feature per stored location
            # and combine their locations into a compound location.
            sub_features = feature.sub_features
            assert sub_features == []
            for location_id, start, end, strand in locations:
                dbname, version = lookup.get(location_id, (None, None))
                subfeature = SeqFeature.SeqFeature()
                subfeature.type = seqfeature_type
                subfeature.location = SeqFeature.FeatureLocation(start, end)
                subfeature.strand = strand
                subfeature.ref_db = dbname
                subfeature.ref = version
                sub_features.append(subfeature)
            # Locations are in order, but because of remote locations for
            # sub-features they are not necessarily in numerical order:
            strands = {sub.strand for sub in sub_features}
            if len(strands) == 1 and -1 in strands:
                # Evil hack time for backwards compatibility
                # TODO - Check if BioPerl and (old) Biopython did the same,
                # we may have an existing incompatibility lurking here...
                locs = [sub.location for sub in sub_features[::-1]]
            else:
                # All forward, or mixed strands
                locs = [sub.location for sub in sub_features]
            # TODO - See Bug 2677 - we don't yet record location_operator,
            # so for consistency with older versions of Biopython default
            # to assuming its a join. The second argument of
            # CompoundLocation is the operator, not the feature type.
            feature.location = SeqFeature.CompoundLocation(locs, "join")
            feature.location_operator = "join"
        # With no stored location the feature is kept without one.
        seq_feature_list.append(feature)

    return seq_feature_list

Beispiel #2

Datei anzeigen

def _build_exon_table(gff_database):
    """Return a DataFrame with one row per exon found in *gff_database*."""
    exon_rows = []
    for exon in gff_database.features_of_type('exon'):
        exon_rows.append({
            'exon_id': exon.id,
            # Strip the 'exon:' prefix and the last two characters of the ID;
            # presumably an exon-number suffix - TODO confirm against the GFF.
            'gene_id': exon.id.replace('exon:', '')[:-2],
            # Subtract one from the GFF start to get the position expected
            # by FeatureLocation.
            'start': exon.start - 1,
            'end': exon.end,
            # strand_change converts the GFF strand string to an int.
            'strand': strand_change(exon.strand),
        })
    # Built in one go: DataFrame.append no longer exists in pandas >= 2.0.
    return pa.DataFrame(
        exon_rows, columns=['exon_id', 'gene_id', 'start', 'end', 'strand'])


def _add_go_qualifiers(feature, go_terms, go_namespaces, go_alternatives):
    """Store *go_terms* on *feature*, split by Gene Ontology namespace."""
    by_namespace = {
        'cellular_component': [],
        'molecular_function': [],
        'biological_process': [],
    }
    for go in go_terms:
        # A term absent from go_namespaces is treated as a deprecated ID and
        # resolved through go_alternatives; a term unknown to both mappings
        # raises KeyError.
        go_test = go if go in go_namespaces else go_alternatives[go]
        namespace = go_namespaces[go_test]
        if namespace in by_namespace:
            # The ID as written in the annotation table is what gets stored.
            by_namespace[namespace].append(go)
    feature.qualifiers['go_component'] = by_namespace['cellular_component']
    feature.qualifiers['go_function'] = by_namespace['molecular_function']
    feature.qualifiers['go_process'] = by_namespace['biological_process']


def gff_to_gbk(genome_fasta, prot_fasta, annot_table, gff_file, species_name,
               gbk_out):
    """Create a GenBank file from a genome, its proteins and a GFF annotation.

    Args:
        genome_fasta: fasta file containing each contig of the genome.
        prot_fasta: fasta file containing each protein sequence; record IDs
            are looked up with the mRNA IDs of the GFF.
        annot_table: tab-separated table associating gene names with
            GO terms, InterPro and EC annotations.
        gff_file: GFF file (gene, exon, mRNA, ncRNA, tRNA, ...).
        species_name: species name used to query the taxonomic data.
        gbk_out: destination handed to ``SeqIO.write`` for the GenBank output.

    Raises:
        KeyError: if an mRNA has no entry in ``prot_fasta``, or if a GO term
            is found neither among current nor alternative GO IDs.
    """
    print('Creating GFF database (gffutils)')
    # Create the gff database file.
    # gffutils uses a sqlite3 database to access the data inside the GFF;
    # ':memory:' asks gffutils to keep it in memory instead of writing a file.
    gff_database = gffutils.create_db(gff_file,
                                      ':memory:',
                                      force=True,
                                      keep_order=True,
                                      merge_strategy='merge',
                                      sort_attribute_values=True)

    print('Formatting fasta and annotation file')
    # Dictionary with scaffold/chromosome id as key and sequence as value.
    contig_seqs = OrderedDict()
    for record in SeqIO.parse(genome_fasta, "fasta"):
        contig_seqs[record.id] = record.seq

    # Dictionary with gene id as key and protein sequence as value.
    gene_protein_seq = {}
    for record in SeqIO.parse(prot_fasta, "fasta"):
        gene_protein_seq[record.id] = record.seq

    # Create a taxonomy dictionary querying the EBI.
    species_informations = create_taxonomic_data(species_name)

    # Read a tsv file containing GO terms, Interpro and EC associated with
    # gene name. Missing cells become empty strings.
    mapping_data = pa.read_csv(annot_table, sep='\t')
    mapping_data.replace(np.nan, '', inplace=True)

    gene_column, go_column, ec_column, ipr_column = find_column_of_interest(
        mapping_data)

    mapping_data.set_index(gene_column, inplace=True)
    # Dictionaries with gene id as key and GO terms/Interpro/EC as value.
    annot_GOs = mapping_data[go_column].to_dict()
    annot_IPRs = mapping_data[ipr_column].to_dict()
    annot_ECs = mapping_data[ec_column].to_dict()

    # Query Gene Ontology to extract namespaces and alternative IDs.
    df_go_namespace, df_go_alternative = create_GO_dataframes()
    # Dictionary with GO id as key and GO namespace as value.
    df_go_namespace.set_index('GO', inplace=True)
    go_namespaces = df_go_namespace['namespace'].to_dict()

    # Dictionary with GO id as key and alternative GO id as value.
    df_go_alternative.set_index('GO', inplace=True)
    go_alternatives = df_go_alternative['alternative_GO'].to_dict()

    print('Searching for exons')
    # Dataframe containing each exon (gene, start, end and strand).
    df_exons = _build_exon_table(gff_database)

    # Group the genes by contig once, instead of scanning every gene of the
    # database again for each contig. Database order is kept inside a contig.
    genes_by_contig = {}
    for gene in gff_database.features_of_type('gene'):
        genes_by_contig.setdefault(gene.chrom, []).append(gene)

    # All SeqRecord objects are stored in a list and then given to the SeqIO
    # writer to create the genbank.
    seq_objects = []

    print('Assembling Genbank informations')

    # Iterate through each contig, then through its genes and the RNAs linked
    # to each gene, then look whether protein information is available.
    for contig_id in sorted(contig_seqs):
        # Data for each contig.
        record = contig_info(contig_id, contig_seqs[contig_id],
                             species_informations)
        for gene in genes_by_contig.get(contig_id, []):
            id_gene = gene.id
            start_position = gene.start - 1
            end_position = gene.end
            strand = strand_change(gene.strand)
            new_feature_gene = sf.SeqFeature(
                sf.FeatureLocation(start_position, end_position, strand),
                type="gene")
            new_feature_gene.qualifiers['locus_tag'] = id_gene
            # Add gene information to contig record.
            record.features.append(new_feature_gene)

            # Search and add RNAs.
            gene_informations = [
                gene, id_gene, start_position, end_position, strand
            ]
            for rna_type in ('mRNA', 'tRNA', 'ncRNA', 'lncRNA'):
                record = search_and_add_RNA(gff_database, gene_informations,
                                            record, rna_type)

            # Search for pseudogene and add them.
            record = search_and_add_pseudogene(gff_database, gene, record,
                                               df_exons, gene_protein_seq)

            # Use parent mRNA in gff to find CDS: one CDS per isoform.
            for mrna in gff_database.children(gene,
                                              featuretype="mRNA",
                                              order_by='start'):
                mrna_id = mrna.id
                # Exons of this isoform only. The list is rebuilt for every
                # mRNA so exons of a previous isoform never leak into the
                # CDS of the next one.
                mrna_exons = df_exons[df_exons['gene_id'] == mrna_id]
                location_exons = [
                    sf.FeatureLocation(row['start'], row['end'],
                                       row['strand'])
                    for _, row in mrna_exons.iterrows()
                ]

                # Create the CDS from the exons; with fewer than two exons
                # fall back on the gene coordinates.
                if len(location_exons) >= 2:
                    new_feature_cds = sf.SeqFeature(
                        sf.CompoundLocation(location_exons, operator='join'),
                        type='CDS')
                else:
                    new_feature_cds = sf.SeqFeature(
                        sf.FeatureLocation(start_position, end_position,
                                           strand),
                        type="CDS")

                new_feature_cds.qualifiers['translation'] = \
                    gene_protein_seq[mrna_id]
                new_feature_cds.qualifiers['locus_tag'] = id_gene

                # Add GO annotation according to the namespace.
                if mrna_id in annot_GOs:
                    gene_gos = re.split(';|,', annot_GOs[mrna_id])
                    if gene_gos != [""]:
                        _add_go_qualifiers(new_feature_cds, gene_gos,
                                           go_namespaces, go_alternatives)

                # Add InterPro annotation.
                if mrna_id in annot_IPRs:
                    gene_iprs = re.split(';|,', annot_IPRs[mrna_id])
                    if gene_iprs != [""]:
                        new_feature_cds.qualifiers['db_xref'] = [
                            "InterPro:" + interpro for interpro in gene_iprs
                        ]

                # Add EC annotation.
                if mrna_id in annot_ECs:
                    gene_ecs = re.split(';|,', annot_ECs[mrna_id])
                    if gene_ecs != [""]:
                        new_feature_cds.qualifiers['EC_number'] = [
                            ec.replace('ec:', '') for ec in gene_ecs
                        ]

                # Add CDS information to contig record
                record.features.append(new_feature_cds)

        seq_objects.append(record)

    # Create Genbank with the list of SeqRecord.
    SeqIO.write(seq_objects, gbk_out, 'genbank')