Beispiel #1
0
def _retrieve_features(adaptor, primary_id):
    """Load the sequence features of one bioentry from the database.

    For every ``seqfeature`` row attached to ``primary_id`` (in ``rank``
    order) this fetches the qualifiers, the db_xrefs, the locations and any
    remote-reference details, and assembles them into a
    ``SeqFeature.SeqFeature`` object.

    Args:
        adaptor: object exposing ``execute_and_fetchall(sql, args)``.
        primary_id: value matched against ``seqfeature.bioentry_id``.

    Returns:
        A list with one ``SeqFeature.SeqFeature`` per seqfeature row.  A
        feature without any location row gets no location assigned; a
        feature with one location row gets a ``FeatureLocation``; a feature
        with several gets sub-features plus a ``CompoundLocation``.

    Raises:
        ValueError: if a location row holds a strand other than +1, -1, 0
            or NULL.
    """
    feature_rows = adaptor.execute_and_fetchall(
        "SELECT seqfeature_id, type.name, rank"
        " FROM seqfeature join term type on (type_term_id = type.term_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))
    seq_feature_list = []
    for seqfeature_id, seqfeature_type, seqfeature_rank in feature_rows:
        # Get qualifiers [except for db_xref which is stored separately]
        qualifier_rows = adaptor.execute_and_fetchall(
            "SELECT name, value"
            " FROM seqfeature_qualifier_value  join term using (term_id)"
            " WHERE seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        qualifiers = {}
        for qv_name, qv_value in qualifier_rows:
            qualifiers.setdefault(qv_name, []).append(qv_value)

        # Get db_xrefs [special case of qualifiers]
        dbxref_rows = adaptor.execute_and_fetchall(
            "SELECT dbxref.dbname, dbxref.accession"
            " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
            " WHERE seqfeature_dbxref.seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        for dbname, accession in dbxref_rows:
            value = "%s:%s" % (dbname, accession)
            qualifiers.setdefault("db_xref", []).append(value)

        # Get locations.  A dedicated name is used for the rows so the list
        # driving the enclosing loop is never rebound.
        location_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, start_pos, end_pos, strand"
            " FROM location"
            " WHERE seqfeature_id = %s"
            " ORDER BY rank", (seqfeature_id,))
        locations = []
        # Convert to Python standard form.
        # Convert strand = 0 to strand = None
        # re: comment in Loader.py:
        # Biopython uses None when we don't know strand information but
        # BioSQL requires something (non null) and sets this as zero
        # So we'll use the strand or 0 if Biopython spits out None
        for location_id, start, end, strand in location_rows:
            if start:
                # Shift a truthy start down by one; a start of 0 or None is
                # left untouched.
                start -= 1
            if strand == 0:
                strand = None
            if strand not in (+1, -1, None):
                raise ValueError("Invalid strand %s found in database for "
                                 "seqfeature_id %s" % (strand, seqfeature_id))
            # Only compare when both ends are known: comparing against None
            # raises TypeError on Python 3 (and "%i" cannot format None).
            if start is not None and end is not None and end < start:
                # Imported lazily; only needed on this rare path.
                import warnings
                from Bio import BiopythonWarning
                warnings.warn("Inverted location start/end (%i and %i) for "
                              "seqfeature_id %s" % (start, end, seqfeature_id),
                              BiopythonWarning)
            locations.append((location_id, start, end, strand))

        # Get possible remote reference information
        remote_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, dbname, accession, version"
            " FROM location join dbxref using (dbxref_id)"
            " WHERE seqfeature_id = %s", (seqfeature_id,))
        lookup = {}
        for location_id, dbname, accession, version in remote_rows:
            if version and version != "0":
                v = "%s.%s" % (accession, version)
            else:
                v = accession
            # subfeature remote location db_ref are stored as a empty string
            # when not present
            if dbname == "":
                dbname = None
            lookup[location_id] = (dbname, v)

        feature = SeqFeature.SeqFeature(type=seqfeature_type)
        # Store the key as a private property
        feature._seqfeature_id = seqfeature_id
        feature.qualifiers = qualifiers
        if len(locations) == 1:
            location_id, start, end, strand = locations[0]
            # See Bug 2677: the operator is taken from whatever
            # _retrieve_location_qualifier_value returns for this location.
            feature.location_operator = \
                _retrieve_location_qualifier_value(adaptor, location_id)
            dbname, version = lookup.get(location_id, (None, None))
            feature.location = SeqFeature.FeatureLocation(start, end)
            feature.strand = strand
            feature.ref_db = dbname
            feature.ref = version
        elif locations:
            sub_features = feature.sub_features
            assert sub_features == []
            for location_id, start, end, strand in locations:
                dbname, version = lookup.get(location_id, (None, None))
                subfeature = SeqFeature.SeqFeature()
                subfeature.type = seqfeature_type
                subfeature.location = SeqFeature.FeatureLocation(start, end)
                subfeature.strand = strand
                subfeature.ref_db = dbname
                subfeature.ref = version
                sub_features.append(subfeature)
            # Locations are in order, but because of remote locations for
            # sub-features they are not necessarily in numerical order:
            strands = {sub.strand for sub in sub_features}
            if len(strands) == 1 and -1 in strands:
                # Evil hack time for backwards compatibility
                # TODO - Check if BioPerl and (old) Biopython did the same,
                # we may have an existing incompatibility lurking here...
                locs = [sub.location for sub in sub_features[::-1]]
            else:
                # All forward, or mixed strands
                locs = [sub.location for sub in sub_features]
            feature.location = SeqFeature.CompoundLocation(
                locs, seqfeature_type)
            # TODO - See Bug 2677 - we don't yet record location_operator,
            # so for consistency with older versions of Biopython default
            # to assuming its a join.
            feature.location_operator = "join"
        # A feature with no location rows is appended without a location.
        seq_feature_list.append(feature)

    return seq_feature_list
Beispiel #2
0
def gff_to_gbk(genome_fasta, prot_fasta, annot_table, gff_file, species_name,
               gbk_out):
    """
    Create a GenBank file from a genome, its proteins, annotations and a GFF.

    Args:
        genome_fasta: fasta file parsed with SeqIO; one record per contig.
        prot_fasta: fasta file parsed with SeqIO; record ids are used as
            keys to find the translation of each mRNA.
        annot_table: tab-separated table read with pandas; the gene, GO,
            EC and InterPro columns are located by find_column_of_interest.
        gff_file: GFF file loaded into an in-memory gffutils database;
            'gene', 'exon' and 'mRNA' features are read here, other RNA
            types and pseudogenes are handled by helper functions.
        species_name: passed to create_taxonomic_data to build the species
            information attached to every contig record.
        gbk_out: destination handed to SeqIO.write (format 'genbank').

    Raises:
        KeyError: if an mRNA id has no entry in the protein fasta, or a GO
            id is found neither in the GO namespaces nor in the alternative
            ids.
    """

    print('Creating GFF database (gffutils)')
    # Create the gff database file.
    # gffutils use sqlite3 file-based database to access data inside GFF.
    # ':memory:' ask gffutils to keep database in memory instead of writing
    # in a file.
    gff_database = gffutils.create_db(gff_file,
                                      ':memory:',
                                      force=True,
                                      keep_order=True,
                                      merge_strategy='merge',
                                      sort_attribute_values=True)

    print('Formatting fasta and annotation file')
    # Dictionary with scaffold/chromosome id as key and sequence as value.
    contig_seqs = OrderedDict(
        (record.id, record.seq)
        for record in SeqIO.parse(genome_fasta, "fasta"))

    # Dictionary with gene id as key and protein sequence as value.
    gene_protein_seq = {
        record.id: record.seq
        for record in SeqIO.parse(prot_fasta, "fasta")
    }

    # Taxonomy data for the species, built by create_taxonomic_data.
    species_informations = create_taxonomic_data(species_name)

    # Read a tsv file containing GO terms, Interpro and EC associated with
    # gene name. Missing cells become empty strings so they can be split.
    mapping_data = pa.read_csv(annot_table, sep='\t')
    mapping_data.replace(np.nan, '', inplace=True)

    gene_column, go_column, ec_column, ipr_column = find_column_of_interest(
        mapping_data)

    mapping_data.set_index(gene_column, inplace=True)
    # Dictionary with gene id as key and GO terms/Interpro/EC as value.
    annot_GOs = mapping_data[go_column].to_dict()
    annot_IPRs = mapping_data[ipr_column].to_dict()
    annot_ECs = mapping_data[ec_column].to_dict()

    # GO namespaces and alternative IDs, built by create_GO_dataframes.
    df_go_namespace, df_go_alternative = create_GO_dataframes()
    # Dictionary GO id as term and GO namespace as value.
    df_go_namespace.set_index('GO', inplace=True)
    go_namespaces = df_go_namespace['namespace'].to_dict()

    # Dictionary GO id as term and GO alternatives id as value.
    df_go_alternative.set_index('GO', inplace=True)
    go_alternatives = df_go_alternative['alternative_GO'].to_dict()

    print('Searching for exons')

    # Search for all exons in gff database and extract start position (have
    # to minus one to get the right position), the end position, the strand
    # (converted by strand_change) and the parent ID (exon ID without its
    # 'exon:' prefix and its last two characters).
    exon_columns = ['exon_id', 'gene_id', 'start', 'end', 'strand']
    exon_rows = []
    # Parent ID -> [(start, end, strand), ...] in database order, so the CDS
    # of each mRNA can be built without filtering the whole dataframe.
    exons_by_parent = {}
    for exon in gff_database.features_of_type('exon'):
        exon_start = exon.start - 1
        exon_end = exon.end
        exon_strand = strand_change(exon.strand)
        parent_id = exon.id.replace('exon:', '')[:-2]

        exon_rows.append({
            'exon_id': exon.id,
            'gene_id': parent_id,
            'start': exon_start,
            'end': exon_end,
            'strand': exon_strand
        })
        exons_by_parent.setdefault(parent_id, []).append(
            (exon_start, exon_end, exon_strand))

    # Dataframe of every exon (gene, start, end and strand); still needed by
    # search_and_add_pseudogene. Built in one go because DataFrame.append
    # no longer exists in pandas >= 2.0.
    df_exons = pa.DataFrame(exon_rows, columns=exon_columns)

    # Group genes by contig once instead of rescanning every gene of the
    # database for every contig. Order inside each contig is preserved.
    genes_by_contig = {}
    for gene in gff_database.features_of_type('gene'):
        genes_by_contig.setdefault(gene.chrom, []).append(gene)

    # Maps a GO namespace to the CDS qualifier that receives its terms.
    namespace_qualifiers = (
        ('cellular_component', 'go_component'),
        ('molecular_function', 'go_function'),
        ('biological_process', 'go_process'),
    )

    # All SeqRecord objects will be stored in a list and then given to the
    # SeqIO writer to create the genbank.
    seq_objects = []

    print('Assembling Genbank informations')

    # Iterate through each contig.
    # Then iterate through genes and through RNAs linked with the gene.
    for contig_id in sorted(contig_seqs):
        # Data for each contig.
        record = contig_info(contig_id, contig_seqs[contig_id],
                             species_informations)
        for gene in genes_by_contig.get(contig_id, ()):
            id_gene = gene.id
            start_position = gene.start - 1
            end_position = gene.end
            strand = strand_change(gene.strand)
            new_feature_gene = sf.SeqFeature(sf.FeatureLocation(
                start_position, end_position, strand),
                                             type="gene")
            new_feature_gene.qualifiers['locus_tag'] = id_gene
            # Add gene information to contig record.
            record.features.append(new_feature_gene)

            # Search and add RNAs.
            gene_informations = [
                gene, id_gene, start_position, end_position, strand
            ]
            for rna_type in ('mRNA', 'tRNA', 'ncRNA', 'lncRNA'):
                record = search_and_add_RNA(gff_database, gene_informations,
                                            record, rna_type)

            # Search for pseudogene and add them.
            record = search_and_add_pseudogene(gff_database, gene, record,
                                               df_exons, gene_protein_seq)

            # Use parent mRNA in gff to find CDS.
            # With this we take the isoform of gene.
            for mrna in gff_database.children(gene,
                                              featuretype="mRNA",
                                              order_by='start'):
                mrna_id = mrna.id
                # Exon locations are rebuilt for every mRNA so that one
                # isoform never inherits the exons of a previous isoform.
                location_exons = [
                    sf.FeatureLocation(exon_start, exon_end, exon_strand)
                    for exon_start, exon_end, exon_strand
                    in exons_by_parent.get(mrna_id, ())
                ]
                # Create CDS using exons; with fewer than two exons fall
                # back to the gene coordinates.
                if len(location_exons) >= 2:
                    new_feature_cds = sf.SeqFeature(
                        sf.CompoundLocation(location_exons, operator='join'),
                        type='CDS')
                else:
                    new_feature_cds = sf.SeqFeature(sf.FeatureLocation(
                        start_position, end_position, strand),
                                                    type="CDS")

                cds_qualifiers = new_feature_cds.qualifiers
                cds_qualifiers['translation'] = gene_protein_seq[mrna_id]
                cds_qualifiers['locus_tag'] = id_gene

                # Add GO annotation according to the namespace.
                if mrna_id in annot_GOs:
                    gene_gos = re.split(';|,', annot_GOs[mrna_id])
                    if gene_gos != [""]:
                        gos_by_namespace = {
                            namespace: []
                            for namespace, _ in namespace_qualifiers
                        }
                        for go in gene_gos:
                            # A GO id missing from the namespaces is looked
                            # up through its alternative id; the id written
                            # to the record stays the original one.
                            if go in go_namespaces:
                                go_test = go
                            else:
                                go_test = go_alternatives[go]
                            namespace = go_namespaces[go_test]
                            if namespace in gos_by_namespace:
                                gos_by_namespace[namespace].append(go)
                        for namespace, qualifier in namespace_qualifiers:
                            cds_qualifiers[qualifier] = \
                                gos_by_namespace[namespace]

                # Add InterPro annotation.
                if mrna_id in annot_IPRs:
                    gene_iprs = re.split(';|,', annot_IPRs[mrna_id])
                    if gene_iprs != [""]:
                        cds_qualifiers['db_xref'] = [
                            "InterPro:" + interpro for interpro in gene_iprs
                        ]

                # Add EC annotation.
                if mrna_id in annot_ECs:
                    gene_ecs = re.split(';|,', annot_ECs[mrna_id])
                    if gene_ecs != [""]:
                        cds_qualifiers['EC_number'] = [
                            ec.replace('ec:', '') for ec in gene_ecs
                        ]

                # Add CDS information to contig record
                record.features.append(new_feature_cds)

        seq_objects.append(record)

    # Create Genbank with the list of SeqRecord.
    SeqIO.write(seq_objects, gbk_out, 'genbank')