Ejemplo n.º 1
0
def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    """Load a TriTrypDB genome (GFF annotation + FASTA) into the database.

    Creates and saves the genome document from the first GFF record, saves
    one contig document per GFF record, inserts one ``Protein`` document per
    ``(protein, cds_feature)`` pair yielded by ``tritryp_protein_iter`` and
    finally calls ``_common_annotations(name, tmp_dir)``.

    Args:
        name: Genome name; also stored as each protein's ``organism`` and
            used to build the default working directory.
        gff: GFF annotation, passed to ``BCBio.GFF.parse``.
        fasta: FASTA input, passed to ``sp``; the resulting records are
            indexed by ``id`` and handed to the GFF parser as ``base_dict``.
        tax: Taxonomy value forwarded to ``BioDocFactory.create_genome``.
        tmp_dir: Working directory. Defaults to ``"/tmp/<name>/"``.

    Raises:
        IndexError: If the GFF input yields no records.
        KeyError: If a protein id is missing from the gene ids returned by
            ``BioDocFactory.create_contig``.
        Exception: If a protein sequence is longer than 30000 residues.
    """
    from BCBio import GFF
    import re

    genome = {x.id: x for x in sp(fasta)}
    annotation = list(GFF.parse(gff, base_dict=genome))

    # The genome-level document is built from the first annotated record.
    seqCol = BioDocFactory.create_genome(name, annotation[0], tax, Tax)
    seqCol.save()

    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)

    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            # Very large sequences are blanked before the contig is stored.
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: "gene",
                    "exon": "exon",
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA",
                    "snoRNA": "snoRNA",
                    "three_prime_UTR": "three_prime_UTR",
                    "five_prime_UTR": "five_prime_UTR",
                })
            gene_ids.update(gene_ids2)
            contigDoc.save()

    # Loop invariants. Every one of the four EC fields may span several
    # characters (e.g. "1.1.1.100"), so each field carries a "+" quantifier.
    ec_pattern = re.compile(r'^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]+$')
    root_go_terms = ("GO:0008150", "GO:0003674", "GO:0005575")
    batch_size = 1000

    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:
            qualifiers = cds_f.qualifiers

            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            # Description priority: "description", then "Note", then
            # "product"; empty when none of them is present.
            if "description" in qualifiers:
                protein_description = qualifiers['description'][0]
            elif "Note" in qualifiers:
                protein_description = qualifiers['Note'][0]
            elif "product" in qualifiers:
                protein_description = qualifiers['product'][0]
            else:
                protein_description = ""
            protDoc.description = protein_description

            # GO terms are stored lowercased; the three listed root terms
            # are skipped.
            gos = []
            if "Ontology_term" in qualifiers:
                gos = [
                    x.lower() for x in qualifiers["Ontology_term"]
                    if "GO:" in x and x not in root_go_terms
                ]

            # The first space-separated token of the first "Note" is kept
            # as an EC number when it matches the EC pattern.
            note = ""
            if "Note" in qualifiers:
                note = qualifiers["Note"][0].split(" ")[0]
            ecs = ["ec:" + note] if ec_pattern.match(note) else []

            protDoc.gene = [protein.id]
            protDoc.ontologies = list(set(ecs + gos))
            protDoc.alias = [protein.id]

            if len(protDoc.seq) > 30000:
                raise Exception("No existen proteinas tan largas...")
            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)

            # Flush on the size of the pending batch instead of on the
            # progress-bar counter, which is not guaranteed to be refreshed
            # on every iteration.
            if len(prots) >= batch_size:
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    _common_annotations(name, tmp_dir)
Ejemplo n.º 2
0
    def create_protein(cls, seqrecord, feature, exons=[]):
        """Build a ``Protein`` document from a sequence record and a feature.

        The protein sequence is ``str(seqrecord.seq)`` taken as-is; nothing
        is translated here. Name, description, cross references, EC numbers
        and GO terms are read from ``feature.qualifiers``.

        Args:
            seqrecord: Record whose ``seq`` becomes the protein sequence.
            feature: Feature whose ``qualifiers`` mapping provides the
                annotation; it must contain ``"locus_tag"``.
            exons: Not used by the current implementation.

        Returns:
            The populated ``Protein`` instance. This method does not save it.

        Raises:
            KeyError: If ``"locus_tag"`` is missing, or if
                ``"gene_symbol_source"`` is not an integer and
                ``"gene_symbol"`` is missing.
        """
        qualifiers = feature.qualifiers
        alias = cls.alias(feature)

        locus_tag = qualifiers["locus_tag"][0]
        protein_name = locus_tag
        seq = str(seqrecord.seq)

        if "gene_symbol_source" in qualifiers:
            # An integer source leaves the protein unnamed; any other value
            # means the gene symbol is used as the name.
            try:
                int(qualifiers["gene_symbol_source"][0])
            except (TypeError, ValueError):
                protein_name = qualifiers['gene_symbol'][0]
            else:
                protein_name = ""

        p = Protein(seq=seq, name=protein_name)

        # Description priority: "description", then "Note", then "product";
        # empty when none of them is present.
        if "description" in qualifiers:
            protein_description = qualifiers['description'][0]
        elif "Note" in qualifiers:
            protein_description = qualifiers['Note'][0]
        elif "product" in qualifiers:
            protein_description = qualifiers['product'][0]
        else:
            protein_description = ""

        bp = BioProperty(
            _type="annotation",
            description="homolog proteins and sources used for the annotation")

        if "protein_id" in qualifiers:
            bp.ncbi_protein_id = qualifiers["protein_id"][0]

        # When both spellings are present, "Dbxref" overrides "db_xref".
        if "db_xref" in qualifiers:
            bp.ncbi_db_xref = qualifiers["db_xref"][0]
        if "Dbxref" in qualifiers:
            bp.ncbi_db_xref = qualifiers["Dbxref"][0]

        if "top_cog_hit" in qualifiers:
            bp.cog = qualifiers["top_cog_hit"][0]

        # When both are present, "gene_product_name_source" overrides
        # "gene_symbol_source".
        if "gene_symbol_source" in qualifiers:
            bp.source = qualifiers["gene_symbol_source"][0]
        if "gene_product_name_source" in qualifiers:
            bp.source = qualifiers["gene_product_name_source"][0]

        def _as_ec(value):
            # Lowercase the value and make sure it carries an "ec" prefix.
            lowered = value.lower()
            return lowered if lowered.startswith("ec") else "ec:" + lowered

        # NOTE(review): every "Dbxref" value containing a "." is treated as
        # an EC number, which presumably also catches versioned accessions;
        # confirm this is intended.
        ecs = []
        for key in ("EC", "EC_number", "Dbxref"):
            if key in qualifiers:
                ecs.extend(_as_ec(x) for x in qualifiers[key] if "." in x)

        # GO terms are accumulated from all three qualifiers (previously the
        # "Ontology_term" branch discarded what the other two had collected).
        # The three listed root terms are skipped.
        root_go_terms = ("GO:0008150", "GO:0003674", "GO:0005575")
        gos = []
        for key in ("db_xref", "GO", "Ontology_term"):
            if key in qualifiers:
                gos.extend(
                    x.lower() for x in qualifiers[key]
                    if "GO:" in x and x not in root_go_terms)

        if locus_tag != protein_name:
            p.gene = [locus_tag, protein_name]
        else:
            p.gene = [locus_tag]

        p.name = protein_name
        p.description = protein_description
        p.ontologies = list(set(ecs + gos))
        p.properties = [bp]
        p.alias = alias

        return p