def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    """Import a TriTrypDB genome (GFF annotation + FASTA) into the database.

    The FASTA records are indexed by id and handed to the GFF parser as the
    base sequences. A genome document is created from the first annotated
    contig, every contig is stored through ``BioDocFactory.create_contig``
    and one ``Protein`` document is stored per ``(protein, cds_feature)``
    pair yielded by ``tritryp_protein_iter``. Finally
    ``_common_annotations(name, tmp_dir)`` is run.

    Args:
        name: Genome name; used as the collection name, as the ``organism``
            of every protein and to build the default working directory.
        gff: GFF source passed to ``GFF.parse``.
        fasta: FASTA source passed to ``sp``.
        tax: Taxonomy argument forwarded to ``BioDocFactory.create_genome``.
        tmp_dir: Working directory. Defaults to ``/tmp/<name>/``.

    Raises:
        IndexError: If the GFF yields no records.
        KeyError: If a protein id has no gene id among the stored contigs.
        Exception: If a protein sequence is longer than 30000 residues.
    """
    from BCBio import GFF
    import re

    # Root terms of the three GO namespaces carry no information; skip them.
    go_roots = ("GO:0008150", "GO:0003674", "GO:0005575")
    # Four dot-separated EC components; every component after the first may
    # be a number or "-". Raw string so that "\." and "\-" are regex escapes
    # rather than (invalid) string escapes, and a "+" on the last component
    # so that multi-digit serials such as 1.1.1.27 are accepted.
    ec_pattern = re.compile(r'^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]+$')
    batch_size = 1000

    genome = {x.id: x for x in sp(fasta)}
    annotation = list(GFF.parse(gff, base_dict=genome))
    contig = annotation[0]

    seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
    seqCol.save()

    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)

    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            # Sequences above 15,000,000 in length are blanked before storing.
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: "gene",
                    "exon": "exon",
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA",
                    "snoRNA": "snoRNA",
                    "three_prime_UTR": "three_prime_UTR",
                    "five_prime_UTR": "five_prime_UTR",
                })
            gene_ids.update(gene_ids2)
            contigDoc.save()

    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:
            qualifiers = cds_f.qualifiers
            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            # First available qualifier wins: description, Note, product.
            protein_description = ""
            for key in ("description", "Note", "product"):
                if key in qualifiers:
                    protein_description = qualifiers[key][0]
                    break
            protDoc.description = protein_description

            gos = []
            if "Ontology_term" in qualifiers:
                gos = [
                    x.lower() for x in qualifiers["Ontology_term"]
                    if "GO:" in x and x not in go_roots
                ]

            # The first space-separated token of the Note is taken as an EC
            # number when it has the EC shape.
            note = qualifiers["Note"][0].split(" ")[0] if "Note" in qualifiers else ""
            ecs = ["ec:" + note] if ec_pattern.match(note) else []

            protDoc.gene = [protein.id]
            protDoc.ontologies = list(set(ecs + gos))
            protDoc.alias = [protein.id]

            # Sanity check; the message reads "Proteins this long do not exist".
            if len(protDoc.seq) > 30000:
                raise Exception("No existen proteinas tan largas...")

            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)

            # Flush on the pending-list size rather than on pbar.n: tqdm only
            # refreshes its counter lazily, so it is not a reliable per-item
            # index.
            if len(prots) >= batch_size:
                Protein.objects.insert(prots)
                prots = []

    if prots:
        Protein.objects.insert(prots)

    _common_annotations(name, tmp_dir)
def create_protein(cls, seqrecord, feature, exons=None):
    """Build a ``Protein`` document from a sequence record and its feature.

    The protein name defaults to the feature's ``locus_tag``. When a
    ``gene_symbol_source`` qualifier is present, the name becomes the empty
    string if that qualifier parses as an integer, and the ``gene_symbol``
    qualifier otherwise. EC numbers and GO terms found in the qualifiers are
    merged, de-duplicated and stored as the protein ontologies, and the
    annotation provenance is stored in a single ``BioProperty``.

    Args:
        seqrecord: Record whose ``seq`` is stored verbatim as the protein
            sequence (no translation is performed here).
        feature: Feature whose ``qualifiers`` mapping supplies the metadata.
        exons: Currently unused; kept for interface compatibility.

    Returns:
        The populated, unsaved ``Protein``.

    Raises:
        KeyError: If the feature has no ``locus_tag`` qualifier, or if
            ``gene_symbol_source`` is non-numeric and ``gene_symbol`` is
            missing.
    """
    # Root terms of the three GO namespaces carry no information; skip them.
    go_roots = ("GO:0008150", "GO:0003674", "GO:0005575")

    qualifiers = feature.qualifiers
    alias = cls.alias(feature)
    locus_tag = qualifiers["locus_tag"][0]
    protein_name = locus_tag
    # NOTE(review): presumably seqrecord already holds the protein sequence,
    # since nothing is translated here - confirm against the callers.
    seq = str(seqrecord.seq)

    if "gene_symbol_source" in qualifiers:
        try:
            int(qualifiers["gene_symbol_source"][0])
        except (ValueError, TypeError):
            protein_name = qualifiers['gene_symbol'][0]
        else:
            # A numeric source means there is no usable gene symbol.
            protein_name = ""

    p = Protein(seq=seq, name=protein_name)

    # First available qualifier wins: description, Note, product.
    protein_description = ""
    for key in ("description", "Note", "product"):
        if key in qualifiers:
            protein_description = qualifiers[key][0]
            break

    bp = BioProperty(
        _type="annotation",
        description="homolog proteins and sources used for the annotation")
    if "protein_id" in qualifiers:
        bp.ncbi_protein_id = qualifiers["protein_id"][0]
    # "Dbxref" takes precedence over "db_xref" when both are present.
    if "db_xref" in qualifiers:
        bp.ncbi_db_xref = qualifiers["db_xref"][0]
    if "Dbxref" in qualifiers:
        bp.ncbi_db_xref = qualifiers["Dbxref"][0]
    if "top_cog_hit" in qualifiers:
        bp.cog = qualifiers["top_cog_hit"][0]
    # "gene_product_name_source" takes precedence over "gene_symbol_source".
    if "gene_symbol_source" in qualifiers:
        bp.source = qualifiers["gene_symbol_source"][0]
    if "gene_product_name_source" in qualifiers:
        bp.source = qualifiers["gene_product_name_source"][0]

    # Any value containing a dot is treated as an EC number, lower-cased and
    # given an "ec:" prefix unless it already starts with "ec".
    ecs = []
    for key in ("EC", "EC_number", "Dbxref"):
        if key not in qualifiers:
            continue
        for value in qualifiers[key]:
            if "." not in value:
                continue
            lowered = value.lower()
            ecs.append(lowered if lowered.startswith("ec") else "ec:" + lowered)

    # GO terms are accumulated from every source; previously an
    # "Ontology_term" qualifier discarded the terms gathered from the others.
    gos = []
    for key in ("db_xref", "GO", "Ontology_term"):
        if key in qualifiers:
            gos.extend(
                x.lower() for x in qualifiers[key]
                if "GO:" in x and x not in go_roots)

    if locus_tag != protein_name:
        p.gene = [locus_tag, protein_name]
    else:
        p.gene = [locus_tag]
    p.name = protein_name
    p.description = protein_description
    p.ontologies = list(set(ecs + gos))
    p.properties = [bp]
    p.alias = alias
    return p