def _import_protein_details(self, ms_job):
    """Create one MassSpectrometryProteinDetail per row of ``self.protein_details``.

    Every row is attached to the existing MassSpectrometryProtein whose wid
    is the slugified ``row["ID"]``.  The ``"Position (to mature prot.)"``
    column is read as ``"<start>-<end>"``; the stored length is
    ``end - start``.  ``ms_job`` is not used by this method.
    """
    for row in self.protein_details:
        if hasattr(self, "notify_progress"):
            message = "EasyProt: Importing Protein Details ({})".format(row["ID"])
            self.current += 1
            self.notify_progress(
                current=self.current, total=self.total, message=message)

        proteins = cmodels.MassSpectrometryProtein.objects.for_species(self.species)
        ref_protein = proteins.for_wid(slugify(row["ID"]))

        # The column holds two integers separated by a dash
        start, end = (
            int(bound) for bound in row["Position (to mature prot.)"].split("-"))

        fields = {
            "protein": ref_protein,
            "sequence": row["Sequence"],
            "sequence_ptm": row["Sequence + PTMs"],
            "coordinate": start,
            "length": end - start,
            "proteotypic": row["Proteotypic"],
            "zscore": row["z-score"],
            "delta_mass": row["Delta Mass (ppm)"],
            "mass": row["Experimental Mass (m/z)"],
            "charge": row["Charge"],
            "retention_time": row["Retention Time (min)"],
            "theoretical_mass": row["Theoretical Mass (Da)"],
            "missed_cleavages": row["Missed Cleavages"],
        }
        cmodels.MassSpectrometryProteinDetail.objects.get_or_create_with_revision(
            self.detail, **fields)
def handle(self, *args, **options):
    """Validate the shared command options and dispatch to ``handle_command``.

    Checks that ``wid`` and ``reason`` are present and that ``wid`` is
    already a valid slug, looks up the Species (or, when
    ``self.verify_species_exists`` is false, instantiates an unsaved one),
    saves a RevisionDetail for the acting user and finally calls
    ``self.handle_command(species, revdetail, *args, **options)``.

    Raises:
        CommandError: if ``wid`` or ``reason`` is missing, if slugifying
            ``wid`` would change it, or if the species does not exist while
            ``self.verify_species_exists`` is true.
    """
    if not options["wid"]:
        raise CommandError("wid argument is mandatory")
    if not options["reason"]:
        raise CommandError("reason is mandatory")

    wid = slugify(options["wid"])
    reason = options["reason"]

    if options["wid"] != wid:
        raise CommandError(
            "Wid {} contained invalid characters. Only letters, numbers and _ are allowed"
            .format(options["wid"]))

    try:
        species_obj = Species.objects.get(wid=wid)
    except Species.DoesNotExist:
        # Only a missing row means "species not found"; any other failure
        # (database errors, duplicate rows, interrupts) must propagate
        # instead of being mistaken for a new species.
        if self.verify_species_exists:
            raise CommandError("Species {} not found".format(wid))
        species_obj = Species(wid=wid)

    # Fall back to the built-in default user name when none was passed;
    # the value is written back because options is forwarded below.
    if not options["user"]:
        options["user"] = "******"

    revdetail = RevisionDetail()
    revdetail.user = UserProfile.objects.get(user__username=options["user"])
    revdetail.reason = reason
    revdetail.save()

    self.handle_command(species_obj, revdetail, *args, **options)
def _import_target_peptides(self, ms_job):
    """Import every entry of ``self.target_peptides`` as a Peptide of ``ms_job``.

    A "Target-Peptide" Type is fetched or created first and added to each
    peptide.  Peptide wids are ``"<1-based index>-<slug of Matched Proteins>"``.
    Each comma-separated name in "Matched Proteins" is linked to the peptide
    as an EntryBasicTextData value.
    """
    target_type = cmodels.Type.objects.for_wid("Target-Peptide", create=True)
    target_type.species = self.species
    target_type.save(self.detail)

    for number, item in enumerate(self.target_peptides, 1):
        if hasattr(self, "notify_progress"):
            message = "EasyProt: Importing Target Peptide ({})".format(
                item["Matched Proteins"])
            self.current += 1
            self.notify_progress(
                current=self.current, total=self.total, message=message)

        matched_proteins = item["Matched Proteins"]
        sequence = item["Sequence"]

        peptide_wid = "{}-{}".format(number, slugify(matched_proteins))
        peptide = cmodels.Peptide.objects.for_species(self.species).for_wid(
            peptide_wid, create=True)

        peptide.parent = ms_job
        peptide.sequence = sequence
        peptide.length = len(sequence)
        peptide.proteotypic = item["Proteotypic"]
        peptide.charge = item["Charge"]
        peptide.mass = item["m/z"]
        peptide.zscore = item["zscore"]
        peptide.retention_time = item["RT"]
        peptide.species = self.species
        peptide.save(self.detail)

        # Many-to-many links can only be added once the peptide is saved
        peptide.type.add(target_type)

        for protein_name in matched_proteins.split(","):
            text_entry = cmodels.EntryBasicTextData.objects.get_or_create_with_revision(
                self.detail, value=protein_name.strip())
            peptide.proteins.add(text_entry)
def handle(self, *args, **options):
    """Validate the shared command options and dispatch to ``handle_command``.

    Checks that ``wid`` and ``reason`` are present and that ``wid`` is
    already a valid slug, looks up the Species (or, when
    ``self.verify_species_exists`` is false, instantiates an unsaved one),
    saves a RevisionDetail for the acting user and finally calls
    ``self.handle_command(species, revdetail, *args, **options)``.

    Raises:
        CommandError: if ``wid`` or ``reason`` is missing, if slugifying
            ``wid`` would change it, or if the species does not exist while
            ``self.verify_species_exists`` is true.
    """
    if not options["wid"]:
        raise CommandError("wid argument is mandatory")
    if not options["reason"]:
        raise CommandError("reason is mandatory")

    wid = slugify(options["wid"])
    reason = options["reason"]

    if options["wid"] != wid:
        raise CommandError(
            "Wid {} contained invalid characters. Only letters, numbers and _ are allowed".format(
                options["wid"]))

    try:
        species_obj = Species.objects.get(wid=wid)
    except Species.DoesNotExist:
        # Only a missing row means "species not found"; any other failure
        # (database errors, duplicate rows, interrupts) must propagate
        # instead of being mistaken for a new species.
        if self.verify_species_exists:
            raise CommandError("Species {} not found".format(wid))
        species_obj = Species(wid=wid)

    # Fall back to the built-in default user name when none was passed;
    # the value is written back because options is forwarded below.
    if not options["user"]:
        options["user"] = "******"

    revdetail = RevisionDetail()
    revdetail.user = UserProfile.objects.get(user__username=options["user"])
    revdetail.reason = reason
    revdetail.save()

    self.handle_command(species_obj, revdetail, *args, **options)
def try_slugify(self, name, not_slug):
    """Return the slug of ``not_slug``, insisting that it is already a slug.

    ``name`` only labels the value in the error message.

    Raises:
        ValueError: if slugifying ``not_slug`` would alter it.
    """
    slug = slugify(not_slug)
    if slug == not_slug:
        return slug
    raise ValueError(
        "{} {} contained invalid characters. Only letters, numbers and _ are allowed".format(
            name, not_slug))
def _import_protein_summary(self, ms_job):
    """Import every row of ``self.protein_summary`` as a MassSpectrometryProtein.

    For each row the protein is fetched or created by its slugified ID,
    filled from the row's columns, attached to ``ms_job`` and saved.  After
    saving, its UniProt accession, its GO terms and its ambiguous / sub
    proteins are linked to it.

    Args:
        ms_job: Entry assigned as ``parent`` of every imported protein.

    Raises:
        ValueError: if a non-blank GO term entry is not of the form
            ``"<name> (<source>:<identifier>)"``.
    """
    # GO terms look like "<name> (<source>:<identifier>)"
    go_term_pattern = re.compile(r"^(.*) \((.*):(.*)\)$")

    # Import the protein data from the file
    for row in self.protein_summary:
        if hasattr(self, "notify_progress"):
            outstr = "EasyProt: Importing Protein Summary ({})".format(
                row["ID"])
            self.current += 1
            self.notify_progress(current=self.current,
                                 total=self.total,
                                 message=outstr)

        protein = cmodels.MassSpectrometryProtein.objects.for_species(
            self.species).for_wid(slugify(row["ID"]), create=True)
        # :type: cmodels.MassSpectrometryProtein

        protein.comments = row["Description"]
        uniprot = cmodels.CrossReference.objects.get_or_create_with_revision(
            self.detail, source="UniProt", xid=row["AC"])
        protein.score = row["Protein Score"]
        protein.coverage = row["% Coverage"]
        protein.sequence = row["Protein Seq"]
        protein.length = len(protein.sequence)
        # #PSMs -> Peptide spectrum match -> Entries in Details
        # #Peptides -> Count number of different sequences(?) in Details
        protein.pi = row["Protein PI"]
        protein.mass = row["Protein Mass (Da)"]
        protein.parent = ms_job
        protein.species = self.species
        protein.save(self.detail)

        protein.cross_references.add(uniprot)

        go_term_row = row["GO terms"]
        if go_term_row:
            for raw_term in go_term_row.split(";"):
                term = raw_term.strip()
                if not term:
                    # Trailing separators or "; " padding leave blank
                    # entries; they carry no information.
                    continue
                parsed = go_term_pattern.match(term)
                if parsed is None:
                    # Fail with a readable message instead of the
                    # AttributeError that None.groups() would raise.
                    raise ValueError(
                        "Malformed GO term {!r} for protein {}".format(
                            term, row["ID"]))
                # No field for name :/
                _name, typ, identifier = parsed.groups()
                go = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source=typ, xid=identifier)
                protein.cross_references.add(go)

        # NOTE(review): the two count columns are compared against 0, which
        # assumes they were converted to numbers upstream - TODO confirm.
        if row["#Ambiguous Prots"] > 0:
            for amb_protein in row["Ambiguous Prots"].split(","):
                prot = cmodels.EntryBasicTextData.objects.get_or_create_with_revision(
                    self.detail, value=amb_protein.strip())
                protein.ambiguous.add(prot)

        if row["#Sub-Prots"] > 0:
            for sub_protein in row["Sub-Prots"].split(","):
                prot = cmodels.EntryBasicTextData.objects.get_or_create_with_revision(
                    self.detail, value=sub_protein.strip())
                protein.sub.add(prot)
def parse(self, handle):
    """Parse an InterProScan XML export read from ``handle``.

    For every ``protein`` element whose id (the text before the first "|")
    resolves to a ProteinMonomer of ``self.species``, the signatures of its
    matches are collected into:

    * ``self.protein_monomers_cf``: protein -> list of ChromosomeFeature
    * ``self.types``: feature wid -> title-cased tag of the match element
    * ``self.xrefs``: feature wid -> list of ``[id, db]`` pairs
    * ``self.feature_positions``: feature wid -> list of position dicts

    Proteins that are not found in the database are skipped.

    NOTE(review): ``self.xrefs`` and ``self.feature_positions`` are reset
    each time a signature wid is seen, so only the data of its last match
    is retained - confirm that this is intended.
    """
    if hasattr(self, "notify_progress"):
        self.notify_progress(
            current=0, total=1, message="Parsing InterProScan file...")

    # Remove xmlns namespace, makes working with ElementTree more complicated
    document = re.sub(' xmlns="[^"]+"', '', handle.read(), count=1)
    root = ET.fromstring(document)

    proteins = root.findall("protein")
    protein_count = len(proteins)

    for number, protein in enumerate(proteins, 1):
        protein_wid = protein.find("xref").get("id").split("|", 2)[0]

        self.total = protein_count

        if hasattr(self, "notify_progress"):
            self.notify_progress(
                current=number,
                total=self.total,
                message="Parsing features of {} ({}/{})".format(
                    protein_wid, number, self.total))

        try:
            monomers = cmodels.ProteinMonomer.objects.for_species(self.species)
            protein_item = monomers.for_wid(protein_wid)
            self.protein_monomers_cf[protein_item] = []
        except ObjectDoesNotExist:
            # ToDo: Error reporting?
            continue

        for matches in protein.findall("matches"):
            for match in matches:
                signature = match.find("signature")
                feature_wid = slugify(signature.get("ac"))

                feature = cmodels.ChromosomeFeature(wid=feature_wid)
                feature.name = signature.get("name") or ""
                feature.comments = signature.get("desc") or ""

                self.xrefs[feature_wid] = []
                self.protein_monomers_cf[protein_item].append(feature)
                self.types[feature_wid] = match.tag.title()

                for entry in signature.findall("entry"):
                    for entry_xref in entry:
                        self.xrefs[feature_wid].append(
                            [entry_xref.get("id"), entry_xref.get("db")])

                self.feature_positions[feature_wid] = []

                for location in match.find("locations"):
                    low = int(location.get("start"))
                    high = int(location.get("end"))

                    # A start behind the end marks the reverse direction
                    if low > high:
                        low, high = high, low
                        direction = "r"
                    else:
                        direction = "f"

                    span = high - low

                    self.feature_positions[feature_wid].append({
                        "chromosome": protein_item.gene.chromosome_id,
                        "coordinate": low + protein_item.gene.coordinate,
                        "length": span,
                        "direction": direction,
                    })
def _import_jobs_params(self):
    """Fetch or create the MassSpectrometryJob named in the export parameters.

    The wid is the slugified first field of the first entry of
    ``self.export_parameters["jobs"]``; the job's name is set to that wid.

    Returns:
        The saved MassSpectrometryJob.
    """
    job_wid = slugify(self.export_parameters["jobs"][0][0])
    jobs = cmodels.MassSpectrometryJob.objects.for_species(self.species)

    ms_job = jobs.for_wid(job_wid, create=True)
    ms_job.name = ms_job.wid
    ms_job.species = self.species
    ms_job.save(self.detail)
    return ms_job
def parse(self, handle):
    """Read FASTA records from ``handle`` and collect them in ``self.data``.

    Each record's description is split by ``FastaFeature._parse_header``
    into wid, start, end and description; the wid is slugified and the two
    positions are converted to ``int`` before the entry is appended.
    """
    self.report_progress(current=0, total=1, message="Parsing FASTA file")

    for record in SeqIO.parse(handle, "fasta"):
        raw_wid, first, last, text = FastaFeature._parse_header(record.description)
        entry = {
            "wid": slugify(raw_wid),
            "start": int(first),
            "end": int(last),
            "description": text,
        }
        self.data.append(entry)
def parse(self, handle):
    """Read FASTA records from ``handle`` and collect them in ``self.data``.

    Each record's description is split by ``FastaFeature._parse_header``
    into wid, start, end and description; the wid is slugified and the two
    positions are converted to ``int`` before the entry is appended.
    """
    self.report_progress(current=0, total=1, message="Parsing FASTA file")

    for record in SeqIO.parse(handle, "fasta"):
        raw_wid, first, last, text = FastaFeature._parse_header(record.description)
        entry = {
            "wid": slugify(raw_wid),
            "start": int(first),
            "end": int(last),
            "description": text,
        }
        self.data.append(entry)
def _import_protein_summary(self, ms_job):
    """Import every row of ``self.protein_summary`` as a MassSpectrometryProtein.

    For each row the protein is fetched or created by its slugified ID,
    filled from the row's columns, attached to ``ms_job`` and saved.  After
    saving, its UniProt accession, its GO terms and its ambiguous / sub
    proteins are linked to it.

    Args:
        ms_job: Entry assigned as ``parent`` of every imported protein.

    Raises:
        ValueError: if a non-blank GO term entry is not of the form
            ``"<name> (<source>:<identifier>)"``.
    """
    # GO terms look like "<name> (<source>:<identifier>)"
    go_term_pattern = re.compile(r"^(.*) \((.*):(.*)\)$")

    # Import the protein data from the file
    for row in self.protein_summary:
        if hasattr(self, "notify_progress"):
            outstr = "EasyProt: Importing Protein Summary ({})".format(row["ID"])
            self.current += 1
            self.notify_progress(current=self.current, total=self.total, message=outstr)

        protein = cmodels.MassSpectrometryProtein.objects.for_species(
            self.species).for_wid(slugify(row["ID"]), create=True)
        # :type: cmodels.MassSpectrometryProtein

        protein.comments = row["Description"]
        uniprot = cmodels.CrossReference.objects.get_or_create_with_revision(
            self.detail, source="UniProt", xid=row["AC"])
        protein.score = row["Protein Score"]
        protein.coverage = row["% Coverage"]
        protein.sequence = row["Protein Seq"]
        protein.length = len(protein.sequence)
        # #PSMs -> Peptide spectrum match -> Entries in Details
        # #Peptides -> Count number of different sequences(?) in Details
        protein.pi = row["Protein PI"]
        protein.mass = row["Protein Mass (Da)"]
        protein.parent = ms_job
        protein.species = self.species
        protein.save(self.detail)

        protein.cross_references.add(uniprot)

        go_term_row = row["GO terms"]
        if go_term_row:
            for raw_term in go_term_row.split(";"):
                term = raw_term.strip()
                if not term:
                    # Trailing separators or "; " padding leave blank
                    # entries; they carry no information.
                    continue
                parsed = go_term_pattern.match(term)
                if parsed is None:
                    # Fail with a readable message instead of the
                    # AttributeError that None.groups() would raise.
                    raise ValueError(
                        "Malformed GO term {!r} for protein {}".format(term, row["ID"]))
                # No field for name :/
                _name, typ, identifier = parsed.groups()
                go = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source=typ, xid=identifier)
                protein.cross_references.add(go)

        # NOTE(review): the two count columns are compared against 0, which
        # assumes they were converted to numbers upstream - TODO confirm.
        if row["#Ambiguous Prots"] > 0:
            for amb_protein in row["Ambiguous Prots"].split(","):
                prot = cmodels.EntryBasicTextData.objects.get_or_create_with_revision(
                    self.detail, value=amb_protein.strip())
                protein.ambiguous.add(prot)

        if row["#Sub-Prots"] > 0:
            for sub_protein in row["Sub-Prots"].split(","):
                prot = cmodels.EntryBasicTextData.objects.get_or_create_with_revision(
                    self.detail, value=sub_protein.strip())
                protein.sub.add(prot)
def _import_target_peptides(self, ms_job):
    """Import every entry of ``self.target_peptides`` as a Peptide of ``ms_job``.

    A "Target-Peptide" Type is fetched or created first and added to each
    peptide.  Peptide wids are ``"<1-based index>-<slug of Matched Proteins>"``.
    Each comma-separated name in "Matched Proteins" is linked to the peptide
    as an EntryBasicTextData value.
    """
    target_type = cmodels.Type.objects.for_wid("Target-Peptide", create=True)
    target_type.species = self.species
    target_type.save(self.detail)

    for number, item in enumerate(self.target_peptides, 1):
        if hasattr(self, "notify_progress"):
            message = "EasyProt: Importing Target Peptide ({})".format(
                item["Matched Proteins"])
            self.current += 1
            self.notify_progress(
                current=self.current, total=self.total, message=message)

        matched_proteins = item["Matched Proteins"]
        sequence = item["Sequence"]

        peptide_wid = "{}-{}".format(number, slugify(matched_proteins))
        peptide = cmodels.Peptide.objects.for_species(self.species).for_wid(
            peptide_wid, create=True)

        peptide.parent = ms_job
        peptide.sequence = sequence
        peptide.length = len(sequence)
        peptide.proteotypic = item["Proteotypic"]
        peptide.charge = item["Charge"]
        peptide.mass = item["m/z"]
        peptide.zscore = item["zscore"]
        peptide.retention_time = item["RT"]
        peptide.species = self.species
        peptide.save(self.detail)

        # Many-to-many links can only be added once the peptide is saved
        peptide.type.add(target_type)

        for protein_name in matched_proteins.split(","):
            text_entry = cmodels.EntryBasicTextData.objects.get_or_create_with_revision(
                self.detail, value=protein_name.strip())
            peptide.proteins.add(text_entry)
def _import_jobs_params(self):
    """Fetch or create the MassSpectrometryJob named in the export parameters.

    The wid is the slugified first field of the first entry of
    ``self.export_parameters["jobs"]``; the job's name is set to that wid.

    Returns:
        The saved MassSpectrometryJob.
    """
    job_wid = slugify(self.export_parameters["jobs"][0][0])
    jobs = cmodels.MassSpectrometryJob.objects.for_species(self.species)

    ms_job = jobs.for_wid(job_wid, create=True)
    ms_job.name = ms_job.wid
    ms_job.species = self.species
    ms_job.save(self.detail)
    return ms_job
def parse(self, handle):
    """Parse an InterProScan XML export read from ``handle``.

    For every ``protein`` element whose id (the text before the first "|")
    resolves to a ProteinMonomer of ``self.species``, the signatures of its
    matches are collected into:

    * ``self.protein_monomers_cf``: protein -> list of ChromosomeFeature
    * ``self.types``: feature wid -> title-cased tag of the match element
    * ``self.xrefs``: feature wid -> list of ``[id, db]`` pairs
    * ``self.feature_positions``: feature wid -> list of position dicts

    Proteins that are not found in the database are skipped.

    NOTE(review): ``self.xrefs`` and ``self.feature_positions`` are reset
    each time a signature wid is seen, so only the data of its last match
    is retained - confirm that this is intended.
    """
    if hasattr(self, "notify_progress"):
        self.notify_progress(
            current=0, total=1, message="Parsing InterProScan file...")

    # Remove xmlns namespace, makes working with ElementTree more complicated
    document = re.sub(' xmlns="[^"]+"', '', handle.read(), count=1)
    root = ET.fromstring(document)

    proteins = root.findall("protein")
    protein_count = len(proteins)

    for number, protein in enumerate(proteins, 1):
        protein_wid = protein.find("xref").get("id").split("|", 2)[0]

        self.total = protein_count

        if hasattr(self, "notify_progress"):
            self.notify_progress(
                current=number,
                total=self.total,
                message="Parsing features of {} ({}/{})".format(
                    protein_wid, number, self.total))

        try:
            monomers = cmodels.ProteinMonomer.objects.for_species(self.species)
            protein_item = monomers.for_wid(protein_wid)
            self.protein_monomers_cf[protein_item] = []
        except ObjectDoesNotExist:
            # ToDo: Error reporting?
            continue

        for matches in protein.findall("matches"):
            for match in matches:
                signature = match.find("signature")
                feature_wid = slugify(signature.get("ac"))

                feature = cmodels.ChromosomeFeature(wid=feature_wid)
                feature.name = signature.get("name") or ""
                feature.comments = signature.get("desc") or ""

                self.xrefs[feature_wid] = []
                self.protein_monomers_cf[protein_item].append(feature)
                self.types[feature_wid] = match.tag.title()

                for entry in signature.findall("entry"):
                    for entry_xref in entry:
                        self.xrefs[feature_wid].append(
                            [entry_xref.get("id"), entry_xref.get("db")])

                self.feature_positions[feature_wid] = []

                for location in match.find("locations"):
                    low = int(location.get("start"))
                    high = int(location.get("end"))

                    # A start behind the end marks the reverse direction
                    if low > high:
                        low, high = high, low
                        direction = "r"
                    else:
                        direction = "f"

                    span = high - low

                    self.feature_positions[feature_wid].append({
                        "chromosome": protein_item.gene.chromosome_id,
                        "coordinate": low + protein_item.gene.coordinate,
                        "length": span,
                        "direction": direction,
                    })
def apply(self):
    """Write the parsed GenBank record into the database.

    Steps, in order:

    1. Save the revision detail and the species.
    2. Fetch or create the Chromosome (or Plasmid when
       ``self.is_chromosome`` is false) and store name, sequence and length.
    3. Attach the record's cross references, publication references and
       the "gi" annotation to the chromosome.
    4. For every CDS / ncRNA / rRNA / tmRNA / tRNA feature whose locus_tag
       also has a gene feature, create or update the Gene, its cross
       references, synonyms, ProteinMonomer (when a protein_id is present)
       and Type.
    5. Call ``cmodels.Pathway.add_kegg_pathway``.

    Raises:
        ValueError: if a locus_tag occurs twice among the gene features or
            twice among the accepted CDS / RNA features.
    """

    def add_xref(entry, raw):
        """Attach a "source:xid" cross reference to entry; skip text without ':'."""
        if ":" not in raw:
            return
        # Split on the first colon only: the identifier part may itself
        # contain colons, which used to abort the import with ValueError.
        source, xid = raw.split(":", 1)
        entry.cross_references.add(
            cmodels.CrossReference.objects.get_or_create_with_revision(
                self.detail, source=source, xid=xid))

    self.detail.save()
    self.species.save(self.detail)

    obj = cmodels.Chromosome if self.is_chromosome else cmodels.Plasmid
    chromosome = obj.objects.for_species(self.species).for_wid(
        self.chromosome, create=True)
    chromosome.name = self.name
    # Cast needed, otherwise revision-compare fails!
    chromosome.sequence = str(self.record.seq)
    chromosome.length = len(self.record.seq)
    chromosome.species = self.species
    chromosome.save(self.detail)

    if self.record.dbxrefs:
        for xref in self.record.dbxrefs:
            # BioPython doesnt always properly split the db xrefs
            for part in xref.split(" "):
                add_xref(chromosome, part)

    if "references" in self.annotation:
        for ref in self.annotation["references"]:
            # calculate the wid
            if ref.pubmed_id:
                wid = "PUB_" + ref.pubmed_id
                name = "Pubmed #" + ref.pubmed_id
            elif ref.medline_id:
                wid = "MED_" + ref.medline_id
                name = "Pubmed #" + ref.medline_id
            else:
                # No external id: reuse a stored reference with the same
                # authors/title/journal, otherwise take the next REF_ number.
                publication = cmodels.PublicationReference.objects.filter(
                    authors__exact=ref.authors,
                    title__exact=ref.title,
                    publication__exact=ref.journal)
                if publication.exists():
                    wid = publication[0].wid
                    name = publication[0].name
                else:
                    refs = cmodels.PublicationReference.objects.filter(
                        wid__startswith="REF_")
                    if refs.exists():
                        last = refs.reverse()[0]
                        next_id = int(last.wid[4:], 10) + 1
                        wid = "REF_%04d" % next_id
                        name = "Reference #%04d" % next_id
                    else:
                        wid = "REF_0001"
                        name = "Reference #0001"

            pubref = cmodels.PublicationReference.objects.for_wid(
                slugify(wid), create=True)
            pubref.name = name
            pubref.authors = ref.authors
            pubref.title = ref.title
            pubref.publication = ref.journal
            pubref.species = self.species
            pubref.save(self.detail)

            if ref.pubmed_id:
                xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source="PUBMED", xid=ref.pubmed_id)
                pubref.cross_references.add(xref)

            if ref.medline_id:
                xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source="MEDLINE", xid=ref.medline_id)
                pubref.cross_references.add(xref)

            chromosome.publication_references.add(pubref)

    if "gi" in self.annotation:
        xref = cmodels.CrossReference.objects.get_or_create_with_revision(
            self.detail, xid=self.annotation["gi"], source="GI")
        chromosome.cross_references.add(xref)

    features = self.record.features

    # Cross references of a leading "source" feature belong to the chromosome
    if len(features) > 0 and features[0].type == "source":
        if "db_xref" in features[0].qualifiers:
            for xref in features[0].qualifiers["db_xref"]:
                add_xref(chromosome, xref)

    gene_features = [f for f in features if f.type == "gene"]
    cds_features = [
        f for f in features
        if f.type in ("CDS", "ncRNA", "rRNA", "tmRNA", "tRNA")]

    gene_map = {}
    for g in gene_features:
        if "locus_tag" not in g.qualifiers:
            self.stderr.write("WARN: " + str(g) + " without locus")
            continue
        loci = g.qualifiers["locus_tag"][0]
        if loci in gene_map:
            raise ValueError("locus_tag " + loci + " appeared twice")
        gene_map[loci] = g

    cds_map = {}
    for c in cds_features:
        if "locus_tag" not in c.qualifiers:
            self.stderr.write("WARN: " + str(c) + " without locus")
            continue
        loci = c.qualifiers["locus_tag"][0]
        if loci in cds_map:
            raise ValueError("locus_tag " + loci + " appeared twice")
        # Only features that also have a gene feature are imported
        if loci in gene_map:
            cds_map[loci] = c

    sorted_cds_values = sorted(cds_map.values(),
                               key=lambda x: x.qualifiers["locus_tag"])
    total_genes = len(cds_map)

    for i, v in enumerate(sorted_cds_values):
        qualifiers = v.qualifiers

        # The first transl_table seen becomes the species' genetic code
        if not self.species.genetic_code:
            if "transl_table" in qualifiers:
                self.species.genetic_code = qualifiers["transl_table"][0]
                self.species.save(self.detail)

        g = cmodels.Gene.objects.for_species(self.species).for_wid(
            slugify(qualifiers["locus_tag"][0]), create=True)

        if hasattr(self, "notify_progress"):
            outstr = "Importing Gene %s (%d/%d)" % (g.wid, i + 1, total_genes)
            self.notify_progress(current=i + 1, total=total_genes,
                                 message=outstr)

        g.chromosome = chromosome

        if "gene" in qualifiers:
            g.name = qualifiers["gene"][0]
            g.symbol = qualifiers["gene"][0]

        g.direction = 'f' if v.location.strand == 1 else 'r'

        # __len__ because len() fails for numbers < 0
        # Joins output the wrong length
        if v.location.__len__() < 0:
            g.length = v.location.__len__() + len(self.record.seq)
        else:
            g.length = len(v.location)

        # This used to read "start + 1 if 'f' else start"; the literal 'f'
        # is always truthy, so start + 1 was stored for both strands.  That
        # effective behaviour is kept.
        # NOTE(review): confirm reverse-strand genes need no other handling.
        g.coordinate = v.location.start + 1

        if "note" in qualifiers:
            g.comments = "\n".join(qualifiers["note"])

        g.species = self.species
        g.save(self.detail)

        if "db_xref" in qualifiers:
            for xref in qualifiers["db_xref"]:
                add_xref(g, xref)

        if "EC_number" in qualifiers:
            for ec in qualifiers["EC_number"]:
                xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source="EC", xid=ec)
                g.cross_references.add(xref)

        if "gene_synonym" in qualifiers:
            for synonym in qualifiers["gene_synonym"]:
                # Inconsistency: Multiple synonyms appear in one entry,
                # why don't they split them like for all other items?
                for syn in synonym.split(";"):
                    obj = cmodels.Synonym.objects.get_or_create_with_revision(
                        self.detail, name=syn.strip())
                    g.synonyms.add(obj)

        if "protein_id" in qualifiers:
            protxref = qualifiers["protein_id"][0]
            wid = slugify(g.wid + "_Monomer")
            protein = cmodels.ProteinMonomer.objects.for_species(
                self.species).for_wid(wid, create=True)

            if "product" in qualifiers:
                protein.name = qualifiers["product"][0]

            xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                self.detail, source="RefSeq", xid=protxref)

            protein.gene = g
            protein.species = self.species
            protein.save(self.detail)
            protein.cross_references.add(xref)

        # CDS features are stored under the type name "mRNA"
        if v.type == "CDS":
            v.type = "mRNA"

        t = cmodels.Type.objects.for_wid(wid=slugify(v.type), create=True)
        t.name = v.type
        t.species = self.species
        t.save(self.detail)

        g.type.add(t)

    if hasattr(self, "notify_progress"):
        outstr = "Assigning KEGG pathways"
        self.notify_progress(current=total_genes, total=total_genes,
                             message=outstr)

    cmodels.Pathway.add_kegg_pathway(self.species, self.detail)
def apply(self):
    """Write the parsed SBML compartments, species and reactions to the database.

    Compartments become Compartment entries and SBML species become
    Metabolite entries (species whose compartment is unknown to the model
    are skipped).  A reaction is imported only when every reactant and
    product refers to a species known to the model; its reactants are
    stored with negated and its products with positive stoichiometry.

    The reactant and product lists are materialised as real lists because
    they are traversed twice (validation, then import); a one-shot ``map``
    iterator would be exhausted after validation under Python 3 and no
    stoichiometry would be stored.
    """

    def new_participant(species_ref, coefficient):
        """Build an unsaved stoichiometry participant for one species reference."""
        species_id = species_ref.getSpecies()
        participant = cmodels.ReactionStoichiometryParticipant()
        participant.molecule = cmodels.Metabolite.objects.for_species(
            self.species).for_wid(slugify(species_id))
        participant.coefficient = coefficient
        participant.compartment = cmodels.Compartment.objects.for_species(
            self.species).for_wid(
                slugify(self.model.getSpecies(species_id).getCompartment()))
        return participant

    self.detail.save()

    compartment_count = len(self.compartments)
    species_count = len(self.sbml_species)
    total = compartment_count + species_count + len(self.reactions)

    # Compartment importer
    for i, compartment in enumerate(self.compartments):
        wid = slugify(compartment.getId())
        name = compartment.getName() or wid

        if hasattr(self, "notify_progress"):
            out_str = "Importing Compartment %s (%d/%d)" % (wid, i + 1, total)
            self.notify_progress(current=i + 1, total=total, message=out_str)

        # TODO: compartment.getOutside() not implemented
        cobj = cmodels.Compartment.objects.for_species(self.species).for_wid(
            wid, create=True)
        cobj.name = name
        cobj.species = self.species
        cobj.save(self.detail)

    # Species (= Metabolites) importer
    for i, specie in enumerate(self.sbml_species):
        if not self.model.getCompartment(specie.getCompartment()):
            # Species in a compartment unknown to the model are skipped
            continue

        wid = slugify(specie.getId())
        name = specie.getName() or wid

        if hasattr(self, "notify_progress"):
            current = compartment_count + i + 1
            out_str = "Importing Metabolite %s (%d/%d)" % (wid, current, total)
            self.notify_progress(current=current, total=total, message=out_str)

        # TODO: specie.getBoundaryCondition() not implemented
        sobj = cmodels.Metabolite.objects.for_species(self.species).for_wid(
            wid, create=True)
        sobj.name = name
        sobj.charge = 0  # TODO
        sobj.is_hydrophobic = False  # TODO
        sobj.species = self.species
        sobj.save(self.detail)

    # Reaction importer
    for i, reaction in enumerate(self.reactions):
        wid = slugify(reaction.getId())
        name = reaction.getName() or wid

        if hasattr(self, "notify_progress"):
            current = compartment_count + species_count + i + 1
            out_str = "Importing Reaction %s (%d/%d)" % (wid, current, total)
            self.notify_progress(current=current, total=total, message=out_str)

        reactants = [
            reaction.getReactant(idx)
            for idx in range(len(reaction.getListOfReactants()))]
        products = [
            reaction.getProduct(idx)
            for idx in range(len(reaction.getListOfProducts()))]

        # Validation: every participant must refer to a species of the model
        valid = (
            all(self.model.getSpecies(r.getSpecies()) for r in reactants)
            and all(self.model.getSpecies(p.getSpecies()) for p in products))
        if not valid:
            continue

        reaction_obj = cmodels.Reaction.objects.for_species(
            self.species).for_wid(wid, create=True)
        reaction_obj.name = name
        reaction_obj.direction = 'r' if reaction.getReversible() else 'f'
        reaction_obj.is_spontaneous = False  # TODO
        reaction_obj.species = self.species
        reaction_obj.save(self.detail)

        # Reactants are consumed, hence the negative coefficient
        for reactant in reactants:
            participant_obj = new_participant(
                reactant, -reactant.getStoichiometry())
            participant_obj.save(self.detail)
            reaction_obj.stoichiometry.add(participant_obj)

        for product in products:
            participant_obj = new_participant(
                product, product.getStoichiometry())
            # NOTE(review): only products get .detail assigned before
            # saving; kept as found - confirm whether reactants need it too.
            participant_obj.detail = self.detail
            participant_obj.save(self.detail)
            reaction_obj.stoichiometry.add(participant_obj)
def _import_protein_details(self, ms_job):
    """Create one MassSpectrometryProteinDetail per row of ``self.protein_details``.

    Every row is attached to the existing MassSpectrometryProtein whose wid
    is the slugified ``row["ID"]``.  The ``"Position (to mature prot.)"``
    column is read as ``"<start>-<end>"``; the stored length is
    ``end - start``.  ``ms_job`` is not used by this method.
    """
    for row in self.protein_details:
        if hasattr(self, "notify_progress"):
            message = "EasyProt: Importing Protein Details ({})".format(row["ID"])
            self.current += 1
            self.notify_progress(
                current=self.current, total=self.total, message=message)

        proteins = cmodels.MassSpectrometryProtein.objects.for_species(self.species)
        ref_protein = proteins.for_wid(slugify(row["ID"]))

        # The column holds two integers separated by a dash
        start, end = (
            int(bound) for bound in row["Position (to mature prot.)"].split("-"))

        fields = {
            "protein": ref_protein,
            "sequence": row["Sequence"],
            "sequence_ptm": row["Sequence + PTMs"],
            "coordinate": start,
            "length": end - start,
            "proteotypic": row["Proteotypic"],
            "zscore": row["z-score"],
            "delta_mass": row["Delta Mass (ppm)"],
            "mass": row["Experimental Mass (m/z)"],
            "charge": row["Charge"],
            "retention_time": row["Retention Time (min)"],
            "theoretical_mass": row["Theoretical Mass (Da)"],
            "missed_cleavages": row["Missed Cleavages"],
        }
        cmodels.MassSpectrometryProteinDetail.objects.get_or_create_with_revision(
            self.detail, **fields)
def apply(self):
    """Write the parsed GenBank record into the database.

    Steps, in order:

    1. Save the revision detail and the species.
    2. Fetch or create the Chromosome (or Plasmid when
       ``self.is_chromosome`` is false) and store name, sequence and length.
    3. Attach the record's cross references, publication references and
       the "gi" annotation to the chromosome.
    4. For every CDS / ncRNA / rRNA / tmRNA / tRNA feature whose locus_tag
       also has a gene feature, create or update the Gene, its cross
       references, synonyms, ProteinMonomer (when a protein_id is present)
       and Type.
    5. Call ``cmodels.Pathway.add_kegg_pathway``.

    Raises:
        ValueError: if a locus_tag occurs twice among the gene features or
            twice among the accepted CDS / RNA features.
    """

    def add_xref(entry, raw):
        """Attach a "source:xid" cross reference to entry; skip text without ':'."""
        if ":" not in raw:
            return
        # Split on the first colon only: the identifier part may itself
        # contain colons, which used to abort the import with ValueError.
        source, xid = raw.split(":", 1)
        entry.cross_references.add(
            cmodels.CrossReference.objects.get_or_create_with_revision(
                self.detail, source=source, xid=xid
            )
        )

    self.detail.save()
    self.species.save(self.detail)

    obj = cmodels.Chromosome if self.is_chromosome else cmodels.Plasmid
    chromosome = obj.objects.for_species(self.species).for_wid(self.chromosome, create=True)
    chromosome.name = self.name
    # Cast needed, otherwise revision-compare fails!
    chromosome.sequence = str(self.record.seq)
    chromosome.length = len(self.record.seq)
    chromosome.species = self.species
    chromosome.save(self.detail)

    if self.record.dbxrefs:
        for xref in self.record.dbxrefs:
            # BioPython doesnt always properly split the db xrefs
            for part in xref.split(" "):
                add_xref(chromosome, part)

    if "references" in self.annotation:
        for ref in self.annotation["references"]:
            # calculate the wid
            if ref.pubmed_id:
                wid = "PUB_" + ref.pubmed_id
                name = "Pubmed #" + ref.pubmed_id
            elif ref.medline_id:
                wid = "MED_" + ref.medline_id
                name = "Pubmed #" + ref.medline_id
            else:
                # No external id: reuse a stored reference with the same
                # authors/title/journal, otherwise take the next REF_ number.
                publication = cmodels.PublicationReference.objects.filter(
                    authors__exact=ref.authors,
                    title__exact=ref.title,
                    publication__exact=ref.journal
                )
                if publication.exists():
                    wid = publication[0].wid
                    name = publication[0].name
                else:
                    refs = cmodels.PublicationReference.objects.filter(wid__startswith="REF_")
                    if refs.exists():
                        last = refs.reverse()[0]
                        next_id = int(last.wid[4:], 10) + 1
                        wid = "REF_%04d" % next_id
                        name = "Reference #%04d" % next_id
                    else:
                        wid = "REF_0001"
                        name = "Reference #0001"

            pubref = cmodels.PublicationReference.objects.for_wid(slugify(wid), create=True)
            pubref.name = name
            pubref.authors = ref.authors
            pubref.title = ref.title
            pubref.publication = ref.journal
            pubref.species = self.species
            pubref.save(self.detail)

            if ref.pubmed_id:
                xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source="PUBMED", xid=ref.pubmed_id
                )
                pubref.cross_references.add(xref)

            if ref.medline_id:
                xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source="MEDLINE", xid=ref.medline_id
                )
                pubref.cross_references.add(xref)

            chromosome.publication_references.add(pubref)

    if "gi" in self.annotation:
        xref = cmodels.CrossReference.objects.get_or_create_with_revision(
            self.detail, xid=self.annotation["gi"], source="GI"
        )
        chromosome.cross_references.add(xref)

    features = self.record.features

    # Cross references of a leading "source" feature belong to the chromosome
    if len(features) > 0 and features[0].type == "source":
        if "db_xref" in features[0].qualifiers:
            for xref in features[0].qualifiers["db_xref"]:
                add_xref(chromosome, xref)

    gene_features = [f for f in features if f.type == "gene"]
    cds_features = [f for f in features if f.type in ("CDS", "ncRNA", "rRNA", "tmRNA", "tRNA")]

    gene_map = {}
    for g in gene_features:
        if "locus_tag" not in g.qualifiers:
            self.stderr.write("WARN: " + str(g) + " without locus")
            continue
        loci = g.qualifiers["locus_tag"][0]
        if loci in gene_map:
            raise ValueError("locus_tag " + loci + " appeared twice")
        gene_map[loci] = g

    cds_map = {}
    for c in cds_features:
        if "locus_tag" not in c.qualifiers:
            self.stderr.write("WARN: " + str(c) + " without locus")
            continue
        loci = c.qualifiers["locus_tag"][0]
        if loci in cds_map:
            raise ValueError("locus_tag " + loci + " appeared twice")
        # Only features that also have a gene feature are imported
        if loci in gene_map:
            cds_map[loci] = c

    sorted_cds_values = sorted(cds_map.values(), key=lambda x: x.qualifiers["locus_tag"])
    total_genes = len(cds_map)

    for i, v in enumerate(sorted_cds_values):
        qualifiers = v.qualifiers

        # The first transl_table seen becomes the species' genetic code
        if not self.species.genetic_code:
            if "transl_table" in qualifiers:
                self.species.genetic_code = qualifiers["transl_table"][0]
                self.species.save(self.detail)

        g = cmodels.Gene.objects.for_species(self.species).for_wid(
            slugify(qualifiers["locus_tag"][0]), create=True)

        if hasattr(self, "notify_progress"):
            outstr = "Importing Gene %s (%d/%d)" % (g.wid, i + 1, total_genes)
            self.notify_progress(current=i + 1, total=total_genes, message=outstr)

        g.chromosome = chromosome

        if "gene" in qualifiers:
            g.name = qualifiers["gene"][0]
            g.symbol = qualifiers["gene"][0]

        g.direction = "f" if v.location.strand == 1 else "r"

        # __len__ because len() fails for numbers < 0
        # Joins output the wrong length
        if v.location.__len__() < 0:
            g.length = v.location.__len__() + len(self.record.seq)
        else:
            g.length = len(v.location)

        # This used to read 'start + 1 if "f" else start'; the literal "f"
        # is always truthy, so start + 1 was stored for both strands.  That
        # effective behaviour is kept.
        # NOTE(review): confirm reverse-strand genes need no other handling.
        g.coordinate = v.location.start + 1

        if "note" in qualifiers:
            g.comments = "\n".join(qualifiers["note"])

        g.species = self.species
        g.save(self.detail)

        if "db_xref" in qualifiers:
            for xref in qualifiers["db_xref"]:
                add_xref(g, xref)

        if "EC_number" in qualifiers:
            for ec in qualifiers["EC_number"]:
                xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                    self.detail, source="EC", xid=ec)
                g.cross_references.add(xref)

        if "gene_synonym" in qualifiers:
            for synonym in qualifiers["gene_synonym"]:
                # Inconsistency: Multiple synonyms appear in one entry,
                # why don't they split them like for all other items?
                for syn in synonym.split(";"):
                    obj = cmodels.Synonym.objects.get_or_create_with_revision(
                        self.detail, name=syn.strip())
                    g.synonyms.add(obj)

        if "protein_id" in qualifiers:
            protxref = qualifiers["protein_id"][0]
            wid = slugify(g.wid + "_Monomer")
            protein = cmodels.ProteinMonomer.objects.for_species(self.species).for_wid(
                wid, create=True)

            if "product" in qualifiers:
                protein.name = qualifiers["product"][0]

            xref = cmodels.CrossReference.objects.get_or_create_with_revision(
                self.detail, source="RefSeq", xid=protxref
            )

            protein.gene = g
            protein.species = self.species
            protein.save(self.detail)
            protein.cross_references.add(xref)

        # CDS features are stored under the type name "mRNA"
        if v.type == "CDS":
            v.type = "mRNA"

        t = cmodels.Type.objects.for_wid(wid=slugify(v.type), create=True)
        t.name = v.type
        t.species = self.species
        t.save(self.detail)

        g.type.add(t)

    if hasattr(self, "notify_progress"):
        outstr = "Assigning KEGG pathways"
        self.notify_progress(current=total_genes, total=total_genes, message=outstr)

    cmodels.Pathway.add_kegg_pathway(self.species, self.detail)