def map_gene_and_cds_to_protein(protein):
    """
    Link a Protein to the gene(s) that encode it and to its CDS.

    For every locus tag in ``protein.parent`` a ``Gene`` is looked up
    (falling back to ``PseudoGene``); when one is found the ``encodes`` /
    ``encoded_by`` relationships are added and both objects are pushed.

    The stripped ``protein.uniquename`` is then passed to
    ``eu_mapping(..., 'ENSEMBLGENOME_TRS_ID')``. When that yields at least
    one identifier, the first one is used to select a ``CDS`` and the
    ``derives_from`` / ``derived`` relationships are added and pushed.

    :param protein: Protein graph object; a falsy value is ignored.
    :return: None
    """
    if not protein:
        # Nothing to map; also avoids dereferencing ``None`` further down.
        return

    for tag in protein.parent or ():
        gene = Gene.select(graph, tag).first()
        if not gene:
            gene = PseudoGene.select(graph, tag).first()
        if gene:
            # Gene-[encodes]->Protein and the reverse relationship
            gene.encodes.add(protein)
            graph.push(gene)
            protein.encoded_by.add(gene)
            graph.push(protein)

    protein_entry = str(protein.uniquename).strip()
    ens_id = eu_mapping(protein_entry, 'ENSEMBLGENOME_TRS_ID')
    if ens_id is None or len(ens_id) == 0:
        # No identifier came back; indexing an empty result would raise.
        return

    cds = CDS.select(graph, ens_id[0]).first()
    if cds:
        # Protein-[derives_from]->CDS
        protein.derives_from.add(cds)
        graph.push(protein)
        cds.derived.add(protein)
        graph.push(cds)
def map_srna_to_mrna(text_file):
    """
    Map sRNA to the mRNA they regulate

    Reads a tab-separated file and skips its first line (presumably a
    header -- confirm against the input file). For every remaining row,
    column 0 is used as the sRNA name and column 7 as the mRNA name(s).
    The sRNA is matched case-insensitively against ``NCRna.name``; each
    mRNA name (split on "-" when present, otherwise on whitespace) is
    matched against ``Gene.uniquename`` and, when found, added to
    ``ncrna.regulates_gene``.

    :param text_file: path of the tab-separated file
    :return: None
    """
    sys.stdout.write("\nAdding sRNA data...")
    with open(text_file) as handle:
        # The default keeps an empty file from raising StopIteration.
        next(handle, None)
        for line in handle:
            tab_split = line.split('\t')
            if len(tab_split) < 8:
                # Blank or truncated row: column 7 does not exist.
                continue
            srna_name = tab_split[0].strip()
            mrna_name = tab_split[7]
            if not srna_name:
                # An empty name would make the pattern '.*.*', which
                # matches an arbitrary NCRna.
                continue
            # NOTE(review): the name is interpolated into the where-clause
            # unescaped; quotes or regex metacharacters in the file would
            # change the query.
            ncrna = NCRna.select(graph).where(
                "_.name=~'(?i).*{}.*'".format(srna_name.lower())).first()
            if not ncrna:
                continue
            # split(None) is whitespace splitting, as in the plain split().
            separator = "-" if "-" in mrna_name else None
            for name in mrna_name.split(separator):
                name = name.strip()
                if not name:
                    continue
                gene = Gene.select(graph).where(
                    "_.uniquename=~'(?i).*{}.*'".format(
                        name.lower())).first()
                if gene:
                    ncrna.regulates_gene.add(gene)
                    graph.push(ncrna)
def _rvbd_to_rv_locus_tag(g_id):
    """Rewrite a leading 'RVBD_' (or bare 'RV') prefix of g_id to 'Rv'."""
    # str.strip("RVBD_") removes any of those characters from BOTH ends,
    # so a tag such as 'RVBD_3224B' would lose its trailing 'B'. Only the
    # prefix is removed here.
    for prefix in ("RVBD_", "RV"):
        if g_id.startswith(prefix):
            return "Rv" + g_id[len(prefix):]
    return g_id


def map_pathway_to_proteins(pathway_genes, path):
    """
    Link the proteins encoded by the given genes to a pathway.

    Each identifier is normalised to the 'Rv...' form, the matching
    ``Gene`` is selected, and every protein in ``gene.encodes`` is linked
    to ``path`` in both directions (``protein.pathway`` / ``path.protein``)
    and pushed.

    :param pathway_genes: iterable of gene identifier strings
    :param path: pathway graph object to link the proteins to
    :return: None
    """
    for g_id in pathway_genes:
        gene = Gene.select(graph, _rvbd_to_rv_locus_tag(g_id)).first()
        if not gene:
            continue
        for protein in gene.encodes:
            protein.pathway.add(path)
            graph.push(protein)
            path.protein.add(protein)
            graph.push(path)
def map_gene_to_orthologs(locus_tags):
    """
    Connect genes to their orthologs.

    Every tag is first resolved to a ``Gene``. For tags that resolve and
    start with 'Rv', ``fetch_ortholog`` is asked for the ortholog; when
    that ortholog is itself a ``Gene`` in the graph, the two genes are
    linked through ``orthologous_to`` / ``orthologous_to_`` and pushed.

    :param locus_tags: iterable of iterables of locus tags
    :return: None
    """
    sys.stdout.write("\nMapping Orthologs...\n")
    flattened = (tag for tags in locus_tags for tag in tags)
    for tag in flattened:
        gene = Gene.select(graph, tag).first()
        if not gene or not tag.startswith('Rv'):
            continue
        ortholog = fetch_ortholog(locus_tag=str(tag))
        if not ortholog:
            continue
        orthologous_gene = Gene.select(graph, str(ortholog)).first()
        if not orthologous_gene:
            continue
        gene.orthologous_to.add(orthologous_gene)
        orthologous_gene.orthologous_to_.add(gene)
        graph.push(gene)
        graph.push(orthologous_gene)
    sys.stdout.write("\nMapped Orthologs")
def create_variant_site_nodes(self, record, known_sites, annotation,
                              v_set=None, c_set=None):
    """
    Create (or reuse) the Variant node for one annotated record.

    :param record: record whose ``POS``, ``CHROM`` and ``REF`` are read
    :param known_sites: dict keyed by ``str(pos) + gene`` holding the
        Variant already created for that site/gene pair; updated in place
    :param annotation: indexable annotation fields; index 0 is used as the
        alt allele, 1 as effect, 2 as impact, 4 as gene, 7 as biotype and
        10 (falling back to 9 when empty) as consequence
        (presumably SnpEff ANN fields -- confirm with the caller)
    :param v_set: optional variant set; its ``name`` prefixes the
        variant's ``pk`` and the variant is added to ``has_variant``
    :param c_set: optional call set the variant is linked to
    :return: ``known_sites``
    """
    pos = record.POS
    gene_name = annotation[4]
    consequence = annotation[10] if annotation[10] != '' else annotation[9]

    # A variant can affect multiple genes.
    # E.g. a variant can be DOWNSTREAM from one gene and
    # UPSTREAM from another gene, so sites are keyed by position AND gene.
    gene_pos = str(pos) + gene_name

    if gene_pos in known_sites:
        # Already seen (e.g. in another VCF file): reuse the node. The
        # lookup must use the same composite key the dict is filled with;
        # testing the bare position never matches.
        v_site = known_sites[gene_pos]
    else:
        # We don't know about this variant site yet.
        # v_set is optional, so only use its name when it was supplied.
        pk_prefix = v_set.name if v_set is not None else ""
        v_site = Variant(chrom=str(record.CHROM), pos=pos,
                         ref_allele=str(record.REF),
                         alt_allele=str(annotation[0]),
                         gene=gene_name, consequence=consequence,
                         pk=pk_prefix + gene_pos,
                         impact=annotation[2])
        v_site.biotype = annotation[7]
        v_site.effect = annotation[1]
        self.graph.create(v_site)
        known_sites[gene_pos] = v_site

    if c_set:
        v_site.belongs_to_cset.add(c_set)
        c_set.has_variants.add(v_site)

    gene = Gene.select(self.graph, str(v_site.gene)).first()
    if gene:
        v_site.occurs_in.add(gene)
    self.graph.push(v_site)

    if v_set:
        v_set.has_variant.add(v_site)
        self.graph.push(v_set)
    return known_sites
def create_operon_nodes(text_file=None):
    """
    Create Operon nodes and link them to their member genes.

    Every line of the tab-separated file that contains the text 'OPERON'
    produces one ``Operon``: column 11 (a comma-separated list of locus
    tags) becomes its ``uniquename`` and column 7 its ``description``.
    Each locus tag that resolves to a ``Gene`` is added to the operon and
    receives ``member_of`` / ``co_regulated`` relationships.

    :param text_file: path of the tab-separated operon file; the default
        ``None`` is not usable because ``open`` rejects it
    :return: None
    """
    sys.stdout.write("\nAdding operon data...")
    with open(text_file) as handle:
        for line in handle:
            if 'OPERON' not in line:
                continue
            tab_split = line.split('\t')
            if len(tab_split) < 12:
                # Truncated row: column 11 does not exist.
                continue
            # strip() drops the line terminator when column 11 is the
            # last field, so it does not end up in the uniquename.
            locus_operon = tab_split[11].strip()
            description = tab_split[7]

            operon = Operon()
            # Must we use the product as the uniquename
            operon.uniquename = locus_operon
            operon.description = description
            graph.create(operon)

            genes = locus_operon.split(',')
            for locus_tag in genes:
                gene = Gene.select(graph, locus_tag.strip()).first()
                if not gene:
                    continue
                gene.member_of.add(operon)
                if len(genes) == 1:
                    gene.co_regulated.add(gene)
                else:
                    # Let's not build reverse co-regulated rel
                    # NOTE(review): every member is linked to genes[1:],
                    # which contains the member itself for all but the
                    # first tag -- confirm whether only the tags after
                    # the current one were intended.
                    for g_id in genes[1:]:
                        g = Gene.select(graph, g_id.strip()).first()
                        if g:
                            gene.co_regulated.add(g)
                graph.push(gene)
                operon.gene.add(gene)
                graph.push(operon)
def create_gene_nodes(feature, organism):
    """
    Build a Gene node from a feature and attach it to its organism.

    The name and unique name come from ``get_feature_name(feature)`` (each
    falls back to the other); description, biotype and parent come from
    ``feature.qualifiers``. The created gene is also stored in
    ``gene_dict`` under its unique name.

    :param feature: feature providing ``qualifiers``
    :param organism: organism object the gene belongs to
    :return: None
    """
    feature_names = get_feature_name(feature)
    display_name = feature_names.get("Name", feature_names.get("UniqueName"))
    unique_name = feature_names.get("UniqueName", display_name)

    qualifiers = feature.qualifiers
    description = qualifiers.get("description", "")
    biotype = qualifiers['biotype'][0]
    raw_parent = qualifiers.get("Parent", " ")[0]
    # Keep only what follows the first ':'; without a ':' keep everything.
    _, colon, after_colon = raw_parent.partition(':')
    parent = after_colon if colon else raw_parent

    gene = Gene()
    gene.name = display_name
    gene.uniquename = unique_name
    gene.parent = parent
    gene.biotype = biotype
    gene.description = description
    graph.create(gene)

    gene.belongs_to.add(organism)
    graph.push(gene)
    gene_dict[unique_name] = gene
def create_known_mutation_nodes(**kwargs):
    """
    Create the VariantSet, CallSet and Variant nodes of a known mutation.

    All values are read with ``kwargs.get``. Recognised keys:
    ``vset_name``, ``vset_owner``, ``cset_name``, ``chrom``, ``pos``,
    ``ref_allele``, ``alt_allele``, ``gene``, ``pk``, ``consequence``,
    ``loc_in_seq``, ``promoter``, ``biotype``, ``drug_name``, ``sources``
    and ``drugbank_id`` (an iterable of ids, or a single id string).

    When ``drug_name`` is "aminoglycosides" or "fluoroquinolones" the
    variant is marked resistant to every ``Drug`` whose name matches a
    member of that class. Otherwise each id in ``drugbank_id`` is looked
    up, and a new ``Drug`` is created when it is unknown and a
    ``drug_name`` was given. The variant is finally linked to the first
    ``Gene`` (or, failing that, ``RRna``) whose name or uniquename
    contains the ``gene`` value.

    :return: None
    """
    fluoroquinolones = [
        "ciprofloxacin", "ofloxacin", "levofloxacin", "moxifloxacin"
    ]
    aminoglycosides = ["amikacin", "kanamycin", "streptomycin", "capreomycin"]

    v_set = VariantSet(name=kwargs.get("vset_name", ""),
                       owner=kwargs.get("vset_owner", ""))
    call_set = CallSet(name=kwargs.get("cset_name", ""))
    v_set.has_callsets.add(call_set)
    call_set.belongs_to_vset.add(v_set)
    graph.create(v_set)
    graph.create(call_set)

    variant = Variant(chrom=kwargs.get("chrom", ""),
                      pos=kwargs.get("pos", ""),
                      ref_allele=kwargs.get("ref_allele", ""),
                      alt_allele=kwargs.get("alt_allele", ""),
                      gene=kwargs.get("gene", ""),
                      pk=kwargs.get("pk", ""),
                      consequence=kwargs.get("consequence", ""))
    variant.loc_in_seq = kwargs.get("loc_in_seq")
    variant.promoter = kwargs.get("promoter")
    variant.biotype = kwargs.get("biotype")
    variant.drug = kwargs.get("drug_name")
    variant.sources = kwargs.get("sources")
    variant.belongs_to_cset.add(call_set)
    call_set.has_variants.add(variant)

    def map_drug_class_to_variant(_class):
        """
        Map all drugs in class to variant
        :param _class: list of drug names belonging to the class
        :return: None
        """
        for item in _class:
            drugs = Drug.select(graph).where(
                "_.name=~'(?i).*{}.*'".format(item))
            for _drug in drugs:
                variant.resistant_to.add(_drug)

    drug_name = kwargs.get("drug_name")
    if drug_name == "aminoglycosides":
        map_drug_class_to_variant(aminoglycosides)
    elif drug_name == "fluoroquinolones":
        map_drug_class_to_variant(fluoroquinolones)
    else:
        # A missing id list means "no drugs"; a bare string is one id and
        # must not be iterated character by character.
        drugbank_ids = kwargs.get("drugbank_id") or ()
        if isinstance(drugbank_ids, str):
            drugbank_ids = [drugbank_ids]
        for drug_id in drugbank_ids:
            if not drug_id:
                continue
            # Normalise once so the lookup and a newly created node use
            # the same accession (the lookup is upper-cased).
            accession = str(drug_id).strip().upper()
            if not accession:
                continue
            drug = Drug.select(graph, accession).first()
            if drug:
                variant.resistant_to.add(drug)
            elif drug_name:
                drug = Drug(accession=accession,
                            name=drug_name.capitalize())
                graph.create(drug)
                variant.resistant_to.add(drug)

    raw_gene = kwargs.get("gene")
    # Without a gene value the pattern would be '.*none.*' or '.*.*' and
    # link the variant to an unrelated node, so the lookup is skipped.
    gene_name = str(raw_gene).lower() if raw_gene else ""
    if gene_name:
        # NOTE(review): gene_name is interpolated into the where-clause
        # unescaped; quotes or regex metacharacters would alter the query.
        name_filter = (
            f"_.name=~'(?i).*{gene_name}.*' OR _.uniquename=~'(?i).*{gene_name}.*'"
        )
        gene = Gene.select(graph).where(name_filter).first()
        if gene:
            variant.occurs_in.add(gene)
        else:
            rna = RRna.select(graph).where(name_filter).first()
            if rna:
                variant.occurs_in_.add(rna)

    graph.create(variant)
    graph.push(call_set)
def test_rv0001():
    """Rv0001 is stored as dnaA and its category/residues are not ''."""
    rv0001 = Gene.select(graph, "Rv0001").first()
    expected_name = "dnaA"
    assert rv0001.name == expected_name
    # Both checks only reject the empty string.
    assert rv0001.category != ""
    assert rv0001.residues != ""