def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon, and their gene products, to the graph.

    Reads the tab-separated file registered for ``taxid`` in ``self.files``
    (located under ``self.rawdir``).  The first line is treated as a header
    and compared against the expected column labels; every following line
    describes one gene.  For each gene this adds the gene class, its
    NCBIGene / HGNC cross references, its taxon, and (when present) its
    Ensembl peptide and UniProtKB gene products.

    Writes to ``self.testgraph`` in test mode, otherwise to ``self.graph``.

    :param taxid: NCBI taxon number as a string, e.g. '9606' for human
    :param limit: outside of test mode, stop once the reader's line number
        exceeds this value; None processes the whole file
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))

    # The hgnc_id column is only kept for human data.
    col = list(self.columns['bmq_attributes'])
    if taxid != '9606' and 'hgnc_id' in col:
        col.remove('hgnc_id')
    col_exp = [
        self.columns['bmq_headers'][self.columns['bmq_attributes'].index(x)]
        for x in col]

    LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')

        # An empty file has no header line; bail out instead of letting
        # StopIteration escape from next().
        header = next(reader, None)
        if header is None:
            LOG.warning("No header line in %s, nothing to process", raw)
            return
        if not self.check_fileheader(col_exp, header):
            # Best effort: keep parsing with the configured column order.
            LOG.warning(
                "Unexpected header in %s, parsing with configured columns", raw)

        # Column positions do not change per row, so resolve them once.
        gene_idx = col.index('ensembl_gene_id')
        name_idx = col.index('external_gene_name')
        description_idx = col.index('description')
        biotype_idx = col.index('gene_biotype')
        entrez_idx = col.index('entrezgene_id')
        peptide_idx = col.index('ensembl_peptide_id')
        uniprot_idx = col.index('uniprotswissprot')
        # in the case of human genes, we also get the hgnc id,
        hgnc_idx = None
        if taxid == '9606' and 'hgnc_id' in col:
            hgnc_idx = col.index('hgnc_id')

        for row in reader:
            ensembl_gene_id = row[gene_idx]
            external_gene_name = row[name_idx]
            description = row[description_idx].strip()
            gene_biotype = row[biotype_idx].strip()
            entrezgene = row[entrez_idx].strip()
            ensembl_peptide_id = row[peptide_idx].strip()
            uniprotswissprot = row[uniprot_idx].strip()
            hgnc_curie = None
            if hgnc_idx is not None:
                hgnc_curie = row[hgnc_idx].strip()

            # In test mode keep only the genes listed in self.gene_ids;
            # rows without an NCBI gene id are always kept.
            if self.test_mode and entrezgene != '' and \
                    entrezgene not in self.gene_ids:
                continue

            gene_id = 'ENSEMBL:' + ensembl_gene_id
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)
            if description == '':
                description = None

            # Biotypes that do not resolve fall back to 'polypeptide'.
            gene_type_id = self.resolve(
                gene_biotype, mandatory=False,
                default=self.globaltt['polypeptide'])
            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                if taxid == '9606':
                    # Use HGNC for eq in human data
                    model.addXref(gene_id, entrez_curie)
                else:
                    model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_curie:
                model.addEquivalentClass(gene_id, hgnc_curie)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if ensembl_peptide_id:
                peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                geno.addGeneProduct(gene_id, peptide_curie)
                # The UniProtKB accession is cross-referenced from the
                # peptide, so it is only recorded when a peptide exists.
                if uniprotswissprot != '':
                    uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                    model.addIndividualToGraph(
                        uniprot_curie, None, gene_type_id)
                    geno.addGeneProduct(gene_id, uniprot_curie)
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon, and their gene products, to the graph.

    Reads the headerless tab-separated file registered for ``taxid`` in
    ``self.files`` (located under ``self.rawdir``).  The first seven columns
    are: Ensembl gene id, gene name, description, biotype, NCBI gene id,
    Ensembl peptide id and UniProt/SwissProt accession.  Human files
    ('9606') carry the HGNC id as an eighth column.

    Writes to ``self.testgraph`` in test mode, otherwise to ``self.graph``.

    :param taxid: NCBI taxon number as a string, e.g. '9606' for human
    :param limit: outside of test mode, stop once more than this many rows
        have been processed; None processes the whole file
    :raises ValueError: when a row has fewer columns than are read from it
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    is_human = taxid == '9606'
    # Seven columns are always unpacked; human rows also supply the HGNC id.
    min_columns = 8 if is_human else 7
    line_counter = 0

    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                raise ValueError("Data error for file %s" % raw)

            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene, peptide_id,
             uniprot_swissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[7] if is_human else None

            # In test mode keep only the genes listed in self.gene_ids;
            # rows without an NCBI gene id are always kept.
            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)
            if description == '':
                description = None

            # The biotype column is not used to type the gene class here.
            gene_type_id = None
            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            # Only create product individuals for ids that are present;
            # an empty id would yield a bare 'ENSEMBL:' / 'UniProtKB:' node.
            product_type = self._get_gene_type("polypeptide")
            if peptide_id != '':
                model.addIndividualToGraph(peptide_curie, None, product_type)
            if uniprot_swissprot != '':
                model.addIndividualToGraph(uniprot_curie, None, product_type)

            if entrezgene != '':
                model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if peptide_id != '':
                geno.addGeneProduct(gene_id, peptide_curie)
                # The UniProtKB accession is cross-referenced from the
                # peptide, so it is only linked when a peptide exists.
                if uniprot_swissprot != '':
                    geno.addGeneProduct(gene_id, uniprot_curie)
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon, and their gene products, to the graph.

    Reads the headerless tab-separated file registered for ``taxid`` in
    ``self.files`` (located under ``self.rawdir``).  The first seven columns
    are: Ensembl gene id, gene name, description, biotype, NCBI gene id,
    Ensembl peptide id and UniProt/SwissProt accession.  Human files
    ('9606') carry the HGNC id as an eighth column.

    Writes to ``self.testgraph`` in test mode, otherwise to ``self.graph``.

    :param taxid: NCBI taxon number as a string, e.g. '9606' for human
    :param limit: outside of test mode, stop once more than this many rows
        have been processed; None processes the whole file
    :raises ValueError: when a row has fewer columns than are read from it
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    is_human = taxid == '9606'
    # Seven columns are always unpacked; human rows also supply the HGNC id.
    min_columns = 8 if is_human else 7
    line_counter = 0

    LOG.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                # row is a list: hand it to the logger as an argument
                # rather than concatenating it onto the message.
                LOG.warning("Too few columns in: %s", row)
                raise ValueError("Data error for file %s" % raw)

            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene, ensembl_peptide_id,
             uniprotswissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[7] if is_human else None

            # In test mode keep only the genes listed in self.gene_ids;
            # rows without an NCBI gene id are always kept.
            if self.test_mode and entrezgene != '' and \
                    int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)
            if description == '':
                description = None

            gene_biotype = gene_biotype.strip()
            gene_type_id = self.resolve(gene_biotype, False)
            if gene_type_id == gene_biotype:
                # did not resolve: the label came back unchanged
                gene_type_id = self.globaltt['polypeptide']

            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            # Only create product individuals for ids that are present;
            # an empty id would yield a bare 'ENSEMBL:' / 'UniProtKB:' node.
            if ensembl_peptide_id != '':
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
            if uniprotswissprot != '':
                model.addIndividualToGraph(uniprot_curie, None, gene_type_id)

            if entrezgene != '':
                if is_human:
                    # Use HGNC for eq in human data
                    model.addXref(gene_id, entrez_curie)
                else:
                    model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if ensembl_peptide_id != '':
                geno.addGeneProduct(gene_id, peptide_curie)
                # The UniProtKB accession is cross-referenced from the
                # peptide, so it is only linked when a peptide exists.
                if uniprotswissprot != '':
                    geno.addGeneProduct(gene_id, uniprot_curie)
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.test_mode \
                    and limit is not None and line_counter > limit:
                break
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon, and their gene products, to the graph.

    Reads the headerless tab-separated file registered for ``taxid`` in
    ``self.files`` (located under ``self.rawdir``).  The first seven columns
    are: Ensembl gene id, gene name, description, biotype, NCBI gene id,
    Ensembl peptide id and UniProt/SwissProt accession.  Human files
    ('9606') carry the HGNC id as an eighth column.

    Writes to ``self.testgraph`` in test mode, otherwise to ``self.graph``.

    :param taxid: NCBI taxon number as a string, e.g. '9606' for human
    :param limit: outside of test mode, stop once more than this many rows
        have been processed; None processes the whole file
    :raises ValueError: when a row has fewer columns than are read from it
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    is_human = taxid == '9606'
    # Seven columns are always unpacked; human rows also supply the HGNC id.
    min_columns = 8 if is_human else 7
    line_counter = 0

    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                raise ValueError("Data error for file %s" % raw)

            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene, peptide_id,
             uniprot_swissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[7] if is_human else None

            # In test mode keep only the genes listed in self.gene_ids;
            # rows without an NCBI gene id are always kept.
            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)
            if description == '':
                description = None

            # The biotype column is not used to type the gene class here.
            gene_type_id = None
            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            # Only create product individuals for ids that are present;
            # an empty id would yield a bare 'ENSEMBL:' / 'UniProtKB:' node.
            product_type = self._get_gene_type("polypeptide")
            if peptide_id != '':
                model.addIndividualToGraph(peptide_curie, None, product_type)
            if uniprot_swissprot != '':
                model.addIndividualToGraph(uniprot_curie, None, product_type)

            if entrezgene != '':
                model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if peptide_id != '':
                geno.addGeneProduct(gene_id, peptide_curie)
                # The UniProtKB accession is cross-referenced from the
                # peptide, so it is only linked when a peptide exists.
                if uniprot_swissprot != '':
                    geno.addGeneProduct(gene_id, uniprot_curie)
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break