def _load_idmapping(datadir, session, organism_set):
    '''
    loaded = _load_idmapping(datadir, session, organism_set)

    Load UniProt ``idmapping_selected.tab.gz`` into the database as
    ``Translation`` rows (accession <-> name <-> Ensembl gene/peptide ids,
    EMBL CDS -> name).

    Parameters
    ----------
    datadir : str
        Directory containing idmapping_selected.tab.gz
    session : sqlalchemy session
        Used for add() & commit()
    organism_set : set of str or None
        If not None, only load entries whose organism (guessed from the
        UniProtKB ID suffix) is in this set.

    Returns
    -------
    loaded : int
        Nr of input lines loaded
    '''
    def add(input_ns, input_n, output_ns, output_n):
        session.add(Translation(input_ns, input_n, output_ns, output_n))

    infile = _gzip_open(path.join(datadir, 'idmapping_selected.tab.gz'))
    loaded = 0
    seen_IDs = set()
    for line in infile:
        # The file has exactly 22 tab-separated columns; unpack them all
        # (even the unused ones) so that a format change fails loudly.
        UniProtKB_AC, \
        UniProtKB_ID, \
        GeneID_EntrezGene, \
        RefSeq, \
        GI, \
        PDB, \
        GO, \
        UniRef100, \
        UniRef90, \
        UniRef50, \
        UniParc, \
        PIR, \
        NCBI_taxon, \
        MIM, \
        UniGene, \
        PubMed, \
        EMBL, \
        EMBL_CDS, \
        Ensembl, \
        Ensembl_TRS, \
        Ensembl_PRO, \
        Additional_PubMed = line[:-1].split('\t')
        if organism_set is not None and \
            _name_guess(UniProtKB_ID) not in organism_set:
            continue
        # NOTE(review): Ensembl / Ensembl_PRO are still the raw
        # semicolon-separated strings here; only the per-ID block below
        # splits them.  Preserved as-is -- confirm this is intentional.
        add('uniprot:accession', UniProtKB_AC, 'uniprot:name', UniProtKB_ID)
        add('uniprot:accession', UniProtKB_AC, 'ensembl:gene_id', Ensembl)
        add('uniprot:accession', UniProtKB_AC, 'ensembl:peptide_id', Ensembl_PRO)
        for embl_cds in EMBL_CDS.split('; '):
            add('embl:cds', embl_cds, 'uniprot:name', UniProtKB_ID)
        if UniProtKB_ID not in seen_IDs:
            # First time we see this ID: add the name-keyed mappings once.
            Ensembl = Ensembl.split('; ')
            Ensembl_PRO = Ensembl_PRO.split('; ')
            add('uniprot:name', UniProtKB_ID, 'ensembl:gene_id', Ensembl[0])
            add('uniprot:name', UniProtKB_ID, 'ensembl:peptide_id', Ensembl_PRO[0])
            for e in Ensembl:
                add('ensembl:gene_id', e, 'uniprot:name', UniProtKB_ID)
            for e in Ensembl_PRO:
                add('ensembl:peptide_id', e, 'uniprot:name', UniProtKB_ID)
            seen_IDs.add(UniProtKB_ID)
        # commit() is slow; batch it (same pattern as the other loaders)
        if len(session.new) > 512:
            session.commit()
        loaded += 1
    session.commit()
    return loaded
def load(datadir, create_session=None, mouse_only=True):
    """
    nr_loaded = load(datadir, create_session={backend.create_session}, mouse_only=True)

    Load NCBI gene2ensembl cross-references into the database as
    ``Translation`` rows (refseq:accession <-> ensembl gene/peptide ids).

    Parameters
    ----------
    datadir : str
        Directory containing the gene2ensembl.gz file
    create_session : callable, optional
        a callable that returns an sqlalchemy session
    mouse_only : bool, optional
        whether to only load mouse data

        Currently, only ``mouse_only=True`` is implemented!

    Returns
    -------
    nr_loaded : int
        Nr. of entries loaded
    """
    from waldo.backend import call_create_session
    filename = path.join(datadir, _inputfilename)
    session = call_create_session(create_session)
    infile = _gzip_open(filename)
    infile.readline()  # skip the header line
    if not mouse_only:
        raise NotImplementedError("waldo.refseq.load: Cannot load non-mouse entries")
    nr_loaded = 0
    for line in infile:
        tax_id, gene_id, ensembl_gene, rna_accession, ensembl_trans, \
            protein_accession, ensembl_peptide = line.strip().split("\t")
        # Mouse Ensembl protein ids start with ENSMUSP
        if "ENSMUSP" not in ensembl_peptide:
            continue
        # Strip any ".<version>" suffix.  The previous 2-tuple unpack of
        # split(".") crashed on accessions without a version (or with more
        # than one dot); indexing [0] handles both.
        protein_accession = protein_accession.split(".")[0]
        session.add(Translation("ensembl:peptide_id", ensembl_peptide, "refseq:accession", protein_accession))
        session.add(Translation("refseq:accession", protein_accession, "ensembl:peptide_id", ensembl_peptide))
        session.add(Translation("ensembl:gene_id", ensembl_gene, "refseq:accession", protein_accession))
        session.add(Translation("refseq:accession", protein_accession, "ensembl:gene_id", ensembl_gene))
        nr_loaded += 1
        # commit() is slow: batch instead of committing per line
        if len(session.new) > 512:
            session.commit()
    session.commit()
    return nr_loaded
def load(datadir, create_session=None, species=('Mus Musculus', 'H**o Sapiens')):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session}, species=('Mus Musculus', 'H**o Sapiens'))

    Load NOG entries file into database

    Parameters
    ----------
    datadir : str
        Directory containing the maNOG.mapping.txt.gz file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session
    species : sequence
        species to load

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    if datadir is None:
        datadir = _datadir
    nr_loaded = 0
    filename = path.join(datadir, _inputfilename)
    inputfile = _gzip_open(filename)
    inputfile.readline()  # skip the header line
    for line in inputfile:
        prot_name, \
            start, \
            end, \
            group, \
            description = line.strip().split('\t')
        # protein names look like "<taxon>.<name>"; keep only the name part
        _, prot_name = prot_name.split('.')
        # group ids look like "maNOG12345"; keep the numeric part
        group = int(group[len('maNOG'):])
        if any(_accept_species(sp, prot_name) for sp in species):
            session.add(models.NogEntry(prot_name, group))
            nr_loaded += 1
            # commit() is slow: batch instead of committing per entry
            # (same pattern as the other loaders in this package)
            if (nr_loaded % 512) == 0:
                session.commit()
    session.commit()
    return nr_loaded
def load(datadir, create_session=None):
    '''
    nr_entries = load(datadir, create_session={backend.create_session})

    Load Gene Ontology OBO file into database

    Parameters
    ----------
    datadir : str
        Directory containing GO files
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    nr_entries : int
        Nr of entries loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    filename = path.join(datadir, _inputfilename)
    if not path.exists(filename) and path.exists(filename + '.gz'):
        # BUG FIX: previously opened _gzip_open(filename), i.e. the
        # non-existent plain path, instead of the .gz file found above.
        infile = _gzip_open(filename + '.gz')
    else:
        infile = open(filename)
    loaded = 0
    for term in _parse_terms(infile):
        if term['is_obsolete']:
            continue
        term_id = term['id'][0]
        session.add(
            Term(id=term_id,
                name=term['name'][0],
                namespace=term['namespace'][0]))
        for rel in ('is_a', 'part_of'):
            for t in term[rel]:
                # BUG FIX: previously passed a local `id` that was set to
                # None and never updated, so every relationship had a None
                # source; use the current term's own id instead.
                session.add(TermRelationship(term_id, t, rel))
        loaded += 1
        # This check is ugly, but commit() is rather slow
        # The speed up is worth it:
        if (loaded % 512) == 0:
            session.commit()
    session.commit()
    return loaded
def read(input):
    """
    for seq in read(input):
        ...

    Read a fasta file

    Iterates over the sequences in the file as `sequence` objects.

    comments (lines starting with ';') are ignored.

    Parameters
    ----------
    `input` : either a file or the name of a file.
    """
    if isinstance(input, str):
        if input.endswith('.gz'):
            input = _gzip_open(input)
        else:
            # `file()` is a Python-2-only builtin; open() is equivalent
            # there and also works on Python 3.
            input = open(input)
    seq_items = []
    header = None
    for line in input:
        line = line.strip()
        if not line or line[0] == ';':
            # skip blank lines and ';' comment lines
            continue
        elif line[0] == '>':
            if header is not None:
                # a new header: emit the record accumulated so far
                yield sequence(header, "".join(seq_items))
                seq_items = []
            header = line[1:]  # eat '>'
        else:
            seq_items.append(line)
    if header is not None:
        # the last record has no following '>' to trigger the yield above
        yield sequence(header, "".join(seq_items))
def _load_uniprot_sprot(datadir, session, organism_set):
    # Stream-parse the UniProt/SwissProt XML dump and load each <entry>
    # element as a models.Entry (with its comments, references and GO
    # annotations).  organism_set: if not None, only entries whose
    # scientific organism names intersect this set are loaded.
    # Returns the number of entries loaded.
    input = _gzip_open(path.join(datadir, _inputfilename))
    loaded = 0
    # XPath selectors compiled once up-front and reused for every entry
    organisms_select = etree.XPath('up:organism/*[@type="scientific"]/text()', namespaces=_ns)
    accession_select = etree.XPath('up:accession/text()', namespaces=_ns)
    rname_select = etree.XPath('up:protein/up:recommendedName/up:fullName/text()', namespaces=_ns)
    primary_name_select = etree.XPath('up:gene/up:name[@type="primary"]/text()', namespaces=_ns)
    citation_select = etree.XPath('up:reference/up:citation', namespaces=_ns)
    author_select = etree.XPath('up:authorList/up:person/text()', namespaces=_ns)
    for _event, element in _safe_iterparse(input, tag=_p+'entry'):
        organisms = map(unicode, organisms_select(element))
        if organism_set is not None:
            if not len(set(organisms) & organism_set):
                # not a requested organism: free the element and skip it
                _cleanup(element)
                continue
        accessions = map(unicode, accession_select(element))
        name = unicode(element.findtext(_p+'name'))
        rname = _safe_head(rname_select(element))
        gname = _safe_head(primary_name_select(element))
        sequence = unicode(element.findtext(_p+'sequence'))
        comments = [models.Comment(c.get('type'), unicode(c.findtext(_p+'text')))
                        for c in element.iterchildren(_p+'comment')]
        references = []
        go_annotations = []
        for citation in citation_select(element):
            ref = citation.getparent()
            key = ref.get('key')
            type = citation.get('type')
            title = citation.findtext(_p+'title')
            if title is None or key is None:
                # citations without a title or a key are skipped
                continue
            authors = author_select(citation)
            authors = " AND ".join(authors)
            dbReference = citation.findall(_p + 'dbReference')
            # Prefer a DOI cross-reference; fall back to PubMed if absent
            dbrefs = filter(lambda x : x.get('type') == 'DOI', dbReference)
            dbRefString = ''
            if len(dbrefs):
                dbRefString = "%s:%s" % (dbrefs[0].get('type'), dbrefs[0].get('id'))
            else:
                dbrefs = filter(lambda x : x.get('type') == 'PubMed', dbReference)
                if len(dbrefs):
                    dbRefString = "%s:%s" % (dbrefs[0].get('type'), dbrefs[0].get('id'))
            references.append(models.Reference(key, type, title, authors, dbRefString))
        for dbref in element.iterchildren(_p+'dbReference'):
            # NOTE(review): the UniProt XML schema uses type="GO"
            # (upper-case); comparing against 'Go' may never match, so no
            # GO annotations would ever be collected -- verify against the
            # actual input data.
            if dbref.get('type') == 'Go':
                id = dbref.get('id')
                evidence_code = ''
                for prop in dbref.findall(_p+'property'):
                    if prop.get('type') == 'evidence':
                        evidence_code = prop.get('value');
                go_annotations.append(models.GoAnnotation(id, evidence_code))
        # release the parsed element before adding the entry to keep the
        # streaming parse's memory use bounded
        _cleanup(element)
        entry = models.Entry(name, rname, gname, accessions, comments, references, go_annotations, sequence, organisms)
        session.add(entry)
        loaded += 1
        # commit() is slow; batch it rather than committing per entry
        if len(session.new) > 512:
            session.commit()
    session.commit()
    return loaded