Esempio n. 1
0
def _load_idmapping(datadir, session, organism_set):
    '''
    loaded = _load_idmapping(datadir, session, organism_set)

    Load UniProt ID cross-references from idmapping_selected.tab.gz
    into the database as Translation rows.

    Parameters
    ----------
    datadir : str
        Directory containing the idmapping_selected.tab.gz file
    session : sqlalchemy session
        Rows are added to and committed through this session
    organism_set : set or None
        If not None, only lines whose organism (as guessed from the
        UniProtKB ID by _name_guess) is in this set are loaded

    Returns
    -------
    loaded : int
        Nr. of input lines loaded
    '''
    def add(input_ns, input_n, output_ns, output_n):
        session.add(Translation(input_ns, input_n, output_ns, output_n))

    mapfile = _gzip_open(path.join(datadir, 'idmapping_selected.tab.gz'))
    loaded = 0
    seen_IDs = set()
    for line in mapfile:
        # idmapping_selected.tab is a 22-column tab-separated file; only a
        # few columns are used, but all are unpacked by name for clarity.
        (UniProtKB_AC, UniProtKB_ID, GeneID_EntrezGene, RefSeq, GI, PDB,
         GO, UniRef100, UniRef90, UniRef50, UniParc, PIR, NCBI_taxon,
         MIM, UniGene, PubMed, EMBL, EMBL_CDS, Ensembl, Ensembl_TRS,
         Ensembl_PRO, Additional_PubMed) = line[:-1].split('\t')
        if organism_set is not None and \
                _name_guess(UniProtKB_ID) not in organism_set:
            continue

        # NOTE(review): the accession-keyed translations store the raw
        # (possibly '; '-joined) Ensembl fields, while the name-keyed ones
        # below split them -- this asymmetry is preserved as-is.
        add('uniprot:accession', UniProtKB_AC, 'uniprot:name', UniProtKB_ID)
        add('uniprot:accession', UniProtKB_AC, 'ensembl:gene_id', Ensembl)
        add('uniprot:accession', UniProtKB_AC, 'ensembl:peptide_id', Ensembl_PRO)
        for embl_cds in EMBL_CDS.split('; '):
            add('embl:cds', embl_cds, 'uniprot:name', UniProtKB_ID)
        if UniProtKB_ID not in seen_IDs:
            Ensembl = Ensembl.split('; ')
            Ensembl_PRO = Ensembl_PRO.split('; ')
            add('uniprot:name', UniProtKB_ID, 'ensembl:gene_id', Ensembl[0])
            add('uniprot:name', UniProtKB_ID, 'ensembl:peptide_id', Ensembl_PRO[0])
            for e in Ensembl:
                add('ensembl:gene_id', e, 'uniprot:name', UniProtKB_ID)
            for e in Ensembl_PRO:
                add('ensembl:peptide_id', e, 'uniprot:name', UniProtKB_ID)
            seen_IDs.add(UniProtKB_ID)
        # commit() is slow, so only commit every ~512 pending objects
        if len(session.new) > 512:
            session.commit()
        loaded += 1
    mapfile.close()
    session.commit()
    return loaded
Esempio n. 2
0
def load(datadir, create_session=None, mouse_only=True):
    """
    nr_loaded = load(datadir, create_session={backend.create_session}, mouse_only=True)

    Load gene2ensembl RefSeq/Ensembl cross-references into the database.

    Parameters
    ----------
    datadir : str
        Directory containing the gene2ensembl.gz file
    create_session : callable, optional
        a callable that returns an sqlalchemy session
    mouse_only : bool, optional
        whether to only load mouse data
        Currently, only ``mouse_only=True`` is implemented!

    Returns
    -------
    nr_loaded : int
        Nr. of entries loaded
    """
    from waldo.backend import call_create_session

    # Reject unsupported modes before doing any work
    if not mouse_only:
        raise NotImplementedError("waldo.refseq.load: Cannot load non-mouse entries")

    filename = path.join(datadir, _inputfilename)
    session = call_create_session(create_session)
    infile = _gzip_open(filename)
    infile.readline()  # skip the header line

    nr_loaded = 0
    for line in infile:
        (tax_id, gene_id, ensembl_gene, rna_accession, ensembl_trans,
         protein_accession, ensembl_peptide) = line.strip().split("\t")
        # Only mouse peptides (Ensembl mouse protein IDs start with ENSMUSP)
        if "ENSMUSP" not in ensembl_peptide:
            continue
        # Drop the ".<version>" suffix from the RefSeq accession
        # (partition is safe even if no dot is present)
        protein_accession = protein_accession.partition(".")[0]
        session.add(Translation("ensembl:peptide_id", ensembl_peptide, "refseq:accession", protein_accession))
        session.add(Translation("refseq:accession", protein_accession, "ensembl:peptide_id", ensembl_peptide))
        session.add(Translation("ensembl:gene_id", ensembl_gene, "refseq:accession", protein_accession))
        session.add(Translation("refseq:accession", protein_accession, "ensembl:gene_id", ensembl_gene))
        nr_loaded += 1
        # commit() is slow; batch it as the other loaders in this file do
        if len(session.new) > 512:
            session.commit()
    session.commit()
    return nr_loaded
Esempio n. 3
0
def load(datadir, create_session=None, species=('Mus Musculus', 'H**o Sapiens')):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session}, species=('Mus Musculus', 'H**o Sapiens'))

    Load NOG entries file into database

    Parameters
    ----------
    datadir : str
        Directory containing the maNOG.mapping.txt.gz file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session
    species : sequence
        species to load

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    if datadir is None: datadir = _datadir
    nr_loaded = 0
    filename = path.join(datadir, _inputfilename)
    inputfile = _gzip_open(filename)
    header = inputfile.readline()  # skip the header line
    for line in inputfile:
        prot_name, \
            start, \
            end, \
            group, \
            description = line.strip().split('\t')
        # prot_name is of the form '<taxid>.<name>'; keep only the name
        _, prot_name = prot_name.split('.')
        # group is 'maNOG<number>'; keep the numeric part
        group = int(group[len('maNOG'):])
        for sp in species:
            if _accept_species(sp, prot_name):
                session.add(models.NogEntry(prot_name, group))
                nr_loaded += 1
                break
        # commit() is slow; batch it as the other loaders in this file do
        if len(session.new) > 512:
            session.commit()
    session.commit()
    return nr_loaded
Esempio n. 4
0
def load(datadir, create_session=None):
    '''
    nr_entries = load(datadir, create_session={backend.create_session})

    Load Gene Ontology OBO file into database

    Parameters
    ----------
      datadir : Directory containing GO files
      create_session : a callable object that returns an sqlalchemy session
    Returns
    -------
      nr_entries : Nr of entries
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    filename = path.join(datadir, _inputfilename)
    # Fall back to the gzipped version when the plain file is absent.
    # (BUGFIX: the gzipped path must include the '.gz' suffix.)
    if not path.exists(filename) and path.exists(filename + '.gz'):
        input = _gzip_open(filename + '.gz')
    else:
        input = open(filename)

    loaded = 0
    for term in _parse_terms(input):
        if term['is_obsolete']:
            continue
        term_id = term['id'][0]
        session.add(
            Term(id=term_id, name=term['name'][0], namespace=term['namespace'][0]))
        for rel in ('is_a', 'part_of'):
            for t in term[rel]:
                # BUGFIX: relationships were previously created with a
                # never-assigned `id` variable (always None); they must
                # reference the current term's own id.
                session.add(TermRelationship(term_id, t, rel))
        loaded += 1
        # This check is ugly, but commit() is rather slow
        # The speed up is worth it:
        if (loaded % 512) == 0:
            session.commit()
    session.commit()
    return loaded
Esempio n. 5
0
def read(input):
    """
    for seq in read(input):
        ...

    Read a fasta file

    Iterates over the sequences in the file as `sequence` objects.
    Comments (lines starting with ';') are ignored.

    Parameters
    ----------
    input : file-like or str
        either an open file (any iterable of lines) or the name of a
        file; names ending in '.gz' are transparently decompressed.
    """
    if isinstance(input, str):
        if input.endswith('.gz'):
            input = _gzip_open(input)
        else:
            # open() instead of the Python-2-only file() builtin
            input = open(input)
    seq_items = []
    header = None
    for line in input:
        line = line.strip()
        if not line or line.startswith(';'):
            continue
        elif line.startswith('>'):
            # A new header: emit the previous record, if any
            if header is not None:
                yield sequence(header, "".join(seq_items))
                seq_items = []
            header = line[1:]  # eat '>'
        else:
            seq_items.append(line)
    # Emit the final record (files do not end with a '>' line)
    if header is not None:
        yield sequence(header, "".join(seq_items))
Esempio n. 6
0
def _load_uniprot_sprot(datadir, session, organism_set):
    '''
    loaded = _load_uniprot_sprot(datadir, session, organism_set)

    Stream-parse the gzipped UniProt XML file in `datadir` and add one
    models.Entry per <entry> element to `session`.

    Parameters
    ----------
    datadir : str
        Directory containing the input file (`_inputfilename`)
    session : sqlalchemy session
        Entries are added to and committed through this session
    organism_set : set or None
        If not None, entries whose scientific organism names do not
        intersect this set are skipped

    Returns
    -------
    loaded : int
        Nr. of entries loaded
    '''
    input = _gzip_open(path.join(datadir, _inputfilename))
    loaded = 0
    # XPath selectors are compiled once here and applied to every
    # <entry> element inside the loop below.
    organisms_select = etree.XPath('up:organism/*[@type="scientific"]/text()', namespaces=_ns)
    accession_select = etree.XPath('up:accession/text()', namespaces=_ns)
    rname_select = etree.XPath('up:protein/up:recommendedName/up:fullName/text()', namespaces=_ns)
    primary_name_select = etree.XPath('up:gene/up:name[@type="primary"]/text()', namespaces=_ns)
    citation_select = etree.XPath('up:reference/up:citation', namespaces=_ns)
    author_select = etree.XPath('up:authorList/up:person/text()', namespaces=_ns)

    # Incremental parse: each fully-read <entry> is processed and then
    # released via _cleanup() to keep memory bounded.
    for _event, element in _safe_iterparse(input, tag=_p+'entry'):
        organisms = map(unicode, organisms_select(element))

        # Skip (and free) entries for organisms we are not loading
        if organism_set is not None:
            if not len(set(organisms) & organism_set):
                _cleanup(element)
                continue

        accessions = map(unicode, accession_select(element))
        name = unicode(element.findtext(_p+'name'))
        # _safe_head: first match or None (recommended/primary names are optional)
        rname = _safe_head(rname_select(element))
        gname = _safe_head(primary_name_select(element))

        sequence = unicode(element.findtext(_p+'sequence'))
        comments = [models.Comment(c.get('type'), unicode(c.findtext(_p+'text'))) for c in element.iterchildren(_p+'comment')]
        references = []
        go_annotations = []

        for citation in citation_select(element):
            # The citation's parent <reference> carries the citation key
            ref = citation.getparent()
            key = ref.get('key')
            type = citation.get('type')
            title = citation.findtext(_p+'title')
            # References without a title or key are not stored
            if title is None or key is None:
                continue
            authors = author_select(citation)
            authors = " AND ".join(authors)

            # Prefer a DOI reference; fall back to PubMed; else empty
            dbReference = citation.findall(_p + 'dbReference')
            dbrefs = filter(lambda x : x.get('type') == 'DOI', dbReference)
            dbRefString = ''
            if len(dbrefs):
                dbRefString = "%s:%s" % (dbrefs[0].get('type'), dbrefs[0].get('id'))
            else:
                dbrefs = filter(lambda x : x.get('type') == 'PubMed', dbReference)
                if len(dbrefs):
                    dbRefString = "%s:%s" % (dbrefs[0].get('type'), dbrefs[0].get('id'))
            references.append(models.Reference(key, type, title, authors, dbRefString))

        # Collect GO annotations with their evidence codes.
        # NOTE(review): UniProt dbReference types are usually spelled
        # 'GO' (upper-case); confirm 'Go' actually matches the input.
        for dbref in element.iterchildren(_p+'dbReference'):
            if dbref.get('type') == 'Go':
                id = dbref.get('id')
                evidence_code = ''
                for prop in dbref.findall(_p+'property'):
                    if prop.get('type') == 'evidence':
                        evidence_code = prop.get('value');
                go_annotations.append(models.GoAnnotation(id, evidence_code))

        # Release the parsed element before building the ORM objects
        _cleanup(element)

        entry = models.Entry(name, rname, gname, accessions, comments, references, go_annotations, sequence, organisms)
        session.add(entry)
        loaded += 1
        # commit() is slow; batch every ~512 pending objects
        if len(session.new) > 512:
            session.commit()
    session.commit()
    return loaded