Example #1
0
def load(datadir, create_session=None):
    '''
    num_entries = load(datadir, create_session={backend.create_session})

    Import the mouse and human LOCATE database files into the local
    relational database.

    Parameters
    ----------
    datadir : str
        Directory holding the LOCATE database files.
    create_session : callable, optional
        Factory returning an sqlalchemy session; it is handed to
        ``waldo.backend.call_create_session``.

    Returns
    -------
    num_entries : int
        Sum of the values returned by ``_loadfile`` for the two files.

    References
    ----------
    To download database files: http://locate.imb.uq.edu.au/downloads.shtml
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)

    # (file name, organism label) pairs, processed in this order.
    # NOTE(review): the label 'H**o sapiens' looks masked/garbled -- confirm
    # the intended organism name before relying on it.
    sources = (
        (_mouse, 'Mus musculus'),
        (_human, 'H**o sapiens'),
    )
    total = 0
    for fname, organism in sources:
        total += _loadfile(path.join(datadir, fname), organism, session)
    return total
Example #2
0
def load(datadir, create_session=None, organism_set=set([u'Mus musculus', u'H**o sapiens'])):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session}, organism_set={'Mus musculus', 'H**o sapiens'})

    Load Uniprot data into the database in three steps: the SwissProt
    entries, the ID mapping and the secondary accessions.

    Parameters
    ----------
    datadir : str
        Directory containing the XML Uniprot file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session
    organism_set : set of str, optional
        If not None, only organisms in this set will be loaded. Defaults to
        ['Mus musculus', 'H**o sapiens']. It is forwarded to the SwissProt
        and ID-mapping steps only.

    Returns
    -------
    nr_loaded : int
        Nr. of entries loaded (sum over the three steps). This double counts
        entries that are parsed both from SwissProt and from the ID mapping.
    '''
    from waldo.backend import call_create_session

    session = call_create_session(create_session)
    # The tuple is evaluated left to right, so the steps run in this order.
    step_counts = (
        _load_uniprot_sprot(datadir, session, organism_set),
        _load_idmapping(datadir, session, organism_set),
        _load_sec_ac(datadir, session),
    )
    return sum(step_counts)
Example #3
0
def load(datadir, create_session=None):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session})

    Loads the gene_association.mgi, MRK_ENSEMBL.rpt and MRK_Reference.rpt
    files from MGI

    Parameters
    ----------
      datadir : str
        base directory for data. When None, the module-level ``_datadir``
        is used instead.
      create_session : callable, optional
        callable that returns an sqlalchemy session

    Returns
    -------
      Nr of annotation entries loaded (the value returned for
      gene_association.mgi; the other two files do not contribute).

    References
    ----------

    For the file formats see:
    ftp://ftp.informatics.jax.org/pub/reports/index.html
    http://www.geneontology.org/GO.format.gaf-1_0.shtml
    http://wiki.geneontology.org/index.php/GAF_2.0
    '''
    from waldo.backend import call_create_session
    if datadir is None:
        datadir = _datadir
    session = call_create_session(create_session)

    def in_datadir(basename):
        # Resolve a data file name relative to the chosen data directory.
        return path.join(datadir, basename)

    nr_annotations = _load_gene_annotation(in_datadir('gene_association.mgi'), session)
    _load_mrk_ensembl(in_datadir('MRK_ENSEMBL.rpt'), session)
    _load_pubmed_ids(in_datadir('MRK_Reference.rpt'), session)
    return nr_annotations
Example #4
0
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Deletes every NogEntry row and commits the session.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns an sqlalchemy session
    '''
    from waldo.backend import call_create_session
    db = call_create_session(create_session)
    nog_entries = db.query(models.NogEntry)
    nog_entries.delete()
    db.commit()
Example #5
0
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Deletes every EnsemblSequence row and commits the session.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns an sqlalchemy session
    '''
    from waldo.backend import call_create_session
    db = call_create_session(create_session)
    sequences = db.query(models.EnsemblSequence)
    sequences.delete()
    db.commit()
Example #6
0
def load(datadir, create_session=None):
    '''
    num_entries = load(datadir={data/}, create_session={backend.create_session})

    Load the data from a subcellular location annotations file into the local
    relational database

    The annotations are read as CSV from the first member of the zip archive
    ``_annot`` inside `datadir`. The first CSV row is treated as a header and
    skipped. Each remaining row must have exactly eight columns.

    Parameters
    ----------
    datadir : str
        Base directory containing the annotations file
    create_session : callable, optional
        Callable object which returns an sqlalchemy session (default:
        waldo.backend.create_session)

    Returns
    -------
    num_entries : integer
        Number of entries loaded into the local database (0 if the file has
        no data rows)

    Raises
    ------
    ValueError
        If a data row does not have exactly eight columns.

    References
    ----------
      (none)

    '''
    import zipfile
    from waldo.backend import call_create_session
    session = call_create_session(create_session)

    count = 0
    zf = zipfile.ZipFile(path.join(datadir, _annot))
    try:
        inputf = zf.open(zf.filelist[0])
        try:
            # NOTE(review): zf.open() yields bytes on Python 3, where
            # csv.reader expects text -- confirm the target interpreter.
            csvreader = csv.reader(inputf, delimiter=',', quotechar='"')

            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(csvreader, None)

            for row in csvreader:
                # Strict unpacking on purpose: a malformed row should fail
                # loudly. Only gene and the two location columns are used.
                (gene, _gene_name, main_loc, other_loc, _expression_type,
                 _reliability, _main_loc_go, _other_loc_go) = row

                locations = main_loc.split(";")
                if other_loc != "":
                    locations += other_loc.split(";")

                for name in locations:
                    session.add(models.Location(name, gene))

                session.add(models.Entry(gene))

                session.commit()
                count += 1
        finally:
            inputf.close()
    finally:
        # Always release the archive handle, even if a row fails to load.
        zf.close()

    return count
Example #7
0
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Removes all GO slim related information: every SlimSet, SlimTerm and
    SlimMapping row is deleted (in that order) and the session is committed.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns an sqlalchemy session
    '''
    from waldo.backend import call_create_session
    db = call_create_session(create_session)
    for table in (SlimSet, SlimTerm, SlimMapping):
        db.query(table).delete()
    db.commit()
Example #8
0
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Removes all GO related information: every Term and TermRelationship row
    is deleted and the session is committed.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns an sqlalchemy session
    '''
    from waldo.backend import call_create_session
    from . import models
    db = call_create_session(create_session)
    terms = db.query(models.Term)
    terms.delete()
    relationships = db.query(models.TermRelationship)
    relationships.delete()
    db.commit()
Example #9
0
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Removes all LOCATE related information: the rows of every LOCATE model
    are deleted in the order listed below, then the session is committed.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns an sqlalchemy session
    '''
    from waldo.backend import call_create_session
    db = call_create_session(create_session)
    # Deletion order is kept exactly as listed; LocateEntry goes last.
    locate_models = (
        models.Isoform,
        models.Image,
        models.LocatePrediction,
        models.Literature,
        models.LocateAnnotation,
        models.ExternalReference,
        models.LocateEntry,
    )
    for model in locate_models:
        db.query(model).delete()
    db.commit()
Example #10
0
def load(datadir, create_session=None, mouse_only=True):
    """
    nr_loaded = load(datadir, create_session={backend.create_session}, mouse_only=True)

    Load RefSeq <-> Ensembl identifier translations into the database.

    For every input line whose Ensembl peptide column contains ``ENSMUSP``,
    four Translation rows are stored: RefSeq accession <-> Ensembl peptide id
    and RefSeq accession <-> Ensembl gene id, in both directions. The version
    suffix of the RefSeq protein accession (the part after the ``.``) is
    dropped.

    Parameters
    ----------
    datadir : str
        Directory containing the gene2ensembl.gz file
    create_session : callable, optional
        a callable that returns an sqlalchemy session
    mouse_only : bool, optional
        whether to only load mouse data
        Currently, only ``mouse_only=True`` is implemented!

    Returns
    -------
    nr_loaded : int
        Nr. of entries loaded

    Raises
    ------
    NotImplementedError
        If ``mouse_only`` is false.
    ValueError
        If a line does not have exactly seven tab-separated columns, or an
        accepted protein accession is not of the form ``accession.version``.
    """
    from waldo.backend import call_create_session

    # Reject the unsupported mode up front, before a session is created or a
    # file handle is opened (and would otherwise be leaked).
    if not mouse_only:
        raise NotImplementedError("waldo.refseq.load: Cannot load non-mouse entries")

    filename = path.join(datadir, _inputfilename)
    session = call_create_session(create_session)
    infile = _gzip_open(filename)

    nr_loaded = 0
    try:
        infile.readline()  # discard the header line
        for line in infile:
            # Strict unpacking: exactly seven columns are expected. Only the
            # gene, protein-accession and peptide columns are used.
            (_tax_id, _gene_id, ensembl_gene, _rna_accession, _ensembl_trans,
             protein_accession, ensembl_peptide) = line.strip().split("\t")
            if "ENSMUSP" not in ensembl_peptide:
                continue
            protein_accession, _version = protein_accession.split(".")
            session.add(Translation("ensembl:peptide_id", ensembl_peptide, "refseq:accession", protein_accession))
            session.add(Translation("refseq:accession", protein_accession, "ensembl:peptide_id", ensembl_peptide))
            session.add(Translation("ensembl:gene_id", ensembl_gene, "refseq:accession", protein_accession))
            session.add(Translation("refseq:accession", protein_accession, "ensembl:gene_id", ensembl_gene))
            session.commit()
            nr_loaded += 1
    finally:
        # Release the input handle whether or not loading succeeded.
        infile.close()
    return nr_loaded
Example #11
0
def load(datadir, create_session=None, species=('Mus Musculus', 'H**o Sapiens')):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session}, species=['Mus Musculus', 'H**o Sapiens'])

    Load NOG entries file into database

    Each line after the header must hold five tab-separated columns: protein
    name, start, end, group and description. Only the protein name and the
    group are used. A NogEntry is stored for every line whose protein name is
    accepted by ``_accept_species`` for at least one of `species`.

    Parameters
    ----------
    datadir : str
        Directory containing the maNOG.mapping.txt.gz file. When None, the
        module-level ``_datadir`` is used instead.
    create_session : callable, optional
        a callable object that returns an sqlalchemy session
    species : sequence
        species to load

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded

    Raises
    ------
    ValueError
        If a line does not have exactly five columns, the protein name
        contains no ``.``, or the group suffix is not an integer.
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    if datadir is None:
        datadir = _datadir
    filename = path.join(datadir, _inputfilename)
    inputfile = _gzip_open(filename)

    nr_loaded = 0
    try:
        inputfile.readline()  # discard the header line
        for line in inputfile:
            prot_name, _start, _end, group, _description = line.strip().split('\t')
            # Drop the prefix before the first dot (presumably a taxon id --
            # TODO confirm). Splitting once keeps protein ids that themselves
            # contain dots from aborting the whole load.
            _, prot_name = prot_name.split('.', 1)
            # Group labels carry a 'maNOG' prefix followed by a number.
            group = int(group[len('maNOG'):])
            if any(_accept_species(sp, prot_name) for sp in species):
                session.add(models.NogEntry(prot_name, group))
                session.commit()
                nr_loaded += 1
    finally:
        # Release the input handle whether or not loading succeeded.
        inputfile.close()
    return nr_loaded
Example #12
0
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Removes all Uniprot related information: the rows of every Uniprot model
    are deleted in the order listed below, then the session is committed.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns a session
    '''
    from waldo.backend import call_create_session
    db = call_create_session(create_session)
    # Deletion order is kept exactly as listed; UniprotEntry goes last.
    uniprot_models = (
        models.Accession,
        models.GoAnnotation,
        models.Reference,
        models.Comment,
        models.Organism,
        models.UniprotEntry,
    )
    for model in uniprot_models:
        db.query(model).delete()
    db.commit()
Example #13
0
def load(datadir, create_session=None):
    '''
    nr_entries = load(datadir, create_session={backend.create_session})

    Load Gene Ontology OBO file into database

    Every non-obsolete term yielded by ``_parse_terms`` is stored as a Term,
    together with one TermRelationship per ``is_a`` / ``part_of`` target.
    If the plain input file does not exist but a ``.gz`` sibling does, the
    compressed file is read instead.

    Parameters
    ----------
      datadir : Directory containing GO files
      create_session : a callable object that returns an sqlalchemy session
    Returns
    -------
      nr_entries : Nr of entries (non-obsolete terms) loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    filename = path.join(datadir, _inputfilename)
    if not path.exists(filename) and path.exists(filename + '.gz'):
        # Open the file that actually exists. NOTE(review): assumes
        # _gzip_open expects the real on-disk path -- confirm.
        infile = _gzip_open(filename + '.gz')
    else:
        infile = open(filename)

    loaded = 0
    try:
        for term in _parse_terms(infile):
            if term['is_obsolete']:
                continue
            term_id = term['id'][0]
            session.add(
                Term(id=term_id, name=term['name'][0], namespace=term['namespace'][0]))
            for rel in ('is_a', 'part_of'):
                for target in term[rel]:
                    # Relationships are keyed by the id of the current term.
                    session.add(TermRelationship(term_id, target, rel))
            loaded += 1
            # This check is ugly, but commit() is rather slow
            # The speed up is worth it:
            if (loaded % 512) == 0:
                session.commit()
        # Flush whatever is left over from the last partial batch.
        session.commit()
    finally:
        # Release the input handle whether or not loading succeeded.
        infile.close()
    return loaded
Example #14
0
def load(datadir, create_session=None):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session})

    Load ENSEMBL FASTA file into database

    The first file in `datadir` matching ``Mus_musculus.*.pep.all.fa.gz`` is
    read. For each record, the first header token is taken as the peptide id
    and the fourth (which must start with ``gene:``) as the gene id; a
    gene -> peptide Translation and an EnsemblSequence are stored.

    Parameters
    ----------
    datadir : str
        Directory containing the FASTA file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    pattern = path.join(datadir, 'Mus_musculus.*.pep.all.fa.gz')
    filename = glob.glob(pattern)[0]
    gene_prefix = 'gene:'
    nr_loaded = 0
    for record in fasta.read(filename):
        header_tokens = record.header.split()
        peptide = header_tokens[0]
        gene = header_tokens[3]
        assert gene.startswith(gene_prefix), 'waldo.sequences.load'
        gene = gene[len(gene_prefix):]
        translation = Translation('ensembl:gene_id', gene, 'ensembl:peptide_id', peptide)
        session.add(translation)
        session.add(models.EnsemblSequence(peptide, record.sequence))
        session.commit()
        nr_loaded += 1
    return nr_loaded
Example #15
0
def load(datadir, create_session=None):
    '''
    nr_entries, nr_slim_terms = load(datadir, create_session={backend.create_session})

    Load MGI GO SLIM file

    The first line of the file is skipped as a header. Every other line must
    hold four tab-separated columns; the first is used as the GO id and the
    third as the slim id. One SlimTerm is created per distinct slim id and
    one SlimMapping per line, all under a SlimSet named "mgi".

    Parameters
    ----------
      datadir : Directory containing GO files
      create_session : a callable object that returns an sqlalchemy session

    Returns
    -------
      nr_entries : Nr of mappings loaded
      nr_slim_terms : Nr of distinct slim terms created
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    filename = path.join(datadir, _inputfilename)

    aspects = {}  # slim id -> SlimTerm already created for it
    loaded = 0
    # The with-block closes the input file even if a line fails to load.
    with open(filename) as infile:
        infile.readline()  # header
        session.add(SlimSet("mgi"))
        for line in infile:
            go_id, _, slim_id, _ = line.strip().split('\t')
            term = aspects.get(slim_id)
            if term is None:
                term = SlimTerm(slim_id, "mgi")
                session.add(term)
                # Commit right away, presumably so that term.id is assigned
                # before it is used below -- confirm against the model.
                session.commit()
                aspects[slim_id] = term
            session.add(SlimMapping(go_id, term.id))
            session.commit()
            loaded += 1
    return loaded, len(aspects)