コード例 #1
0
def UpdateHash(con, cur, seq_id, hash_seq_full, hash_seq_150, hash_seq_100):
    '''
    Store the three hash values for one sequence row and commit.

    Parameters
    ----------
    con,cur
        database connection (used for commit) and cursor (used for the update)
    seq_id
        id of the row in annotationschematest.sequencestable to update
    hash_seq_full - hash for full
    hash_seq_150 - hash for first 150 characters
    hash_seq_100 - hash for first 100 characters

    Returns
    -------
    bool
        True if the update was executed and committed, False if anything raised
    '''
    debug(1, 'UpdateHash')

    try:
        # The values are bound by the database driver instead of being
        # formatted into the SQL text, so quotes in a value cannot break or
        # alter the statement.
        cur.execute(
            'update annotationschematest.sequencestable set hashfull=%s,hash150=%s,hash100=%s where id=%s',
            [hash_seq_full, hash_seq_150, hash_seq_100, seq_id])
        con.commit()
        return True
    except Exception as e:
        # Still report failure through the return value, but leave a trace
        # of what went wrong instead of swallowing it silently.
        debug(7, 'UpdateHash failed for seq_id %s: %s' % (seq_id, e))
        return False
コード例 #2
0
def AddSequenceTax(con, cur, seq_id, col, value):
    '''
    Set one taxonomy column of a sequence row and commit.

    Parameters
    ----------
    con,cur
        database connection (used for commit) and cursor (used for the update)
    seq_id
        id of the row in annotationschematest.sequencestable to update
    col : str
        taxonomy rank column name. Must be a plain identifier (letters,
        digits and underscores, not starting with a digit)
    value
        new value for the column

    Returns
    -------
    bool
        True if the update was executed and committed, False if the column
        name was rejected or anything raised
    '''
    import re

    debug(1, 'AddSequenceTax')

    # A column name cannot be sent as a bound parameter, so it is validated
    # before being placed into the SQL text.
    if not isinstance(col, str) or re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', col) is None:
        debug(7, 'AddSequenceTax: invalid column name %r' % (col,))
        return False

    try:
        # Only the validated column name is formatted in; value and seq_id
        # are bound by the driver (%%s becomes the %s placeholder).
        cur.execute(
            'update annotationschematest.sequencestable set %s=%%s where id=%%s' % col,
            [value, seq_id])
        con.commit()
        return True
    except Exception as e:
        debug(7, 'AddSequenceTax failed for seq_id %s: %s' % (seq_id, e))
        return False
コード例 #3
0
def GetSequenceWithNoHashID(con, cur):
    '''
    Find one sequence row that has no hash values stored.

    A row qualifies when hashfull, hash150 and hash100 are each NULL or ''.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)

    Returns
    -------
    err : str
        '' when a row was found, 'no missing hash' otherwise
    seq_id
        the id column of the row that was found, or -1 when there is none
    '''
    debug(1, 'GetSequenceWithNoHashID')

    query = "select id from annotationschematest.sequencestable where (COALESCE(hashfull,'')='' AND COALESCE(hash150,'')='' AND COALESCE(hash100,'')='') limit 1"
    cur.execute(query)

    if cur.rowcount != 0:
        row = cur.fetchone()
        return '', row[0]

    # every row already has at least one hash value
    errmsg = 'no missing hash'
    debug(1, errmsg)
    return errmsg, -1
コード例 #4
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def get_primers(con, cur):
    '''List every row of PrimersTable as a dict.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)

    Returns
    -------
    err : str
        always '' in this implementation
    primers : list of dict
        one dict per row, with keys:
        'primerid' : the id column
        'name' : the regionname column
        'fprimer' : the forwardprimer column
        'rprimer' : the reverseprimer column
    '''
    debug(1, 'get_primers')

    cur.execute('SELECT id, regionname, forwardprimer, reverseprimer FROM PrimersTable')
    primers = [
        {'primerid': row[0], 'name': row[1], 'fprimer': row[2], 'rprimer': row[3]}
        for row in cur.fetchall()
    ]
    debug(1, 'found %d primers' % len(primers))
    return '', primers
コード例 #5
0
def SequencesWholeToFile(con, cur, fileName, dbid):
    '''
    Write to a file every sequence that has no wholeseqidstable entry for dbid.

    Each selected row is written as a '>id' line followed by a line holding
    the sequence.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    fileName - output file name (overwritten)
    dbid - the dbid value compared against wholeseqidstable.dbid

    Returns
    -------
    str
        '' on success, 'database error ...' if a psycopg2.DatabaseError was raised
    '''
    debug(1, 'SequencesWholeToFile')

    try:
        # dbid is bound by the driver rather than formatted into the SQL text
        cur.execute(
            "SELECT id,sequence,ggid FROM sequencestable where id not in (select distinct dbbactid from wholeseqidstable where dbid=%s)",
            [dbid])

        with open(fileName, 'w') as fl:
            for cres in cur:
                fl.write('>%s\n%s\n' % (cres[0], cres[1]))
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ''
コード例 #6
0
def GetSequenceStrByID(con, cur, seq_id):
    '''
    Get the sequence string stored for a sequence id.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    seq_id: int
        the dbbact seqid

    Returns
    -------
    err : str
        '' if a row was found, otherwise the error message
    sequence
        the sequence column of the matching row. When no row matches, seq_id
        itself is returned in this position.
    '''
    debug(1, 'GetSequenceStrByID')

    # seq_id is bound by the driver rather than formatted into the SQL text
    cur.execute(
        "select sequence from annotationschematest.sequencestable where id=%s",
        [seq_id])
    if cur.rowcount == 0:
        errmsg = 'no sequeence for seqid %s' % seq_id
        debug(1, errmsg)
        # second element stays seq_id so existing callers see the same value
        return errmsg, seq_id
    res = cur.fetchone()
    return '', res[0]
コード例 #7
0
def get_taxonomy_seqids(con, cur, taxonomy, userid=None):
    '''Get the ids of all sequences with a taxonomy rank column matching the taxonomy string

    The lower-cased taxonomy string is used as the ILIKE pattern for each of
    the taxrootrank, taxdomain, taxphylum, taxclass, taxfamily, taxgenus and
    taxorder columns. No wildcards are added here, so a substring search only
    happens if the caller's string already contains them.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    taxonomy : str
        the taxonomy pattern to look for
    userid : int (optional)
        not used in this function

    Returns
    -------
    list
        The id column of every matching SequencesTable row
    '''
    pattern = taxonomy.lower()
    debug(1, 'GetTaxonomyAnnotationIDS for taxonomy %s' % pattern)

    query = 'SELECT id from SequencesTable where (taxrootrank ILIKE %s OR taxdomain ILIKE %s OR taxphylum ILIKE %s OR taxclass ILIKE %s OR taxfamily ILIKE %s OR taxgenus ILIKE %s OR taxorder ILIKE %s)'
    # one placeholder per rank column, all bound to the same pattern
    cur.execute(query, [pattern] * 7)

    seqids = [row[0] for row in cur.fetchall()]
    debug(1, 'found %d matching sequences for the taxonomy' % len(seqids))
    return seqids
コード例 #8
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def get_taxonomy_seqids(con, cur, taxonomy, userid=None):
    '''Get the ids of all sequences with a taxonomy rank column matching the taxonomy string

    The lower-cased taxonomy string is used as the ILIKE pattern for each of
    the taxrootrank, taxdomain, taxphylum, taxclass, taxfamily, taxgenus and
    taxorder columns. No wildcards are added here, so a substring search only
    happens if the caller's string already contains them.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    taxonomy : str
        the taxonomy pattern to look for
    userid : int (optional)
        not used in this function

    Returns
    -------
    list
        The id column of every matching SequencesTable row
    '''
    pattern = taxonomy.lower()
    debug(1, 'GetTaxonomyAnnotationIDS for taxonomy %s' % pattern)

    # the same pattern is bound to each of the seven placeholders
    bound = [pattern for _ in range(7)]
    cur.execute('SELECT id from SequencesTable where (taxrootrank ILIKE %s OR taxdomain ILIKE %s OR taxphylum ILIKE %s OR taxclass ILIKE %s OR taxfamily ILIKE %s OR taxgenus ILIKE %s OR taxorder ILIKE %s)', bound)

    seqids = [row[0] for row in cur.fetchall()]
    debug(1, 'found %d matching sequences for the taxonomy' % len(seqids))
    return seqids
コード例 #9
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def SequencesWholeToFile(con, cur, fileName, dbid):
    '''
    Write to a file every sequence that has no wholeseqidstable entry for dbid.

    Each selected row is written as a '>id' line followed by a line holding
    the sequence.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    fileName - output file name (overwritten)
    dbid - the dbid value compared against wholeseqidstable.dbid

    Returns
    -------
    str
        '' on success, 'database error ...' if a psycopg2.DatabaseError was raised
    '''
    debug(1, 'SequencesWholeToFile')

    try:
        # dbid is bound by the driver rather than formatted into the SQL text
        cur.execute("SELECT id,sequence,ggid FROM sequencestable where id not in (select distinct dbbactid from wholeseqidstable where dbid=%s)", [dbid])

        with open(fileName, 'w') as fl:
            for cres in cur:
                fl.write('>%s\n%s\n' % (cres[0], cres[1]))
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ''
コード例 #10
0
def get_primers(con, cur):
    '''List every row of PrimersTable as a dict.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)

    Returns
    -------
    err : str
        always '' in this implementation
    primers : list of dict
        one dict per row, with keys:
        'primerid' : the id column
        'name' : the regionname column
        'fprimer' : the forwardprimer column
        'rprimer' : the reverseprimer column
    '''
    debug(1, 'get_primers')

    query = 'SELECT id, regionname, forwardprimer, reverseprimer FROM PrimersTable'
    cur.execute(query)
    rows = cur.fetchall()
    primers = [
        {'primerid': row[0], 'name': row[1], 'fprimer': row[2], 'rprimer': row[3]}
        for row in rows
    ]
    debug(1, 'found %d primers' % len(primers))
    return '', primers
コード例 #11
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def AddWholeSeqId(con, cur, dbidVal, dbbactidVal, wholeseqidVal, noTest=False):
    '''
    Add a record to the wholeseqidstable table, or update an existing one.

    With noTest=True the record is always inserted. Otherwise
    WholeSeqIdExists is asked whether a record with wholeseqid 'na' exists
    for (dbidVal, dbbactidVal): if not, the record is inserted; if so, the
    wholeseqid of the records for that (dbid, dbbactid) pair is updated.

    Parameters
    ----------
    con,cur
        database connection (used for commit) and cursor
    dbidVal - db type (e.g. silva, gg)
    dbbactidVal - sequence id in dbbact
    wholeseqidVal - the id in different db (e.g. silva, gg)
    noTest : bool (optional)
        True to insert without checking for an existing record

    Returns
    -------
    str
        '' on success, otherwise the error message
    '''
    debug(1, 'AddWholeSeqId')

    insert_sql = 'INSERT INTO wholeseqidstable (dbid, dbbactid, wholeseqid) VALUES (%s, %s, %s)'
    try:
        if noTest is True:
            cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
        else:
            err, existFlag = WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, 'na')
            if err:
                # The lookup itself failed; report that error instead of
                # attempting a write based on an unreliable existFlag.
                return err
            if existFlag is False:
                cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
            else:
                cur.execute('UPDATE wholeseqidstable set wholeseqid = %s where (dbid = %s and dbbactid = %s)', [wholeseqidVal, dbidVal, dbbactidVal])
        con.commit()
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ""
コード例 #12
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, wholeseqidVal=''):
    '''
    Check whether a matching record already exists in the wholeseqidstable table

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    dbidVal - db type (e.g. silva, gg)
    dbbactidVal - sequence id in dbbact
    wholeseqidVal - the id in different db (e.g. silva, gg)
        if empty, any record for (dbidVal, dbbactidVal) whose wholeseqid is
        not 'na' counts as a match

    Returns
    -------
    err : str
        '' if the query ran, 'database error ...' if a psycopg2.DatabaseError was raised
    exists : bool
        True if at least one matching record was found, False otherwise
        (also False on error)
    '''
    debug(1, 'WholeSeqIdExists')

    try:
        if wholeseqidVal:
            cur.execute("SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid = %s ", [dbidVal, dbbactidVal, wholeseqidVal])
        else:
            cur.execute("SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid != 'na'", [dbidVal, dbbactidVal])
        return "", cur.rowcount > 0
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e, False
コード例 #13
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSequenceStrByID(con, cur, seq_id):
    '''
    Get the sequence string stored for a sequence id.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    seq_id: int
        the dbbact seqid

    Returns
    -------
    err : str
        '' if a row was found, otherwise the error message
    sequence
        the sequence column of the matching row. When no row matches, seq_id
        itself is returned in this position.
    '''
    debug(1, 'GetSequenceStrByID')

    # seq_id is bound by the driver rather than formatted into the SQL text
    cur.execute("select sequence from annotationschematest.sequencestable where id=%s", [seq_id])
    if cur.rowcount == 0:
        errmsg = 'no sequeence for seqid %s' % seq_id
        debug(1, errmsg)
        # second element stays seq_id so existing callers see the same value
        return errmsg, seq_id
    res = cur.fetchone()
    return '', res[0]
コード例 #14
0
def GetSequenceWithNoTaxonomyID(con, cur):
    '''
    Find one sequence row that has no taxonomy stored.

    A row qualifies when taxrootrank, taxdomain, taxphylum, taxclass,
    taxfamily, taxgenus and taxorder are each NULL or ''.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)

    Returns
    -------
    err : str
        '' when a row was found, 'no missing taxonomy' otherwise
    seq_id
        the id column of the row that was found, or -1 when there is none
    '''
    debug(1, 'GetSequenceWithNoTaxonomy')

    query = "select id from annotationschematest.sequencestable where (COALESCE(taxrootrank,'')='' AND COALESCE(taxdomain,'')='' AND COALESCE(taxphylum,'')='' AND COALESCE(taxclass,'')='' AND COALESCE(taxfamily,'')='' AND COALESCE(taxgenus,'')='' AND COALESCE(taxorder,'')='') limit 1"
    cur.execute(query)

    if cur.rowcount != 0:
        row = cur.fetchone()
        return '', row[0]

    # every row already has some taxonomy value
    errmsg = 'no missing taxonomy'
    debug(1, errmsg)
    return errmsg, -1
コード例 #15
0
def GetGgAnnotationIDs(con, cur, gg_str, userid=None):
    '''
    Get the annotation ids of all annotations containing a sequence linked to the given gg id

    The sequences are looked up through get_seqs_from_db_id with database
    name 'gg' and the lower-cased gg_str; sequencesAnnotationTable is then
    queried once per sequence id.

    Parameters
    ----------
    con,cur
        database connection and cursor
    gg_str : str
        the gg id to look for
    userid : int (optional)
        not used in this function (no user validation is done yet)

    Returns
    -------
    err : str
        '' if ok, otherwise the error from get_seqs_from_db_id
    annotationids : list of (annotationid, count)
        each annotation id found, with the number of rows fetched for it
        across the matching sequences
    seqids : list
        the sequence ids returned by get_seqs_from_db_id
    seqnames : list
        the third value returned by get_seqs_from_db_id
    (on error the three lists are empty)
    '''
    gg_id = gg_str.lower()
    debug(1, 'GetGgAnnotationIDs for gg %s' % gg_id)

    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'gg', gg_id)
    if err != '':
        return err, [], [], []

    # annotationid -> number of rows seen for it
    counts = defaultdict(int)
    for seqid in seqids:
        cur.execute(
            'SELECT annotationid from sequencesAnnotationTable where seqid=%s',
            [seqid])
        for row in cur.fetchall():
            counts[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the gg' % len(counts))
    return '', list(counts.items()), seqids, seqnames
コード例 #16
0
def GetSequenceTaxonomy(con, cur, sequence, region=None, userid=0):
    """
    Build the taxonomy string for a sequence stored in SequencesTable

    Parameters
    ----------
    con,cur :
        database connection and cursor (only the cursor is used here)
    sequence : str ('ACGT')
        the sequence to search for; it is lower-cased before the lookup
    region : int (optional)
        not used in this function
    userid : int (optional)
        not used in this function

    Returns
    -------
    err : str
        always '' in this implementation
    taxonomy: str
        'NA' if no row matches the sequence. Otherwise the non-empty ranks
        of the first fetched row, each with its prefix (d__, p__, c__, o__,
        f__, g__), joined by ';' (format d__XXX;p__YYYY;...)
    """
    debug(1, 'GetSequenceTaxonomy sequence %s' % sequence)

    lowered = sequence.lower()
    query = "SELECT coalesce(taxdomain,''),coalesce(taxphylum,''),  coalesce(taxclass,''),coalesce(taxorder,''),coalesce(taxfamily,''), coalesce(taxgenus,'') as taxonomy_str FROM SequencesTable WHERE sequence=%s"
    cur.execute(query, [lowered])

    if cur.rowcount == 0:
        debug(1, 'taxonomy not found for sequence %s' % lowered)
        return '', 'NA'

    row = cur.fetchone()
    # prefixes are in the same order as the selected columns
    prefixes = ["d__", "p__", "c__", "o__", "f__", "g__"]
    parts = [prefix + row[pos] for pos, prefix in enumerate(prefixes) if row[pos]]
    return '', ';'.join(parts)
コード例 #17
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetGgAnnotationIDs(con, cur, gg_str, userid=None):
    '''
    Get the annotation ids of all annotations containing a sequence linked to the given gg id

    The sequences are looked up through get_seqs_from_db_id with database
    name 'gg' and the lower-cased gg_str; sequencesAnnotationTable is then
    queried once per sequence id.

    Parameters
    ----------
    con,cur
        database connection and cursor
    gg_str : str
        the gg id to look for
    userid : int (optional)
        not used in this function (no user validation is done yet)

    Returns
    -------
    err : str
        '' if ok, otherwise the error from get_seqs_from_db_id
    annotationids : list of (annotationid, count)
        each annotation id found, with the number of rows fetched for it
        across the matching sequences
    seqids : list
        the sequence ids returned by get_seqs_from_db_id
    seqnames : list
        the third value returned by get_seqs_from_db_id
    (on error the three lists are empty)
    '''
    gg_id = gg_str.lower()
    debug(1, 'GetGgAnnotationIDs for gg %s' % gg_id)

    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'gg', gg_id)
    if err != '':
        return err, [], [], []

    # annotationid -> number of rows seen for it
    counts = defaultdict(int)
    per_seq_query = 'SELECT annotationid from sequencesAnnotationTable where seqid=%s'
    for seqid in seqids:
        cur.execute(per_seq_query, [seqid])
        for row in cur.fetchall():
            counts[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the gg' % len(counts))
    return '', list(counts.items()), seqids, seqnames
コード例 #18
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSequenceTaxonomy(con, cur, sequence, region=None, userid=0):
    """
    Build the taxonomy string for a sequence stored in SequencesTable

    Parameters
    ----------
    con,cur :
        database connection and cursor (only the cursor is used here)
    sequence : str ('ACGT')
        the sequence to search for; it is lower-cased before the lookup
    region : int (optional)
        not used in this function
    userid : int (optional)
        not used in this function

    Returns
    -------
    err : str
        always '' in this implementation
    taxonomy: str
        'NA' if no row matches the sequence. Otherwise the non-empty ranks
        of the first fetched row, each with its prefix (d__, p__, c__, o__,
        f__, g__), joined by ';' (format d__XXX;p__YYYY;...)
    """
    debug(1, 'GetSequenceTaxonomy sequence %s' % sequence)

    lowered = sequence.lower()
    cur.execute("SELECT coalesce(taxdomain,''),coalesce(taxphylum,''),  coalesce(taxclass,''),coalesce(taxorder,''),coalesce(taxfamily,''), coalesce(taxgenus,'') as taxonomy_str FROM SequencesTable WHERE sequence=%s", [lowered])

    if cur.rowcount == 0:
        debug(1, 'taxonomy not found for sequence %s' % lowered)
        return '', 'NA'

    row = cur.fetchone()
    # prefixes are in the same order as the selected columns
    prefixes = ["d__", "p__", "c__", "o__", "f__", "g__"]
    ranks = []
    for pos, prefix in enumerate(prefixes):
        if not row[pos]:
            continue
        ranks.append(prefix + row[pos])
    return '', ';'.join(ranks)
コード例 #19
0
def get_annotaiton_parents():
    '''Group the AnnotationParentsTable rows of one annotation by detail type

    NOTE(review): `cur` and `annotationid` are not parameters of this
    function; they are resolved from an outer scope at call time. Confirm
    that they are defined there, otherwise this raises NameError.

    Returns
    -------
    err : str
        '' if ok, or the error message if no rows exist for annotationid
    parents : dict of {annotationdetail: list of ontology}
        for each annotationdetail value, the ontology values of its rows in
        fetch order (empty dict on error)
    '''
    cur.execute('SELECT annotationdetail,ontology FROM AnnotationParentsTable WHERE idannotation=%s', [annotationid])
    if cur.rowcount == 0:
        errmsg = 'No Annotation Parents found for annotationid %d in AnnotationParentsTable' % annotationid
        debug(3, errmsg)
        return errmsg, {}

    parents = {}
    for detail, ontology in ((row[0], row[1]) for row in cur.fetchall()):
        # start a new list the first time a detail type is seen
        parents.setdefault(detail, []).append(ontology)
    debug(1, 'found %d detail types' % len(parents))
    return '', parents
コード例 #20
0
def GetTaxonomyAnnotationIDs(con, cur, taxonomy, userid=None):
    '''
    Get the annotation ids of all annotations containing a sequence that matches the taxonomy

    The matching sequence ids come from get_taxonomy_seqids;
    sequencesAnnotationTable is then queried once per sequence id.

    Parameters
    ----------
    con,cur
        database connection and cursor
    taxonomy : str
        the taxonomy string to look for (passed on to get_taxonomy_seqids)
    userid : int (optional)
        passed on to get_taxonomy_seqids; no user validation is done here yet

    Returns
    -------
    err : str
        always '' in this implementation
    annotationids : list of (annotationid, count)
        each annotation id found, with the number of rows fetched for it
        across the matching sequences
    seqids : list
        the sequence ids returned by get_taxonomy_seqids
    '''
    seqids = get_taxonomy_seqids(con, cur, taxonomy=taxonomy, userid=userid)

    # annotationid -> number of rows seen for it
    counts = defaultdict(int)
    for seqid in seqids:
        cur.execute(
            'SELECT annotationid from sequencesAnnotationTable where seqid=%s',
            [seqid])
        for row in cur.fetchall():
            counts[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the taxonomy' % len(counts))
    return '', list(counts.items()), seqids
コード例 #21
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetTaxonomyAnnotationIDs(con, cur, taxonomy, userid=None):
    '''
    Get the annotation ids of all annotations containing a sequence that matches the taxonomy

    The matching sequence ids come from get_taxonomy_seqids;
    sequencesAnnotationTable is then queried once per sequence id.

    Parameters
    ----------
    con,cur
        database connection and cursor
    taxonomy : str
        the taxonomy string to look for (passed on to get_taxonomy_seqids)
    userid : int (optional)
        passed on to get_taxonomy_seqids; no user validation is done here yet

    Returns
    -------
    err : str
        always '' in this implementation
    annotationids : list of (annotationid, count)
        each annotation id found, with the number of rows fetched for it
        across the matching sequences
    seqids : list
        the sequence ids returned by get_taxonomy_seqids
    '''
    seqids = get_taxonomy_seqids(con, cur, taxonomy=taxonomy, userid=userid)

    # annotationid -> number of rows seen for it
    counts = defaultdict(int)
    per_seq_query = 'SELECT annotationid from sequencesAnnotationTable where seqid=%s'
    for seqid in seqids:
        cur.execute(per_seq_query, [seqid])
        for row in cur.fetchall():
            counts[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the taxonomy' % len(counts))
    return '', list(counts.items()), seqids
コード例 #22
0
def GetSequenceIdFromGG(con, cur, ggid):
    '''
    Get the sequence ids stored for a given greengenes id (from rep. set 97%)

    Parameters
    ----------
    con,cur : database connection and cursor
    ggid : int
        The greengenes (rep_set 97%) identifier of the sequence

    Returns
    -------
    errmsg : str
        "" if ok, error msg if no row has this ggid
    sid : list
        the id column of every matching SequencesTable row (empty list if
        not found)
        Note: can be more than one as several dbbact sequences can map to same ggid
    '''
    debug(1, 'get id for ggid %d' % ggid)
    cur.execute('SELECT id FROM SequencesTable WHERE ggid=%s', [ggid])

    if cur.rowcount == 0:
        errmsg = 'ggid %s not found in database' % ggid
        debug(1, errmsg)
        return errmsg, []

    sid = [row[0] for row in cur.fetchall()]
    debug(1, 'found %d sequences for ggid %d' % (len(sid), ggid))
    return '', sid
コード例 #23
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSequenceIdFromGG(con, cur, ggid):
    '''
    Get the sequence ids stored for a given greengenes id (from rep. set 97%)

    Parameters
    ----------
    con,cur : database connection and cursor
    ggid : int
        The greengenes (rep_set 97%) identifier of the sequence

    Returns
    -------
    errmsg : str
        "" if ok, error msg if no row has this ggid
    sid : list
        the id column of every matching SequencesTable row (empty list if
        not found)
        Note: can be more than one as several dbbact sequences can map to same ggid
    '''
    debug(1, 'get id for ggid %d' % ggid)
    cur.execute('SELECT id FROM SequencesTable WHERE ggid=%s', [ggid])

    found_any = cur.rowcount != 0
    if not found_any:
        errmsg = 'ggid %s not found in database' % ggid
        debug(1, errmsg)
        return errmsg, []

    rows = cur.fetchall()
    sid = [row[0] for row in rows]
    debug(1, 'found %d sequences for ggid %d' % (len(sid), ggid))
    return '', sid
コード例 #24
0
def get_seqs_from_db_id(con, cur, db_name, db_seq_id):
    '''Get all sequences that match the db_seq_id supplied for silva/greengenes

    Parameters
    ----------
    con, cur
        database connection and cursor (only the cursor is used here)
    db_name: str
        name of the database from which the id originates. can be "silva" or "gg"
    db_seq_id: str
        the sequence identifier in the database (i.e. 'FJ978486.1.1387' for silva or '1111883' for greengenes)
        It is lower-cased before the lookup.

    Returns
    -------
    error: str or '' if ok
    list of int
        the dbbact ids for all the dbbact sequences matching the db_seq_id
    list of str
        the actual sequences for the dbbact sequences matching the db_seq_id (same order)
    (both lists are empty when db_name is not recognized)
    '''
    database_ids = {'silva': 1, 'gg': 2}
    if db_name not in database_ids:
        # Both placeholders need a value; supplying only the dict keys used
        # to raise TypeError here instead of returning the error message.
        err = 'database id %s not found. options are: %s' % (db_name, ', '.join(database_ids))
        debug(9, err)
        return err, [], []
    db_id = database_ids[db_name]

    db_seq_id = db_seq_id.lower()
    cur.execute(
        "SELECT id,sequence FROM SequencesTable where id in (select distinct dbbactid from WholeSeqIDsTable where WholeSeqID=%s AND dbid=%s)",
        [db_seq_id, db_id])
    seq_ids = []
    sequences = []
    for cres in cur.fetchall():
        seq_ids.append(cres[0])
        sequences.append(cres[1])
    debug(1,
          'found %d dbbact sequences for seqid %s' % (len(seq_ids), db_seq_id))
    return '', seq_ids, sequences
コード例 #25
0
def AddWholeSeqId(con, cur, dbidVal, dbbactidVal, wholeseqidVal, noTest=False):
    '''
    Add a record to the wholeseqidstable table, or update an existing one.

    With noTest=True the record is always inserted. Otherwise
    WholeSeqIdExists is asked whether a record with wholeseqid 'na' exists
    for (dbidVal, dbbactidVal): if not, the record is inserted; if so, the
    wholeseqid of the records for that (dbid, dbbactid) pair is updated.

    Parameters
    ----------
    con,cur
        database connection (used for commit) and cursor
    dbidVal - db type (e.g. silva, gg)
    dbbactidVal - sequence id in dbbact
    wholeseqidVal - the id in different db (e.g. silva, gg)
    noTest : bool (optional)
        True to insert without checking for an existing record

    Returns
    -------
    str
        '' on success, otherwise the error message
    '''
    debug(1, 'AddWholeSeqId')

    insert_sql = 'INSERT INTO wholeseqidstable (dbid, dbbactid, wholeseqid) VALUES (%s, %s, %s)'
    try:
        if noTest is True:
            cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
        else:
            err, existFlag = WholeSeqIdExists(con, cur, dbidVal, dbbactidVal,
                                              'na')
            if err:
                # The lookup itself failed; report that error instead of
                # attempting a write based on an unreliable existFlag.
                return err
            if existFlag is False:
                cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
            else:
                cur.execute(
                    'UPDATE wholeseqidstable set wholeseqid = %s where (dbid = %s and dbbactid = %s)',
                    [wholeseqidVal, dbidVal, dbbactidVal])
        con.commit()
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ""
コード例 #26
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSequenceWithNoTaxonomyID(con, cur):
    '''
    Find one sequence row that has no taxonomy stored.

    A row qualifies when taxrootrank, taxdomain, taxphylum, taxclass,
    taxfamily, taxgenus and taxorder are each NULL or ''.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)

    Returns
    -------
    err : str
        '' when a row was found, 'no missing taxonomy' otherwise
    seq_id
        the id column of the row that was found, or -1 when there is none
    '''
    debug(1, 'GetSequenceWithNoTaxonomy')

    cur.execute("select id from annotationschematest.sequencestable where (COALESCE(taxrootrank,'')='' AND COALESCE(taxdomain,'')='' AND COALESCE(taxphylum,'')='' AND COALESCE(taxclass,'')='' AND COALESCE(taxfamily,'')='' AND COALESCE(taxgenus,'')='' AND COALESCE(taxorder,'')='') limit 1")

    nothing_missing = cur.rowcount == 0
    if nothing_missing:
        errmsg = 'no missing taxonomy'
        debug(1, errmsg)
        return errmsg, -1

    first_row = cur.fetchone()
    return '', first_row[0]
コード例 #27
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSequenceWithNoHashID(con, cur):
    '''
    Find one sequence row that has no hash values stored.

    A row qualifies when hashfull, hash150 and hash100 are each NULL or ''.

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)

    Returns
    -------
    err : str
        '' when a row was found, 'no missing hash' otherwise
    seq_id
        the id column of the row that was found, or -1 when there is none
    '''
    debug(1, 'GetSequenceWithNoHashID')

    cur.execute("select id from annotationschematest.sequencestable where (COALESCE(hashfull,'')='' AND COALESCE(hash150,'')='' AND COALESCE(hash100,'')='') limit 1")

    nothing_missing = cur.rowcount == 0
    if nothing_missing:
        errmsg = 'no missing hash'
        debug(1, errmsg)
        return errmsg, -1

    first_row = cur.fetchone()
    return '', first_row[0]
コード例 #28
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def AddSequenceTax(con, cur, seq_id, col, value):
    '''
    Set one taxonomy column of a sequence row and commit.

    Parameters
    ----------
    con,cur
        database connection (used for commit) and cursor (used for the update)
    seq_id
        id of the row in annotationschematest.sequencestable to update
    col : str
        taxonomy rank column name. Must be a plain identifier (letters,
        digits and underscores, not starting with a digit)
    value
        new value for the column

    Returns
    -------
    bool
        True if the update was executed and committed, False if the column
        name was rejected or anything raised
    '''
    import re

    debug(1, 'AddSequenceTax')

    # A column name cannot be sent as a bound parameter, so it is validated
    # before being placed into the SQL text.
    if not isinstance(col, str) or re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', col) is None:
        debug(7, 'AddSequenceTax: invalid column name %r' % (col,))
        return False

    try:
        # Only the validated column name is formatted in; value and seq_id
        # are bound by the driver (%%s becomes the %s placeholder).
        cur.execute('update annotationschematest.sequencestable set %s=%%s where id=%%s' % col, [value, seq_id])
        con.commit()
        return True
    except Exception as e:
        debug(7, 'AddSequenceTax failed for seq_id %s: %s' % (seq_id, e))
        return False
コード例 #29
0
def WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, wholeseqidVal=''):
    '''
    Check whether a matching record already exists in the wholeseqidstable table

    Parameters
    ----------
    con,cur
        database connection and cursor (only the cursor is used here)
    dbidVal - db type (e.g. silva, gg)
    dbbactidVal - sequence id in dbbact
    wholeseqidVal - the id in different db (e.g. silva, gg)
        if empty, any record for (dbidVal, dbbactidVal) whose wholeseqid is
        not 'na' counts as a match

    Returns
    -------
    err : str
        '' if the query ran, 'database error ...' if a psycopg2.DatabaseError was raised
    exists : bool
        True if at least one matching record was found, False otherwise
        (also False on error)
    '''
    debug(1, 'WholeSeqIdExists')

    try:
        if wholeseqidVal:
            cur.execute(
                "SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid = %s ",
                [dbidVal, dbbactidVal, wholeseqidVal])
        else:
            cur.execute(
                "SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid != 'na'",
                [dbidVal, dbbactidVal])
        return "", cur.rowcount > 0
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e, False
コード例 #30
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def get_seqs_from_db_id(con, cur, db_name, db_seq_id):
    '''Get all sequences that match the db_seq_id supplied for silva/greengenes

    Parameters
    ----------
    con, cur
    db_name: str
        name of the database from which the id originates. can be "silva" or "gg"
    db_seq_id: str
        the sequence identifier in the database (i.e. 'FJ978486.1.1387' for silva or '1111883' for greengenes)
        It is lower-cased before the lookup.

    Returns
    -------
    error: str or '' if ok
    list of int
        the dbbact ids for all the dbbact sequences matching the db_seq_id
    list of str
        the actual sequences for the dbbact sequences matching the db_seq_id (same order)
    '''
    database_ids = {'silva': 1, 'gg': 2}
    if db_name not in database_ids:
        # Both placeholders need a value; the previous single-argument form
        # raised TypeError instead of returning the error message.
        err = 'database id %s not found. options are: %s' % (db_name, ', '.join(database_ids))
        debug(9, err)
        return err, [], []
    db_id = database_ids[db_name]

    db_seq_id = db_seq_id.lower()
    cur.execute("SELECT id,sequence FROM SequencesTable where id in (select distinct dbbactid from WholeSeqIDsTable where WholeSeqID=%s AND dbid=%s)", [db_seq_id, db_id])
    res = cur.fetchall()
    seq_ids = [cres[0] for cres in res]
    sequences = [cres[1] for cres in res]
    debug(1, 'found %d dbbact sequences for seqid %s' % (len(seq_ids), db_seq_id))
    return '', seq_ids, sequences
コード例 #31
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def UpdateHash(con, cur, seq_id, hash_seq_full, hash_seq_150, hash_seq_100):
    '''
    Store the hash values of a sequence record

    Parameters
    ----------
    con,cur
        database connection and cursor
    seq_id
        id of the row in annotationschematest.sequencestable to update
    hash_seq_full
        hash for the full sequence
    hash_seq_150
        hash for the first 150 characters
    hash_seq_100
        hash for the first 100 characters

    Returns
    -------
    bool
        True if the update was executed and committed, False otherwise
    '''
    debug(1, 'UpdateHash')

    try:
        # Values are bound by the driver rather than interpolated into the
        # SQL text (avoids quoting bugs and SQL injection).
        cur.execute(
            "update annotationschematest.sequencestable set hashfull=%s,hash150=%s,hash100=%s where id=%s",
            [hash_seq_full, hash_seq_150, hash_seq_100, seq_id])
        con.commit()
        return True
    except Exception as e:
        # keep the best-effort True/False contract, but leave a trace
        debug(7, 'UpdateHash failed for sequence id %s: %s' % (seq_id, e))
        return False
コード例 #32
0
def GetHashAnnotations(con, cur, hash_str, userid=None):
    '''
    Get the details of all annotations containing any sequence matching the hash

    Parameters
    ----------
    con,cur
    hash_str : str
        the hash to look for (passed on to GetHashAnnotationIDs)
    userid : int (optional)
        the userid of the querying user (passed on to GetHashAnnotationIDs)

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (the other 3 values are then None)
    annotations : list of tuples (annotation, counts)
        annotation - (see dbannotations.GetAnnotationsFromID() )
        counts - the number of matching sequences appearing in this annotation
        Annotations whose details could not be retrieved are skipped.
    seqids : list of int
        list of the sequenceids matching the hash
    seqnames : list of sequence strings
    '''
    debug(1, 'GetHashAnnotations for hash %s' % hash_str)
    # get the annotation ids
    err, annotationids, seqids, seqnames = GetHashAnnotationIDs(con, cur, hash_str, userid)
    if err:
        errmsg = 'Failed to get annotationIDs for hash_str %s: %s' % (hash_str, err)
        debug(6, errmsg)
        # same arity as the success return, so callers can always unpack 4 values
        return errmsg, None, None, None
    # and get the annotation details for each
    annotations = []
    for cid, ccount in annotationids:
        err, cdetails = dbbact.dbannotations.GetAnnotationsFromID(con, cur, cid)
        if err:
            debug(6, err)
            continue
        annotations.append((cdetails, ccount))
    debug(1, 'got %d details' % len(annotations))
    return '', annotations, seqids, seqnames
コード例 #33
0
def GetSilvaAnnotationIDs(con, cur, silva_str, userid=None):
    '''
    Get the annotation ids of all annotations containing a sequence linked to a silva id

    Parameters
    ----------
    con,cur
    silva_str : str
        the silva id to look for (resolved through get_seqs_from_db_id)
    userid : int (optional)
        the userid of the querying user (currently not used in this function)

    Returns
    -------
    err: str
        the error encountered or '' if successful
    annotationids : list of (int, int) (annotationid, count)
        for each annotation, its id and the number of the matching sequences it contains
    seqids : list of int
        list of the sequenceids that have this silvaID
    seqnames: list of str
        the sequences matching the silvaID
    '''
    debug(1, 'GetSilvaAnnotationIDs for Silva %s' % silva_str)

    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'silva', silva_str)
    if err != '':
        return err, [], [], []
    debug(1, 'found %d matching sequences for the silva' % len(seqids))

    # count, per annotation, how many of the matching sequences it contains
    per_annotation = defaultdict(int)
    for seq_id in seqids:
        cur.execute(
            'SELECT annotationid from sequencesAnnotationTable where seqid=%s',
            [seq_id])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(
        1,
        'found %d unique annotations for the Silva' % len(per_annotation))
    return '', list(per_annotation.items()), seqids, seqnames
コード例 #34
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSilvaAnnotations(con, cur, silva_str, userid=None):
    '''
    Get the details of all annotations containing any sequence matching the silvaID

    Parameters
    ----------
    con,cur
    silva_str : str
        the silva id to look for (passed on to GetSilvaAnnotationIDs)
    userid : int (optional)
        the userid of the querying user (passed on to GetSilvaAnnotationIDs)

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (the other 3 values are then None)
    annotations : list of tuples (annotation, counts)
        annotation - (see dbannotations.GetAnnotationsFromID() )
        counts - the number of matching sequences appearing in this annotation
        Annotations whose details could not be retrieved are skipped.
    seqids : list of int
        list of the sequenceids matching the silva id
    seqnames : list of sequence strings
    '''
    debug(1, 'GetSilvaAnnotations for silva ID %s' % silva_str)
    # get the annotation ids
    err, annotationids, seqids, seqnames = GetSilvaAnnotationIDs(con, cur, silva_str, userid)
    if err:
        errmsg = 'Failed to get annotationIDs for silva_str %s: %s' % (silva_str, err)
        debug(6, errmsg)
        # same arity as the success return, so callers can always unpack 4 values
        return errmsg, None, None, None
    # and get the annotation details for each
    annotations = []
    for cid, ccount in annotationids:
        err, cdetails = dbannotations.GetAnnotationsFromID(con, cur, cid)
        if err:
            debug(6, err)
            continue
        annotations.append((cdetails, ccount))
    debug(1, 'got %d details' % len(annotations))
    return '', annotations, seqids, seqnames
コード例 #35
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSilvaAnnotationIDs(con, cur, silva_str, userid=None):
    '''
    Get the annotation ids of all annotations containing a sequence linked to a silva id

    Parameters
    ----------
    con,cur
    silva_str : str
        the silva id to look for (resolved through get_seqs_from_db_id)
    userid : int (optional)
        the userid of the querying user (currently not used in this function)

    Returns
    -------
    err: str
        the error encountered or '' if successful
    annotationids : list of (int, int) (annotationid, count)
        for each annotation, its id and the number of the matching sequences it contains
    seqids : list of int
        list of the sequenceids that have this silvaID
    seqnames: list of str
        the sequences matching the silvaID
    '''
    debug(1, 'GetSilvaAnnotationIDs for Silva %s' % silva_str)

    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'silva', silva_str)
    if err != '':
        return err, [], [], []
    debug(1, 'found %d matching sequences for the silva' % len(seqids))

    # count, per annotation, how many of the matching sequences it contains
    per_annotation = defaultdict(int)
    for seq_id in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seq_id])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the Silva' % len(per_annotation))
    return '', list(per_annotation.items()), seqids, seqnames
コード例 #36
0
def GetHashAnnotationIDs(con, cur, hash_str, userid=None):
    '''
    Get the annotation ids of all annotations containing any sequence matching the hash

    Parameters
    ----------
    con,cur
    hash_str : str
        the hash to look for. It is lower-cased and compared using ILIKE
        against the hashfull, hash150 and hash100 columns (no wildcards are added here).
    userid : int (optional)
        the userid of the querying user (currently not used in this function)

    Returns
    -------
    err : str
        always '' in this function
    annotationids : list of (int, int) (annotationid, count)
        for each annotation, its id and the number of the matching sequences it contains
    seqids : list of int
        list of the sequenceids matching the hash
    seqnames : list of str
        the sequences matching the hash (same order as seqids)
    '''
    hash_str = hash_str.lower()
    debug(1, 'GetHashAnnotationIDS for Hash %s' % hash_str)
    cur.execute(
        'SELECT id,sequence from SequencesTable where (hashfull ILIKE %s or hash150 ILIKE %s or hash100 ILIKE %s)',
        [hash_str] * 3)
    rows = cur.fetchall()
    seqids = [row[0] for row in rows]
    seqnames = [row[1] for row in rows]
    debug(1, 'found %d matching sequences for the Hash' % len(seqids))

    # count, per annotation, how many of the matching sequences it contains
    per_annotation = defaultdict(int)
    for seq_id in seqids:
        cur.execute(
            'SELECT annotationid from sequencesAnnotationTable where seqid=%s',
            [seq_id])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1,
          'found %d unique annotations for the Hash' % len(per_annotation))
    return '', list(per_annotation.items()), seqids, seqnames
コード例 #37
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetHashAnnotationIDs(con, cur, hash_str, userid=None):
    '''
    Get the annotation ids of all annotations containing any sequence matching the hash

    Parameters
    ----------
    con,cur
    hash_str : str
        the hash to look for. It is lower-cased and compared using ILIKE
        against the hashfull, hash150 and hash100 columns (no wildcards are added here).
    userid : int (optional)
        the userid of the querying user (currently not used in this function)

    Returns
    -------
    err : str
        always '' in this function
    annotationids : list of (int, int) (annotationid, count)
        for each annotation, its id and the number of the matching sequences it contains
    seqids : list of int
        list of the sequenceids matching the hash
    seqnames : list of str
        the sequences matching the hash (same order as seqids)
    '''
    hash_str = hash_str.lower()
    debug(1, 'GetHashAnnotationIDS for Hash %s' % hash_str)
    cur.execute('SELECT id,sequence from SequencesTable where (hashfull ILIKE %s or hash150 ILIKE %s or hash100 ILIKE %s)', [hash_str] * 3)
    rows = cur.fetchall()
    seqids = [row[0] for row in rows]
    seqnames = [row[1] for row in rows]
    debug(1, 'found %d matching sequences for the Hash' % len(seqids))

    # count, per annotation, how many of the matching sequences it contains
    per_annotation = defaultdict(int)
    for seq_id in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seq_id])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the Hash' % len(per_annotation))
    return '', list(per_annotation.items()), seqids, seqnames
コード例 #38
0
def hash_sequences(filename, short_len=100):
    '''Index all the sequences in a fasta file by full sequence and by prefix

    Parameters
    ----------
    filename: str
        the fasta file
    short_len: int (optional)
        length of the prefix used as key in short_hash. Sequences shorter
        than this are counted as too short and are not indexed.

    Returns
    -------
    all_ids: set
        the headers of all sequences in the file (including the too-short ones)
    seq_hash: dict of {seq: seqid}
    seq_lens : set of int
        all the lengths of the indexed sequences
    short_hash: dict of {short_seq: {seq: seqid}}
        indexed sequences grouped by their first short_len characters
    '''
    too_short = 0
    all_ids = set()
    seq_hash = {}
    seq_lens = set()
    short_hash = defaultdict(dict)

    for cseq, chead in iter_fasta_seqs(filename):
        all_ids.add(chead)
        if len(cseq) < short_len:
            too_short += 1
            continue
        short_hash[cseq[:short_len]][cseq] = chead
        seq_lens.add(len(cseq))
        seq_hash[cseq] = chead

    debug(2, 'processed %d sequences.' % len(seq_hash))
    debug(2, 'lens: %s' % seq_lens)
    debug(2, 'num too short: %d' % too_short)
    return all_ids, seq_hash, seq_lens, short_hash
コード例 #39
0
ファイル: Update_Gg.py プロジェクト: amnona/supercooldb
def hash_sequences(filename, short_len=100):
	'''Index all the sequences in a fasta file by full sequence and by prefix

	Parameters
	----------
	filename: str
		the fasta file
	short_len: int (optional)
		length of the prefix used as key in short_hash. Sequences shorter
		than this are counted as too short and are not indexed.

	Returns
	-------
	all_ids: set
		the headers of all sequences in the file (including the too-short ones)
	seq_hash: dict of {seq: seqid}
	seq_lens : set of int
		all the lengths of the indexed sequences
	short_hash: dict of {short_seq: {seq: seqid}}
		indexed sequences grouped by their first short_len characters
	'''
	too_short = 0
	all_ids = set()
	seq_hash = {}
	seq_lens = set()
	short_hash = defaultdict(dict)

	for cseq, chead in iter_fasta_seqs(filename):
		all_ids.add(chead)
		if len(cseq) < short_len:
			too_short += 1
			continue
		short_hash[cseq[:short_len]][cseq] = chead
		seq_lens.add(len(cseq))
		seq_hash[cseq] = chead

	debug(2,'processed %d sequences.' % len(seq_hash))
	debug(2,'lens: %s' % seq_lens)
	debug(2,'num too short: %d' % too_short)
	return all_ids, seq_hash, seq_lens, short_hash
コード例 #40
0
ファイル: Update_main.py プロジェクト: amnona/supercooldb
 		#Update silva
        summary_str += "Silva script started at :  " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"
        main_func_silva()
        summary_str += "Silva script ended at :  " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"

 		#Update tax
        summary_str += "Tax script started at :  " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"
        main_func_tax()
        summary_str += "Tax script ended at :  " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"        

 		#Update hash for sequence
        summary_str += "Seq hash script started at :  " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"
        main_func_seq_hash()
        summary_str += "Seq hash script ended at :  " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"        

        summary_str += "Sleep sleep at: " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"

        maint_log += summary_str
 		##summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (count_seq_failure,count_seq_success,count_seq_exist,count_seq_is_exist_failure,count_seq_dummy_failure,count_seq_dummy_success,count_seq_dummy_exist,count_seq_dummy_exist)
        saveStringToFile("maint_summary_log_" + date_time_str,summary_str)
        saveStringToFile("maint_log_" + date_time_str,maint_log)
        
        #Sleep until the next time
        debug(2, "go to sleep")
        time.sleep(sleep_time)


saveStringToFile("maint_summary_log_" + date_time_str,summary_str)
saveStringToFile("maint_log_" + date_time_str,maint_log)
コード例 #41
0
def main_func_seq_hash():
    '''
    Fill in the missing hash values of sequences in the database

    Repeatedly fetches one sequence that has no hash values, computes the
    md5 of the upper-cased full sequence and of its first 150 / 100
    characters ('na' when the sequence is too short for that hash), and
    stores them with dbsequences.UpdateHash.

    The loop ends when:
    - a file named "stop_seq_hash" exists, or
    - no sequence without hash is left (the function returns), or
    - the sequence string could not be read, or
    - an update failed.

    Progress is written with saveStringToFile to "hash_log_<start time>" and
    "hash_summary_log_<start time>".
    '''
    SetDebugLevel(0)
    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")

    # connect to the db
    con, cur = db_access.connect_db()

    debug(2, 'Started')
    if 'OPENU_FLAG' in os.environ:
        debug(2, 'Openu')
    else:
        debug(2, 'normal')

    count_seq_success = 0
    count_seq_failure = 0
    hash_log = ""

    while not isFileExist("stop_seq_hash"):

        err, seq_id = dbsequences.GetSequenceWithNoHashID(con, cur)
        if err or seq_id == -1:
            # nothing left to hash
            debug(2, "go to sleep")
            hash_log += "sleep start " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n"
            saveStringToFile("hash_summary_log_sleep_" + date_time_str, "sleep started " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S"))
            return  # instead of sleeping here, one master file runs all scripts

        hash_log += "sequence id = " + str(seq_id) + "\n"

        err, seq_str = dbsequences.GetSequenceStrByID(con, cur, seq_id)
        if err:
            # This used to append to an undefined 'tax_log', which raised
            # UnboundLocalError instead of stopping cleanly.
            hash_log += "Fatal Error, could not find sequence " + "\n"
            # persist the log so the fatal error is not lost
            saveStringToFile("hash_log_" + date_time_str, hash_log)
            break

        hash_seq_full = 'na'
        hash_seq_150 = 'na'
        hash_seq_100 = 'na'

        seq_str = seq_str.upper()

        if len(seq_str) > 0:
            hash_seq_full = hashlib.md5(seq_str.encode('utf-8')).hexdigest()

        if len(seq_str) >= 150:
            hash_seq_150 = hashlib.md5(seq_str[:150].encode('utf-8')).hexdigest()

        if len(seq_str) >= 100:
            hash_seq_100 = hashlib.md5(seq_str[:100].encode('utf-8')).hexdigest()

        hash_log += "id: " + str(seq_id) + "\n"
        hash_log += "hash: " + str(hash_seq_full) + "\n"
        hash_log += "hash 150: " + str(hash_seq_150) + "\n"
        hash_log += "hash 100: " + str(hash_seq_100) + "\n"

        updated = dbsequences.UpdateHash(con, cur, seq_id, hash_seq_full, hash_seq_150, hash_seq_100)
        if updated:
            hash_log += " SUCCESS" + "\n"
            count_seq_success += 1
        else:
            hash_log += " FAILED" + "\n"
            count_seq_failure += 1

        summary_str = "count_seq_success = %s\ncount_seq_failure = %s\n" % (count_seq_success, count_seq_failure)

        saveStringToFile("hash_summary_log_" + date_time_str, summary_str)
        saveStringToFile("hash_log_" + date_time_str, hash_log)
        debug(2, 'found sequence %s' % seq_str)
        debug(2, 'return %s,%s,%s' % (hash_seq_full, hash_seq_150, hash_seq_100))

        # stop the script in case of error
        if not updated:
            break
コード例 #42
0
        ).strftime("%Y-%m-%d--%H:%M:%S") + "\n"

        #Update tax
        summary_str += "Tax script started at :  " + datetime.datetime.now(
        ).strftime("%Y-%m-%d--%H:%M:%S") + "\n"
        main_func_tax()
        summary_str += "Tax script ended at :  " + datetime.datetime.now(
        ).strftime("%Y-%m-%d--%H:%M:%S") + "\n"

        #Update hash for sequence
        summary_str += "Seq hash script started at :  " + datetime.datetime.now(
        ).strftime("%Y-%m-%d--%H:%M:%S") + "\n"
        main_func_seq_hash()
        summary_str += "Seq hash script ended at :  " + datetime.datetime.now(
        ).strftime("%Y-%m-%d--%H:%M:%S") + "\n"

        summary_str += "Sleep sleep at: " + datetime.datetime.now().strftime(
            "%Y-%m-%d--%H:%M:%S") + "\n"

        maint_log += summary_str
        ##summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (count_seq_failure,count_seq_success,count_seq_exist,count_seq_is_exist_failure,count_seq_dummy_failure,count_seq_dummy_success,count_seq_dummy_exist,count_seq_dummy_exist)
        saveStringToFile("maint_summary_log_" + date_time_str, summary_str)
        saveStringToFile("maint_log_" + date_time_str, maint_log)

        #Sleep until the next time
        debug(2, "go to sleep")
        time.sleep(sleep_time)

saveStringToFile("maint_summary_log_" + date_time_str, summary_str)
saveStringToFile("maint_log_" + date_time_str, maint_log)
コード例 #43
0
def AddSequences(con, cur, sequences, taxonomies=None, ggids=None,
                 primer='V4', commit=True):
    """
    Add sequence entries to the database if they do not exist yet

    input:
    con,cur : database connection and cursor
    sequences: list of str
        the sequences to add
    taxonomies: list of str (optional)
        taxonomy of each sequence (stored lower-cased) or None to store 'na'
    ggids: list of int (optional)
        list of GreenGenes id for each sequence or None to store 0
    primer: str (optional)
        Name of the primer (looked up with dbbact.primers.GetIdFromName). default is V4
    commit : bool (optional)
        True (default) to commit, False to wait with the commit

    output:
    errmsg : str
        "" if ok, error msg if error encountered
    seqids : list of int or None
        the id of each sequence (existing or newly added), or None if an error was encountered
    """
    # get the primer region id
    idprimer = dbbact.primers.GetIdFromName(con, cur, primer)
    if idprimer < 0:
        debug(2, 'primer %s not found' % primer)
        return "primer %s not found" % primer, None
    debug(1, 'primerid %s' % idprimer)

    seqids = []
    numadded = 0
    try:
        for pos, seq in enumerate(sequences):
            if len(seq) < SEED_SEQ_LEN:
                errmsg = 'sequence too short (<%d) for sequence %s' % (
                    SEED_SEQ_LEN, seq)
                debug(4, errmsg)
                return errmsg, None

            # look for an exact (same length) match already in the database
            err, found = GetSequenceId(con, cur, sequence=seq,
                                       idprimer=idprimer,
                                       no_shorter=True, no_longer=True)
            if len(found) == 0:
                # not found, so need to add this sequence
                tax = 'na' if taxonomies is None else taxonomies[pos].lower()
                ggid = 0 if ggids is None else ggids[pos]
                seq = seq.lower()
                seed = seq[:SEED_SEQ_LEN]
                cur.execute(
                    'INSERT INTO SequencesTable (idPrimer,sequence,length,taxonomy,ggid,seedsequence) VALUES (%s,%s,%s,%s,%s,%s) RETURNING id',
                    [idprimer, seq, len(seq), tax, ggid, seed])
                found = cur.fetchone()
                numadded += 1
            if len(found) > 1:
                debug(
                    8,
                    'AddSequences - Same sequence appears twice in database: %s'
                    % seq)
            seqids.append(found[0])

        if commit:
            con.commit()
        debug(3, "Added %d sequences (out of %d)" % (numadded, len(sequences)))
        return "", seqids
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e, None
コード例 #44
0
def main_func_silva():
    '''
    Link dbbact sequences to the SILVA sequences that contain them

    Each round:
    1. dbsequences.SequencesWholeToFile writes sequences to a temporary
       fasta file, which is indexed with hash_sequences (prefix length 150).
       If the file holds no sequences the function returns.
    2. A 'na' placeholder record is added for every sequence id in the file.
    3. Every SILVA sequence is scanned with a sliding window of 150
       characters; when a window matches an indexed prefix and the whole
       dbbact sequence is contained in the SILVA sequence, the link is
       added with dbsequences.AddWholeSeqId unless it already exists.
    4. The ids are re-checked and a 'na' record is added when none exists.

    The loop stops when a file named "stop_silva" exists. Counters and a
    log are written with saveStringToFile to "silva_summary_log_<start time>"
    and "silva_log_<start time>".
    '''
    SetDebugLevel(0)
    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")

    # connect to the db
    con, cur = db_access.connect_db()

    debug(2, 'Started')
    if 'OPENU_FLAG' in os.environ:
        debug(2, 'Openu')
    else:
        debug(2, 'normal')

    count_seq_success = 0
    count_seq_failure = 0
    count_seq_is_exist_failure = 0
    count_seq_exist = 0

    count_seq_is_exist_dummy_failure = 0
    count_seq_dummy_exist = 0
    count_seq_dummy_failure = 0
    count_seq_dummy_success = 0

    short_len = 150
    seqdbid = 1  # SILVA
    silva_log = ""

    tempFileName = 'tempSilvaScript.fasta'
    silvaFileName = 'SILVA_132_SSURef_tax_silva.fasta'

    def build_summary():
        # The last value used to repeat count_seq_dummy_exist, so the
        # "is exist dummy error" counter was never reported.
        return "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (
            count_seq_failure, count_seq_success, count_seq_exist,
            count_seq_is_exist_failure, count_seq_dummy_failure,
            count_seq_dummy_success, count_seq_dummy_exist,
            count_seq_is_exist_dummy_failure)

    while not isFileExist("stop_silva"):

        # Create the file and read it
        dbsequences.SequencesWholeToFile(con, cur, tempFileName, seqdbid)
        all_ids, seq_hash, seq_lens, short_hash = hash_sequences(
            filename=tempFileName, short_len=short_len)

        # nothing to do, go to sleep
        if len(all_ids) == 0:
            debug(2, "go to sleep")
            silva_log += "sleep start " + datetime.datetime.now().strftime(
                "%Y-%m-%d--%H:%M:%S") + "\n"
            saveStringToFile(
                "silva_summary_log_sleep_" + date_time_str, "sleep started " +
                datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S"))
            return  # instead of sleeping here, one master file runs all scripts

        # add a 'na' placeholder for every sequence in the file
        for seq_id in all_ids:
            err = dbsequences.AddWholeSeqId(con, cur, seqdbid, seq_id, 'na',
                                            noTest=True)
            if err:
                debug(2, "failed to add dummy")
                silva_log += "failed to add\n"
                count_seq_dummy_failure += 1
            else:
                debug(2, "add dummy")
                silva_log += "added\n"
                count_seq_dummy_success += 1

        idx = 0
        for cseq, chead in iter_fasta_seqs(silvaFileName):
            idx += 1
            if idx % 1000 == 0:
                # periodic progress dump
                debug(2, "count: %d" % idx)
                saveStringToFile("silva_summary_log_" + date_time_str,
                                 build_summary())
                saveStringToFile("silva_log_" + date_time_str, silva_log)

            # "+ 1" so the last full-length window is also examined
            for cpos in range(len(cseq) - short_len + 1):
                ccseq = cseq[cpos:cpos + short_len]
                if ccseq not in short_hash:
                    continue

                for k, v in short_hash[ccseq].items():
                    if k not in cseq:
                        continue
                    cid = chead.split(' ')[0]

                    # remove the tail (last two dot-separated parts) from the id
                    split_cid = cid.split('.')
                    if len(split_cid) > 2:
                        cid = ".".join(split_cid[:-2])
                    else:
                        cid = ".".join(split_cid)
                    cid = cid.lower()

                    silva_log += "rec found: seq id %s , db bact id %s, id %s\n" % (
                        seqdbid, v, cid)

                    # check if already exist
                    err, existFlag = dbsequences.WholeSeqIdExists(
                        con, cur, seqdbid, v, cid)
                    if err:
                        count_seq_is_exist_failure += 1
                        silva_log += "failed to found\n"
                    if existFlag:
                        count_seq_exist += 1
                        silva_log += "found\n"
                    else:
                        debug(2, "add normal")
                        # NOTE(review): the id is stored without dots, but the
                        # existence check above used the dotted form - confirm
                        cid = cid.replace('.', '')
                        cid = cid.lower()
                        err = dbsequences.AddWholeSeqId(
                            con, cur, seqdbid, v, cid)
                        if err:
                            silva_log += "failed to add\n"
                            count_seq_failure += 1
                        else:
                            silva_log += "added\n"
                            count_seq_success += 1
                    # only the first contained dbbact sequence of this window
                    # is handled (every branch of the original code broke here)
                    break

        # go over all ids, if not exist add record
        # NOTE(review): as in the original code, the loop stops after the
        # first id in every case - confirm whether all ids should be visited
        for seq_id in all_ids:
            err, existFlag = dbsequences.WholeSeqIdExists(
                con, cur, seqdbid, seq_id)
            if err:
                count_seq_is_exist_dummy_failure += 1
                silva_log += "failed to found\n"
            if existFlag:
                count_seq_dummy_exist += 1
                silva_log += "found\n"
            else:
                debug(2, "add dummy")
                err = dbsequences.AddWholeSeqId(con, cur, seqdbid, seq_id,
                                                'na')
                if err:
                    silva_log += "failed to add\n"
                    count_seq_dummy_failure += 1
                else:
                    silva_log += "added\n"
                    count_seq_dummy_success += 1
            break

        debug(2, 'done')

        saveStringToFile("silva_summary_log_" + date_time_str, build_summary())
        saveStringToFile("silva_log_" + date_time_str, silva_log)
コード例 #45
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def AddSequences(con, cur, sequences, taxonomies=None, ggids=None, primer='V4', commit=True):
    """
    Store the given sequences in SequencesTable, skipping those already present.

    For every sequence an exact-length lookup is done with GetSequenceId
    (no_shorter=True, no_longer=True) restricted to the requested primer.
    Sequences that are not found are inserted (lower-cased) together with
    their seed sequence (the first SEED_SEQ_LEN characters).

    input:
    con,cur : database connection and cursor
    sequences: list of str
        the sequences to add
    taxonomies: list of str (optional)
        taxonomy of each sequence (stored lower-cased), or None to store 'na'
    ggids: list of int (optional)
        GreenGenes id for each sequence, or None to store 0
    primer: str (optional)
        Name of the primer (looked up via dbbact.primers.GetIdFromName). default is V4
    commit : bool (optional)
        True (default) to commit after all sequences were handled,
        False to leave the commit to the caller

    output:
    errmsg : str
        "" if ok, error msg if error encountered
    seqids : list of int or None
        one id per input sequence (existing or newly inserted),
        or None if an error was encountered
    """
    primer_id = dbbact.primers.GetIdFromName(con, cur, primer)
    if primer_id < 0:
        primer_err = 'primer %s not found' % primer
        debug(2, primer_err)
        return primer_err, None
    debug(1, 'primerid %s' % primer_id)

    collected_ids = []
    inserted = 0
    try:
        for pos, seq in enumerate(sequences):
            # a sequence shorter than the seed cannot be stored
            if len(seq) < SEED_SEQ_LEN:
                short_err = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN, seq)
                debug(4, short_err)
                return short_err, None

            # only an exact-length match for this primer counts as "already present"
            _, found = GetSequenceId(con, cur, sequence=seq, idprimer=primer_id, no_shorter=True, no_longer=True)
            if len(found) == 0:
                tax = 'na' if taxonomies is None else taxonomies[pos].lower()
                ggid = 0 if ggids is None else ggids[pos]
                seq = seq.lower()
                row_values = [primer_id, seq, len(seq), tax, ggid, seq[:SEED_SEQ_LEN]]
                cur.execute('INSERT INTO SequencesTable (idPrimer,sequence,length,taxonomy,ggid,seedsequence) VALUES (%s,%s,%s,%s,%s,%s) RETURNING id', row_values)
                # the RETURNING row holds the new id in its first column
                found = cur.fetchone()
                inserted += 1
            if len(found) > 1:
                debug(8, 'AddSequences - Same sequence appears twice in database: %s' % seq)
            collected_ids.append(found[0])

        if commit:
            con.commit()
        debug(3, "Added %d sequences (out of %d)" % (inserted, len(sequences)))
        return "", collected_ids
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e, None
コード例 #46
0
def GetSequenceId(con,
                  cur,
                  sequence,
                  idprimer=None,
                  no_shorter=False,
                  no_longer=False):
    """
    Get sequence ids for a sequence

    Candidates are fetched by their seed (the first SEED_SEQ_LEN characters
    of the lower-cased query) and then compared to the query over the
    common prefix.

    input:
    con,cur : database connection and cursor
    sequence : str (ACGT sequences)
        if made only of digits it is treated as a greengenes id and the
        result of GetSequenceIdFromGG is returned instead
    idprimer : int (optional)
        if supplied, keep only matches whose idPrimer equals this value
    no_shorter : bool (optional)
        False (default) to also accept database sequences shorter than the
        query, True to skip them
    no_longer : bool (optional)
        False (default) to also accept database sequences longer than the
        query, True to skip them

    output:
    errmsg : str
        "" if ok, error msg if error encountered (including "not found")
    sid : list of int
        the ids of the matching sequences (empty list if not found)
        Note: can be more than one as we also look for short subsequences / long supersequences
    """
    # check if the sequence is made only of digits assume it is a greengenes id
    if sequence.isdigit():
        debug(1, 'getting id for ggid %s' % sequence)
        return GetSequenceIdFromGG(con, cur, int(sequence))

    sid = []
    cseq = sequence.lower()
    cseqlen = len(cseq)
    if cseqlen < SEED_SEQ_LEN:
        errmsg = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN,
                                                               cseq)
        debug(4, errmsg)
        return errmsg, sid

    # look for all sequences matching the seed
    cseedseq = cseq[:SEED_SEQ_LEN]
    cur.execute('SELECT id,sequence FROM SequencesTable WHERE seedsequence=%s',
                [cseedseq])
    if cur.rowcount == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid

    # materialize the candidates first: the cursor is reused below
    candidates = cur.fetchall()
    for cres in candidates:
        resid = cres[0]
        resseq = cres[1]
        if no_shorter and len(resseq) < cseqlen:
            continue
        if no_longer and len(resseq) > cseqlen:
            continue
        # compare over the part both sequences have
        comparelen = cseqlen if no_shorter else min(len(resseq), cseqlen)
        if cseq[:comparelen] != resseq[:comparelen]:
            continue

        if idprimer is None:
            # no primer restriction: accept the match once, without the
            # extra primer lookup (which could also add the id a second time)
            sid.append(resid)
            continue

        cur.execute(
            'SELECT idPrimer FROM SequencesTable WHERE id=%s LIMIT 1',
            [resid])
        primer_row = cur.fetchone()
        if primer_row is not None and primer_row[0] == idprimer:
            sid.append(resid)

    if len(sid) == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid
    return '', sid
コード例 #47
0
ファイル: Update_Gg.py プロジェクト: amnona/supercooldb
def main_func_gg():
    """
    Link database sequences to GreenGenes ids (sequence database id 2).

    Repeats the following until a file named "stop_gg" exists or there is
    nothing left to process:
      1. dump the pending sequences to a temporary fasta file
         (dbsequences.SequencesWholeToFile) and hash them (hash_sequences);
      2. if no ids came back, write a "sleep" marker file and return;
      3. add a dummy ('na') whole-sequence record for every pending id;
      4. scan every GreenGenes fasta record with a sliding window of
         short_len characters; for each window present in short_hash, every
         stored sequence contained in the record gets a whole-sequence
         record unless one already exists;
      5. write the summary and the detailed log to files.

    Returns
    -------
    None
    """
    SetDebugLevel(0)
    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")

    # connect to the db
    con, cur = db_access.connect_db()

    debug(2, 'Started')
    if 'OPENU_FLAG' in os.environ:
        debug(2, 'Openu')
    else:
        debug(2, 'normal')

    count_seq_success = 0
    count_seq_failure = 0
    count_seq_is_exist_failure = 0
    count_seq_exist = 0

    count_seq_is_exist_dummy_failure = 0
    count_seq_dummy_exist = 0
    count_seq_dummy_failure = 0
    count_seq_dummy_success = 0

    short_len = 150
    seqdbid = 2  # GG
    gg_log = ""

    tempFileName = 'tempGgScript.fasta'
    ggFileName = 'gg_13_5.fasta'

    def save_logs():
        # Write the current counters and the detailed log to their files.
        # The last field reports the dummy "is exist" errors (it used to
        # repeat the dummy "is exist" count).
        summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (
            count_seq_failure, count_seq_success, count_seq_exist,
            count_seq_is_exist_failure, count_seq_dummy_failure,
            count_seq_dummy_success, count_seq_dummy_exist,
            count_seq_is_exist_dummy_failure)
        saveStringToFile("gg_summary_log_" + date_time_str, summary_str)
        saveStringToFile("gg_log_" + date_time_str, gg_log)

    while isFileExist("stop_gg") == False:

        # Create the file and read it
        dbsequences.SequencesWholeToFile(con, cur, tempFileName, seqdbid)
        all_ids, seq_hash, seq_lens, short_hash = hash_sequences(filename=tempFileName, short_len=short_len)

        # nothing to do
        if len(all_ids) == 0:
            debug(2, "go to sleep")
            sleep_stamp = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")
            gg_log += "sleep start " + sleep_stamp + "\n"
            saveStringToFile("gg_summary_log_sleep_" + date_time_str, "sleep started " + sleep_stamp)
            # instead of sleeping, one master file runs all scripts
            return

        # register a dummy record for every pending id
        for seq_id in all_ids:
            err = dbsequences.AddWholeSeqId(con, cur, seqdbid, seq_id, 'na', noTest=True)
            if err:
                debug(2, "failed to add dummy")
                gg_log += "failed to add\n"
                count_seq_dummy_failure += 1
            else:
                debug(2, "add dummy")
                gg_log += "added\n"
                count_seq_dummy_success += 1

        for idx, (cseq, chead) in enumerate(iter_fasta_seqs(ggFileName), 1):
            # periodic progress report
            if idx % 1000 == 0:
                debug(2, "count: %d" % idx)
                save_logs()

            # NOTE(review): this range never visits the final window starting
            # at len(cseq) - short_len; confirm whether that is intended.
            for cpos in range(len(cseq) - short_len):
                ccseq = cseq[cpos:cpos + short_len]
                if ccseq not in short_hash:
                    continue

                for k, v in short_hash[ccseq].items():
                    if k not in cseq:
                        continue
                    cid = chead.split(' ')[0]

                    gg_log += "rec found: seq id %s , db bact id %s, id %s\n" % (seqdbid, v, cid)

                    # check if already exist
                    err, existFlag = dbsequences.WholeSeqIdExists(con, cur, seqdbid, v, cid)
                    if err:
                        count_seq_is_exist_failure += 1
                        gg_log += "failed to found\n"
                    if existFlag:
                        count_seq_exist += 1
                        gg_log += "found\n"
                        break

                    debug(2, "add normal")
                    err = dbsequences.AddWholeSeqId(con, cur, seqdbid, v, cid)
                    if err:
                        gg_log += "failed to add\n"
                        count_seq_failure += 1
                    else:
                        gg_log += "added\n"
                        count_seq_success += 1
                    # only the first contained sequence of this window is handled
                    break

        # go over all ids, if not exist add record
        # NOTE(review): every branch below ends with break, so only the first
        # id is ever examined; kept as in the original - confirm the intent.
        for seq_id in all_ids:
            err, existFlag = dbsequences.WholeSeqIdExists(con, cur, seqdbid, seq_id)
            if err:
                count_seq_is_exist_dummy_failure += 1
                gg_log += "failed to found\n"
            if existFlag:
                count_seq_dummy_exist += 1
                gg_log += "found\n"
                break

            debug(2, "add dummy")
            err = dbsequences.AddWholeSeqId(con, cur, seqdbid, seq_id, 'na')
            if err:
                gg_log += "failed to add\n"
                count_seq_dummy_failure += 1
            else:
                gg_log += "added\n"
                count_seq_dummy_success += 1
            break

        debug(2, 'done')

        save_logs()
コード例 #48
0
def connect_db(servertype='main', schema='AnnotationSchemaTest'):
    """
    connect to the postgres database and return the connection and cursor
    input:
    servertype : str (optional)
        the database to access. options are:
            'main' (default) - the main remote production database
            'develop' - the remote development database
            'local' - a local postgres instance of the database
            'amnon' - the local mac installed version of dbbact
            'openu' - the openu installation (connects without a host argument)
        any other value is reported and the default settings
        (database 'scdb' on port 5432) are used
    schema : str (optional)
        name of the schema containing the annotation database.
        It is interpolated directly into the SET search_path statement,
        so it must come from a trusted source.

    output:
    con : the database connection
    cur : the database cursor (a psycopg2 DictCursor)

    raises:
    SystemError if psycopg2 reports a DatabaseError while connecting or
    while selecting the schema
    """
    debug(1, 'connecting to database')

    # settings that differ between the known server types
    known_servers = {
        'main': {'database': 'scdb', 'user': '******', 'password': '******', 'port': 29546},
        'develop': {'database': 'scdb_develop', 'user': '******', 'password': '******', 'port': 29546},
        'local': {'database': 'postgres', 'user': '******', 'password': '******', 'port': 5432},
        'amnon': {'database': 'dbbact', 'user': '******', 'password': '******', 'port': 5432},
        'openu': {'database': 'scdb', 'user': '******', 'password': '******', 'port': 5432},
    }
    # defaults, used as-is for an unknown server type
    settings = {'database': 'scdb', 'user': '******', 'password': '******', 'port': 5432}
    host = 'localhost'

    if servertype in known_servers:
        debug(1, 'servertype is %s' % servertype)
        settings.update(known_servers[servertype])
    else:
        debug(6, 'unknown server type %s' % servertype)
        print('unknown server type %s' % servertype)

    database = settings['database']
    user = settings['user']
    password = settings['password']
    port = settings['port']

    con = None
    try:
        if servertype == 'openu':
            debug(1, 'connecting database=%s, user=%s, port=%d' % (database, user, port))
            con = psycopg2.connect(database=database, user=user, password=password, port=port)
        else:
            debug(1, 'connecting host=%s, database=%s, user=%s, port=%d' % (host, database, user, port))
            con = psycopg2.connect(host=host, database=database, user=user, password=password, port=port)
        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute('SET search_path to %s' % schema)
        debug(1, 'connected to database')
        return (con, cur)
    except psycopg2.DatabaseError as e:
        # do not leak a connection that was opened but could not be set up
        if con is not None:
            con.close()
        print('Cannot connect to database. Error %s' % e)
        raise SystemError('Cannot connect to database. Error %s' % e)
コード例 #49
0
ファイル: dbsequences.py プロジェクト: amnona/supercooldb
def GetSequenceId(con, cur, sequence, idprimer=None, no_shorter=False, no_longer=False):
    """
    Get sequence ids for a sequence

    Candidates are fetched by their seed (the first SEED_SEQ_LEN characters
    of the lower-cased query) and then compared to the query over the
    common prefix.

    input:
    con,cur : database connection and cursor
    sequence : str (ACGT sequences)
        if made only of digits it is treated as a greengenes id and the
        result of GetSequenceIdFromGG is returned instead
    idprimer : int (optional)
        if supplied, keep only matches whose idPrimer equals this value
    no_shorter : bool (optional)
        False (default) to also accept database sequences shorter than the
        query, True to skip them
    no_longer : bool (optional)
        False (default) to also accept database sequences longer than the
        query, True to skip them

    output:
    errmsg : str
        "" if ok, error msg if error encountered (including "not found")
    sid : list of int
        the ids of the matching sequences (empty list if not found)
        Note: can be more than one as we also look for short subsequences / long supersequences
    """
    # check if the sequence is made only of digits assume it is a greengenes id
    if sequence.isdigit():
        debug(1, 'getting id for ggid %s' % sequence)
        return GetSequenceIdFromGG(con, cur, int(sequence))

    sid = []
    cseq = sequence.lower()
    cseqlen = len(cseq)
    if cseqlen < SEED_SEQ_LEN:
        errmsg = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN, cseq)
        debug(4, errmsg)
        return errmsg, sid

    # look for all sequences matching the seed
    cseedseq = cseq[:SEED_SEQ_LEN]
    cur.execute('SELECT id,sequence FROM SequencesTable WHERE seedsequence=%s', [cseedseq])
    if cur.rowcount == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid

    # materialize the candidates first: the cursor is reused below
    candidates = cur.fetchall()
    for cres in candidates:
        resid = cres[0]
        resseq = cres[1]
        if no_shorter and len(resseq) < cseqlen:
            continue
        if no_longer and len(resseq) > cseqlen:
            continue
        # compare over the part both sequences have
        comparelen = cseqlen if no_shorter else min(len(resseq), cseqlen)
        if cseq[:comparelen] != resseq[:comparelen]:
            continue

        if idprimer is None:
            # no primer restriction: accept the match once, without the
            # extra primer lookup (which could also add the id a second time)
            sid.append(resid)
            continue

        cur.execute('SELECT idPrimer FROM SequencesTable WHERE id=%s LIMIT 1', [resid])
        primer_row = cur.fetchone()
        if primer_row is not None and primer_row[0] == idprimer:
            sid.append(resid)

    if len(sid) == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid
    return '', sid
コード例 #50
0
def main_func_tax():
    """
    Assign taxonomy to database sequences using the RDP classifier.

    Repeats the following until a file named "stop_tax" exists, no sequence
    without taxonomy is left, or a failure is recorded:
      1. fetch the id of a sequence without taxonomy
         (dbsequences.GetSequenceWithNoTaxonomyID) and its sequence string;
      2. write it to the classifier input file and run the RDP classifier
         jar through os.system;
      3. split the last line of the classifier output on tabs; every field
         that equals a known rank name is expected to be preceded by the
         taxon name and followed by its confidence;
      4. ranks with confidence >= 0.9 are stored with
         dbsequences.AddSequenceTax in the column "tax" + rank;
      5. write the summary and the detailed log to files.

    Returns
    -------
    None
    """
    SetDebugLevel(0)
    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")

    # connect to the db
    con, cur = db_access.connect_db()

    debug(2, 'Started')
    if 'OPENU_FLAG' in os.environ:
        debug(2, 'Openu')
    else:
        debug(2, 'normal')

    rank_list = ("rootrank", "life", "domain", "kingdom", "phylum",
                 "class", "order", "family", "genus", "species")

    count_success = 0
    count_failure = 0
    count_seq_success = 0
    count_seq_failure = 0
    tax_log = ""
    rdp_exe_location = "rdp_classifier_2.12/"
    input_file_name = "%sinput" % rdp_exe_location
    output_file_name = "%soutput" % rdp_exe_location

    def clean(field):
        # strip the quotes and newlines the classifier output may carry
        return field.replace("\"", "").replace("\n", "")

    while isFileExist("stop_tax") == False:
        removeFile(input_file_name)
        removeFile(output_file_name)

        err, seq_id = dbsequences.GetSequenceWithNoTaxonomyID(con, cur)
        if err or seq_id == -1:
            # nothing left to classify
            debug(2, "go to sleep")
            tax_log += "sleep start " + datetime.datetime.now().strftime(
                "%Y-%m-%d--%H:%M:%S") + "\n"
            saveStringToFile(
                "tax_summary_log_sleep_" + date_time_str, "sleep started " +
                datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S"))
            # instead of sleeping, one master file runs all scripts
            return

        tax_log += "sequence id = " + str(seq_id) + "\n"

        err, seq_str = dbsequences.GetSequenceStrByID(con, cur, seq_id)
        if err:
            tax_log += "Fatal Error, could not find sequence " + "\n"
            break

        # get the taxonomy for the specific sequence, equivalent to:
        # java -Xmx1g -jar dist/classifier.jar classify -o output_filename example.fasta
        createSeqFile(input_file_name, seq_str)
        os.system("java -Xmx1g -jar %sdist/classifier.jar classify  -o %s %s" %
                  (rdp_exe_location, output_file_name, input_file_name))
        tex_res = readResultFromFile(output_file_name)

        # only the fields of the last output line are analysed below;
        # start from an empty list so an empty result cannot reuse the
        # fields of a previously processed sequence
        data = []
        tax_log += "the data:\n"
        for line in tex_res:
            tax_log += line + "\n"
            data = line.split('\t')

        # a failure on any rank marks the whole sequence as failed
        has_failure = False
        if not data:
            tax_log += "Fatal Error, classifier returned no data" + "\n"
            has_failure = True

        size_of_list = len(data)
        for list_index, raw_val in enumerate(data):
            curr_val = clean(raw_val)
            if curr_val not in rank_list:
                continue

            tax_log += curr_val + " = "
            # the rank needs both a preceding (name) and a following
            # (confidence) field
            if 0 < list_index < size_of_list - 1:
                prev_val = clean(data[list_index - 1])
                next_val = clean(data[list_index + 1])

                if float(next_val) >= 0.9:
                    # Add to DB
                    if dbsequences.AddSequenceTax(con, cur, seq_id,
                                                  "tax" + curr_val,
                                                  prev_val) == True:
                        tax_log += " SUCCESS" + "\n"
                        count_seq_success += 1
                    else:
                        tax_log += " FAILED" + "\n"
                        count_seq_failure += 1
                        has_failure = True
                else:
                    tax_log += " FAILED (low probablility)" + "\n"
            else:
                tax_log += " FAILED (bad index)" + "\n"

        if has_failure:
            count_failure += 1
        else:
            count_success += 1

        summary_str = "count_success = %s\ncount_failure = %s\ncount_seq_success = %s\ncount_seq_failure = %s\n" % (
            count_success, count_failure, count_seq_success, count_seq_failure)

        saveStringToFile("tax_summary_log_" + date_time_str, summary_str)
        saveStringToFile("tax_log_" + date_time_str, tax_log)
        debug(2, 'found sequence %s' % seq_str)
        debug(2, 'return %s' % tex_res)

        # stop the script in case of error
        if count_failure > 0:
            break
コード例 #51
0
def add_term_info(servertype='develop', overwrite=False, add_pairs=True, add_single=True, add_parents=True, max_annotation_terms=15):
    '''Rebuild TermInfoTable from the terms used in all annotations.

    The table is emptied first, then one row is inserted per term (and,
    optionally, per pair of terms appearing in the same annotation) with:
    TotalExperiments: number of distinct experiments the term appears in
    TotalAnnotations: number of annotations the term appears in
    Terms used with the 'low' detail type are stored with a leading '-'.
    Pair rows are named by joining the two sorted term names with '+'.

    Parameters
    ----------
    servertype : str (optional)
        database to connect to (passed to connect_db)
    overwrite : bool (optional)
        not used by this function; the table is always cleared
    add_pairs: bool (optional)
        Add information about term pairs from each annotation
    add_single: bool, optional
        Add information about each single term in the annotation
    add_parents: bool, optional
        not used by this function
    max_annotation_terms: int, optional
        maximal number of terms in an annotation in order to process the pairs in it
    '''
    con, cur = connect_db(servertype=servertype)

    # start from an empty table
    cur.execute('DELETE FROM TermInfoTable')

    # detail types meaning "lower in"; terms with these get a '-' prefix
    cur.execute('SELECT id FROM AnnotationDetailsTypesTable WHERE description=%s', ['low'])
    lowertypes = {cur.fetchone()[0]}

    experiments_per_key = defaultdict(set)
    annotations_per_key = defaultdict(int)
    seen_term_ids = set()

    cur.execute('SELECT id, idexp from AnnotationsTable')
    annotation_rows = cur.fetchall()
    debug(6, 'Getting term info from %d annotations' % len(annotation_rows))

    for row_num, annotation_row in enumerate(annotation_rows):
        if row_num % 100 == 0:
            debug(4, 'processed %d annotations' % row_num)
        annotation_id = annotation_row[0]
        experiment_id = annotation_row[1]

        cur.execute('SELECT idontology, idannotationdetail FROM AnnotationListTable WHERE idannotation=%s', [annotation_id])
        signed_terms = set()
        for detail_row in cur.fetchall():
            term_id = detail_row[0]
            seen_term_ids.add(term_id)
            # "lower" terms are tracked under the negated id
            signed_id = -term_id if detail_row[1] in lowertypes else term_id
            experiments_per_key[signed_id].add(experiment_id)
            annotations_per_key[signed_id] += 1
            signed_terms.add(signed_id)

        if add_pairs and len(signed_terms) <= max_annotation_terms:
            for raw_pair in tessa(list(signed_terms)):
                pair_key = tuple(sorted(raw_pair))
                experiments_per_key[pair_key].add(experiment_id)
                annotations_per_key[pair_key] += 1

    # get the term names for all the terms we encountered
    term_id_to_name = {}
    for term_id in seen_term_ids:
        cur.execute('SELECT description FROM OntologyTable WHERE id=%s LIMIT 1', [term_id])
        name_row = cur.fetchone()
        term_id_to_name[term_id] = name_row[0]

    def signed_name(signed_id):
        # positive ids map to the plain name, the rest to '-' + name
        if signed_id > 0:
            return term_id_to_name[signed_id]
        return '-' + term_id_to_name[-signed_id]

    debug(6, 'found %d terms' % len(experiments_per_key))
    num_single = 0
    num_pairs = 0
    for key, experiments in experiments_per_key.items():
        if isinstance(key, int):
            if not add_single:
                continue
            row = [signed_name(key), len(experiments), annotations_per_key[key], 'single']
            cur.execute('INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)', row)
            num_single += 1
        elif isinstance(key, tuple):
            if not add_pairs:
                continue
            pair_name = '+'.join(sorted([signed_name(member) for member in key]))
            row = [pair_name, len(experiments), annotations_per_key[key], 'pair']
            cur.execute('INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)', row)
            num_pairs += 1

    debug(6, 'updated %d single, %d pairs' % (num_single, num_pairs))
    debug(6, 'commiting')
    con.commit()
    debug(6, 'done')