Exemple #1
0
def delete_term(ctx, term):
	'''Delete a term from the dbbact ontology

	The term is deleted only if it is not the parent of any other term and does not
	appear in any annotation. The user is asked to confirm interactively before
	anything is deleted.

	Parameters
	----------
	ctx:
		context object whose obj mapping supplies 'con', 'cur' and 'log_file'
	term: str
		the term to delete (lower-cased before lookup)

	Raises
	------
	ValueError
		if the term is a parent of other terms, appears in annotations,
		or the user does not confirm the deletion
	'''
	con = ctx.obj['con']
	cur = ctx.obj['cur']
	log_file = ctx.obj['log_file']
	term = term.lower()
	debug(3, 'delete-term for term %s' % term)
	# NOTE(review): presumably with create_if_not_exist=False this only looks up an existing term - confirm in _add_dbbact_term
	term_id = _add_dbbact_term(con, cur, term, create_if_not_exist=False, only_dbbact=True)

	# check if it is a parent of someone
	cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s', [term_id])
	if cur.rowcount > 0:
		raise ValueError('The term %s is a parent of %d terms. Cannot delete' % (term, cur.rowcount))

	# check if it appears in annotations
	cur.execute('SELECT idannotation FROM AnnotationListTable WHERE idontology = %s', [term_id])
	if cur.rowcount > 0:
		raise ValueError('The term %s appears in %d annotations. Cannot delete' % (term, cur.rowcount))

	res = input('Delete %s (%s): Are you sure (y/n)?' % (term, term_id))
	if res.lower() not in ('y', 'yes'):
		raise ValueError('Delete aborted')

	# delete all the entries where it is a child
	cur.execute('DELETE FROM ontologytreestructuretable WHERE ontologyid=%s', [term_id])
	# and delete the term itself
	cur.execute('DELETE FROM ontologytable WHERE id=%s', [term_id])
	con.commit()
	_write_log(log_file, 'delete_term for term: %s (id: %s)' % (term, term_id))
Exemple #2
0
def delete_annotation(con, cur, annotationid, userid=0, delete=False, commit=False):
	'''Delete a single annotation (or just log it when delete is False)

	Parameters
	----------
	con, cur
		database connection and cursor
	annotationid: int
		id of the annotation to delete
	userid: int, optional
		user id passed on to DeleteAnnotation
	delete: bool, optional
		True to actually call DeleteAnnotation, False to only log the request
	commit: bool, optional
		True to commit on the connection before returning
	'''
	debug(3, 'delete annotation %d' % annotationid)
	# DeleteAnnotation is always called with commit=False; committing is decided here
	outcome = DeleteAnnotation(con, cur, annotationid=annotationid, userid=userid, commit=False) if delete else None
	if outcome:
		# a truthy result is logged at level 5
		debug(5, outcome)
	if not commit:
		return
	con.commit()
Exemple #3
0
def add_term(ctx, term):
	'''Add a new term to the dbbact ontology, commit and write a log entry

	Parameters
	----------
	ctx:
		context object whose obj mapping supplies 'con', 'cur' and 'log_file'
	term: str
		the term to add (lower-cased before it is added)
	'''
	settings = ctx.obj
	con, cur, log_file = settings['con'], settings['cur'], settings['log_file']
	term = term.lower()
	debug(3, 'add-term for term %s' % term)
	new_id = _add_dbbact_term(con, cur, term)
	con.commit()
	log_line = 'add_term for term: %s (id: %s)' % (term, new_id)
	_write_log(log_file, log_line)
Exemple #4
0
def add_term_to_annotation(ctx, old_term, new_term, experiments, add_if_not_exist):
	'''Add another term to annotations containing a given term

	For every AnnotationListTable entry holding old_term, a new entry with the same
	annotation id and annotation detail is inserted for new_term.

	Parameters
	----------
	ctx:
		context object whose obj mapping supplies 'con', 'cur' and 'log_file'
	old_term: str
		the term identifying the annotations to extend
	new_term: str
		the term to add to these annotations
	experiments: iterable of experiment ids or None
		if not None/empty, only annotations belonging to these experiments are extended
	add_if_not_exist: bool
		passed to _add_dbbact_term as create_if_not_exist for new_term

	Raises
	------
	ValueError
		if old_term does not exist or no annotation contains it
	'''
	con = ctx.obj['con']
	cur = ctx.obj['cur']
	log_file = ctx.obj['log_file']
	old_term = old_term.lower()
	new_term = new_term.lower()

	debug(3, 'add term %s to annotations with term %s' % (new_term, old_term))

	# not sure if multiple provides None or [], so let's make it None
	if experiments is not None:
		if len(experiments) == 0:
			experiments = None
		else:
			experiments = set(experiments)
	old_term_id = _get_term_id(con, cur, old_term, only_dbbact=False)
	if old_term_id is None:
		raise ValueError('Term %s does not exist' % old_term)

	new_term_id = _add_dbbact_term(con, cur, new_term, create_if_not_exist=add_if_not_exist, only_dbbact=False)

	# get all annotations with the old term
	cur.execute('SELECT idannotation,idannotationdetail FROM AnnotationListTable WHERE idontology=%s', [old_term_id])
	if cur.rowcount == 0:
		raise ValueError('No annotations found containing term %s' % old_term)
	debug(3, 'found %d annotations with the term %s' % (cur.rowcount, old_term))

	# fetch everything up front since the cursor is reused inside the loop
	annotations = cur.fetchall()
	num_added = 0
	num_missing = 0
	num_other_exp = 0
	for cannotation in annotations:
		cannotation_id = cannotation['idannotation']
		cannotation_detail = cannotation['idannotationdetail']
		if experiments is not None:
			cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [cannotation_id])
			if cur.rowcount == 0:
				debug(7, 'annotation ID %s not found in AnnotationsTable! skipping' % cannotation_id)
				num_missing += 1
				continue
			res = cur.fetchone()
			if res['idexp'] not in experiments:
				# annotation belongs to an experiment we were not asked to update
				num_other_exp += 1
				continue
		cur.execute('INSERT INTO AnnotationListTable (idannotation, idannotationdetail, idontology) VALUES (%s, %s, %s)', [cannotation_id, cannotation_detail, new_term_id])
		num_added += 1
	debug(3, 'added new term to %d annotations (%d annotations skipped)' % (num_added, num_missing + num_other_exp))
	_write_log(log_file, 'add_term_to_annotation for old_term: %s (id: %s) to new_term: %s (id: %s)' % (old_term, old_term_id, new_term, new_term_id))
	con.commit()
	debug(3, 'done')
Exemple #5
0
def main(argv):
	'''Command line entry point: delete annotations selected by annotation id and/or experiment id

	Annotations are only actually deleted when --delete is supplied (the flag is passed on to
	delete_annotation). When --noseq is supplied, annotations containing at least one sequence
	starting with the given prefix are left untouched.

	Parameters
	----------
	argv: list of str
		the command line arguments to parse

	Raises
	------
	ValueError
		if one of the annotation ids is not found in AnnotationsTable (nothing is committed in this case)
	'''
	parser = argparse.ArgumentParser(description='delete_annotations version %s\ndelete sequences not in any annotation' % __version__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--port', help='postgres port', default=5432, type=int)
	parser.add_argument('--host', help='postgres host', default=None)
	parser.add_argument('--database', help='postgres database', default='dbbact')
	parser.add_argument('--user', help='postgres user', default='dbbact')
	# NOTE(review): a default password is hard-coded (and shown in --help); consider supplying it via the environment instead
	parser.add_argument('--password', help='postgres password', default='magNiv')
	parser.add_argument('--annotationids', help='list of annotation ids to delete (space separated)', nargs='+', type=int)
	parser.add_argument('--expids', help='list of experiment ids to delete (space separated)', nargs='+', type=int)
	parser.add_argument('--delete', help='delete the sequences', action='store_true')
	parser.add_argument('--noseq', help='delete only annotations where all sequences do not start with noseq (i.e. acgt to not delete v4)')
	parser.add_argument('--log-level', help='output level (1 verbose, 10 error)', type=int, default=3)

	args = parser.parse_args(argv)
	SetDebugLevel(args.log_level)
	con, cur = db_access.connect_db(database=args.database, user=args.user, password=args.password, port=args.port, host=args.host)

	annotationids = []
	# fill the annotations from each experiment
	if args.expids:
		for cexpid in args.expids:
			cur.execute("SELECT id from AnnotationsTable WHERE idexp=%s", [cexpid])
			annotationids.extend(cres[0] for cres in cur)
		debug(3, 'found %d annotations for the experiments' % len(annotationids))
	# and add the annotation ids supplied
	if args.annotationids is not None:
		annotationids.extend(args.annotationids)

	# the secondary cursor is only needed for the noseq test; create it once instead of once per annotation
	noseq = args.noseq
	cur2 = None
	if noseq is not None:
		cur2 = con.cursor(cursor_factory=psycopg2.extras.DictCursor)

	for cannotationid in annotationids:
		# test if all sequences of the annotation don't start with sequence noseq
		if noseq is not None:
			badseqs = 0
			cur.execute("SELECT seqid FROM SequencesAnnotationTable WHERE annotationid=%s", [cannotationid])
			for cres in cur:
				cseqid = cres[0]
				cur2.execute("SELECT sequence FROM SequencesTable WHERE id=%s", [cseqid])
				cseq = cur2.fetchone()[0]
				if cseq.startswith(noseq):
					badseqs += 1
			if badseqs > 0:
				debug(5, "Annotation %d contains %d sequences starting with the noseq sequenece %s. not deleting" % (cannotationid, badseqs, noseq))
				continue

		# get the user that created the annotation
		cur.execute("SELECT iduser FROM AnnotationsTable WHERE id=%s LIMIT 1", [cannotationid])
		res = cur.fetchone()
		if res is None:
			# fail with a clear message (before the final commit) instead of a TypeError on res['iduser']
			raise ValueError('Annotation id %s not found in AnnotationsTable. Aborting without commit' % cannotationid)
		cuserid = res['iduser']
		# and delete
		delete_annotation(con, cur, annotationid=cannotationid, userid=cuserid, delete=args.delete)

	debug(3, 'committing')
	con.commit()
	debug(3, 'done. please run delete_unused_seqs.py to remove unused sequences')
Exemple #6
0
def _get_term_id(con, cur, term, fail_if_not_there=True, only_dbbact=True):
	'''Look up the ontologytable id of a term given its term_id or its description

	The term is first searched by term_id; only if that gives no rows it is searched by description.

	Parameters
	----------
	con, cur
		database connection and cursor
	term: str
		the term description or term_id (dbbact:XXXX) to look for
	fail_if_not_there: bool, optional
		True to raise if no matching term exists, False to return None instead
	only_dbbact: bool, optional
		True to accept only rows whose term_id starts with 'dbbact:', False to accept rows from any ontology

	Returns
	-------
	id: int or None
		the id of the single matching row (None if not found and fail_if_not_there is False)

	Raises
	------
	ValueError
		if no match exists (and fail_if_not_there is True) or more than one match exists
	'''
	cur.execute("SELECT * FROM ontologytable WHERE term_id=%s", [term])
	if cur.rowcount == 0:
		cur.execute("SELECT * FROM ontologytable WHERE description=%s", [term])
	rows = cur.fetchall()

	# keep the ids of all acceptable rows
	if only_dbbact:
		matches = [row['id'] for row in rows if row['term_id'].startswith('dbbact:')]
	else:
		matches = [row['id'] for row in rows]

	if not matches:
		if not fail_if_not_there:
			debug(2, 'term %s not found' % term)
			return None
		if only_dbbact:
			raise ValueError('Term %s not found in dbbact ontology. Found in %d non-dbbact' % (term, len(rows)))
		raise ValueError('Term %s not found' % term)

	if len(matches) > 1:
		raise ValueError('Term %s has >1 (%d) dbBact matches' % (term, len(matches)))

	found_id = matches[0]
	debug(2, 'term found with 1 instance in ontologytable. id=%d' % found_id)
	return found_id
def update_old_primer_seqs(con, cur, old_primer, new_primer, commit=True):
    '''Update all sequences with primer old_primer to the new primer new_primer

    Each sequence is handled by update_sequence_primer (called with commit=False).
    Errors it reports are logged rather than silently dropped.

    Parameters
    ----------
    con, cur
        database connection and cursor
    old_primer:
        the idprimer value identifying the sequences to update
    new_primer: int or str
        the primer passed on to update_sequence_primer
    commit: bool, optional
        if True, commit once after all sequences were processed
    '''
    cur.execute("SELECT sequence FROM SequencesTable WHERE idprimer=%s",
                [old_primer])
    debug(3,
          'found %d sequences with old primer %s' % (cur.rowcount, old_primer))
    # fetch all sequences first since the cursor is reused by update_sequence_primer
    seqs = [cres['sequence'] for cres in cur.fetchall()]

    num_failed = 0
    for cseq in seqs:
        err = update_sequence_primer(con,
                                     cur,
                                     sequence=cseq,
                                     primer=new_primer,
                                     commit=False)
        if err:
            # update_sequence_primer reports problems via its return value
            num_failed += 1
            debug(5, 'failed to update primer for sequence %s: %s' % (cseq, err))
    if num_failed > 0:
        debug(5, 'primer update failed for %d of %d sequences' % (num_failed, len(seqs)))
    if commit:
        con.commit()
Exemple #8
0
def add_annotation_seq_count(con, cur):
    '''Store in annotationstable.seqcount the number of sequences of each annotation

    For every id in AnnotationsTable, the matching rows in sequencesannotationtable
    are counted and the count is written to the seqcount column. Commits at the end.

    Parameters
    ----------
    con, cur
        database connection and cursor
    '''
    # a second cursor so the iteration over cur is not disturbed
    worker = con.cursor(cursor_factory=psycopg2.extras.DictCursor)

    debug(3, 'add_annotation_seq_count started')
    debug(2, 'processing annotations')
    cur.execute('SELECT id FROM AnnotationsTable')
    processed = 0
    for processed, row in enumerate(cur, start=1):
        annotation_id = row['id']
        worker.execute(
            'SELECT COUNT(*) FROM sequencesannotationtable WHERE annotationid=%s',
            [annotation_id])
        seq_total = worker.fetchone()[0]
        worker.execute('UPDATE annotationstable SET seqcount=%s WHERE id=%s',
                       [seq_total, annotation_id])
    debug(2, 'scanned %d annotations.' % processed)
    debug(2, 'committing')
    con.commit()
    debug(3, 'done')
def add_dbbact_ids(con, cur):
	'''Fill in the term_id of every OntologyTable row that has an empty term_id

	Each such row gets the term_id 'dbbact:<id>' where <id> is the row id.
	Commits at the end.

	Parameters
	----------
	con, cur
		database connection and cursor
	'''
	debug(3, 'getting terms without ontology term_id')
	cur.execute("SELECT * FROM ontologytable WHERE term_id=''")
	missing = cur.fetchall()
	debug(3, 'found %d terms' % len(missing))
	for row in missing:
		row_id = row['id']
		cur.execute('UPDATE ontologytable SET term_id=%s WHERE id=%s', ['dbbact:%s' % row_id, row_id])
	debug(3, 'committing')
	con.commit()
	debug(3, 'done')
Exemple #10
0
def fix_na(con, cur, commit=False):
	'''Re-link dbbact terms whose parent is an "na" term to the "dbbact root" term

	Every OntologyTreeStructureTable entry of a dbbact term (term_id starting with 'dbbact:')
	whose parent has the description 'na' is updated to point to the "dbbact root" term
	and to the dbbact ontology name id.

	Parameters
	----------
	con, cur: dbbact psycopg2 database connection and cursor
	commit: bool, optional
		True to commit changes, False to just perform dry run

	Raises
	------
	ValueError
		if the dbbact ontology name id is not 8 or the "dbbact root" term_id is not dbbact:1811274
	'''
	# sanity check: the dbbact ontology is expected to have id 8
	cur.execute('SELECT * FROM ontologynamestable WHERE description=%s', ['dbbact'])
	ontologynameid = cur.fetchone()['id']
	if ontologynameid != 8:
		raise ValueError('strange dbbact ontologynameid: %s (instead of 8)' % ontologynameid)

	# sanity check: the "dbbact root" term is expected to be dbbact:1811274
	cur.execute('SELECT * from OntologyTable WHERE description=%s', ['dbbact root'])
	root_row = cur.fetchone()
	if root_row['term_id'] != 'dbbact:1811274':
		raise ValueError('"dbbact root" term_id is %s instead of dbbact:1811274' % root_row['term_id'])
	root_id = root_row['id']

	cur.execute('SELECT * FROM OntologyTable WHERE term_id LIKE %s', ['dbbact:%'])
	debug(3, 'Found %d dbbact terms' % cur.rowcount)
	dbbact_terms = cur.fetchall()
	num_na_parents = 0
	for term_row in dbbact_terms:
		cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s', [term_row['id']])
		for link in cur.fetchall():
			cur.execute('SELECT * FROM OntologyTable WHERE id=%s LIMIT 1', [link['ontologyparentid']])
			if cur.rowcount == 0:
				# parent id has no OntologyTable row - nothing to check
				continue
			parent_row = cur.fetchone()
			if parent_row['description'] != 'na':
				continue
			cur.execute('UPDATE OntologyTreeStructureTable SET ontologyparentid=%s, ontologynameid=%s WHERE uniqueid=%s', [root_id, ontologynameid, link['uniqueid']])
			num_na_parents += 1
	debug(4, 'updating %d dbbact terms roots' % num_na_parents)
	if commit:
		con.commit()
		debug(3, 'commited')
	debug(3, 'done')
Exemple #11
0
def delete_unused_seqs(con, cur, delete=False):
    '''Delete (or just count) the sequences that do not appear in any annotation

    Parameters
    ----------
    con, cur
        database connection and cursor
    delete: bool, optional
        True to delete the unused sequences and commit,
        False to only report how many sequences would be deleted
    '''
    debug(3, 'delete unused seqs started')
    # a sequence is unused when no SequencesAnnotationTable row references its id
    unused_filter = 'WHERE NOT EXISTS(SELECT SequencesAnnotationTable.seqid FROM SequencesAnnotationTable WHERE SequencesAnnotationTable.seqid = SequencesTable.id)'
    if not delete:
        cur.execute('SELECT * FROM SequencesTable ' + unused_filter)
        report = 'NOT DELETING, but found %d sequences to delete' % cur.rowcount
        print(report)
        debug(3, report)
        return
    cur.execute('DELETE FROM SequencesTable ' + unused_filter)
    debug(3, 'deleted')
    con.commit()
def prepare_dbbact_calour_term_files(con, cur, outdir='./', include_synonyms=True):
	'''Prepare the ontology term pickle files needed for dbbact_calour new annotation term autocomplete.

	Two files are written into outdir for each ontology (ontologies with less than 500 names
	are merged into a single ontology called "other"):

	ONTOLOGY.ontology.pickle:
		dict of {name(str): id}
			name:
				the string displayed to the user. For a term it is "DESCRIPTION (TERM_ID)".
				For a synonym it is "SYNONYM (DESCRIPTION - TERM_ID)"
			id:
				the OntologyTable id of the term

	ONTOLOGY.ontology.ids.pickle:
		dict of {id: term(str)}
			id:
				the OntologyTable id of the term (same ids as in ONTOLOGY.ontology.pickle)
			term:
				the dbbact term description

	The ontology of a term is the part of its term_id before the first ":" (or "dbbact" if the
	term_id does not contain ":"). Terms with an empty term_id are displayed with the term_id "dbbact:ID".

	Parameters
	----------
	con, cur
		database connection and cursor
	outdir: str, optional
		name of the output dir where to save the pickle files
	include_synonyms: bool, optional
		True to add also all entries from synonyms table
	'''
	debug(3, 'Counting all terms in dbBact')
	cur2 = con.cursor()
	if include_synonyms:
		# the prepared statement is only needed (and only created) when synonyms are requested
		cur2.execute('PREPARE find_syn(int) AS SELECT synonym FROM OntologySynonymTable WHERE idontology=$1')
	cur.execute('SELECT id, description, term_id FROM OntologyTable')
	num_terms_found = cur.rowcount
	debug(4, 'found %d terms' % num_terms_found)

	term_name_id = defaultdict(dict)
	term_id_term = defaultdict(dict)
	for num_terms, res in enumerate(cur, start=1):
		if num_terms % 100000 == 0:
			debug(3, '%s (scanned %d/%d)' % (res, num_terms, num_terms_found))

		main_term = res['description']
		term_names = [main_term]

		cterm_id = res['term_id']
		ontology_name = 'dbbact'
		if ':' in cterm_id:
			ontology_name = cterm_id.split(':')[0]
		if cterm_id == '':
			cterm_id = 'dbbact:%d' % res['id']

		# also get all the synonyms for the term if needed
		if include_synonyms:
			cur2.execute('EXECUTE find_syn(%s)', [res['id']])
			term_names.extend(cres2[0] for cres2 in cur2)

		for cterm in term_names:
			# if a synonym, put the original term in the parenthesis
			if cterm != main_term:
				term_name_id[ontology_name]['%s (%s - %s)' % (cterm, main_term, cterm_id)] = res['id']
			# not synonym, so no need to add the original term - just the ENVO:XXXXX etc.
			else:
				term_name_id[ontology_name]['%s (%s)' % (cterm, cterm_id)] = res['id']
		term_id_term[ontology_name][res['id']] = main_term

	# prepared statements live for the whole database session, so release it -
	# otherwise a second call on the same connection fails since find_syn already exists
	if include_synonyms:
		cur2.execute('DEALLOCATE find_syn')
	cur2.close()

	# move small ontologies to 'other' ontology
	small_ontologies = []
	all_ontologies = list(term_id_term.keys())
	for contology in all_ontologies:
		if contology == 'other':
			# an ontology already called 'other' must not be merged into itself and then deleted
			continue
		if len(term_name_id[contology]) < 500:
			term_name_id['other'].update(term_name_id[contology])
			term_id_term['other'].update(term_id_term[contology])
			del term_name_id[contology]
			del term_id_term[contology]
			small_ontologies.append(contology)
	print('moved %d small ontologies into "other" ontology:\n%s' % (len(small_ontologies), small_ontologies))

	# and save
	for contology in term_id_term:
		with open(os.path.join(outdir, contology + '.ontology.pickle'), 'wb') as ofl:
			pickle.dump(term_name_id[contology], ofl)
		with open(os.path.join(outdir, contology + '.ontology.ids.pickle'), 'wb') as ofl:
			pickle.dump(term_id_term[contology], ofl)
Exemple #13
0
def rename_term(ctx, old_term, new_term, experiments, add_if_not_exist, ignore_no_annotations, inplace):
	'''Replace a term with another term in all annotations (or only in annotations of given experiments)

	When inplace is True, only the description of the existing term is changed to new_term.
	Otherwise the annotations are re-pointed from old_term to new_term and, when no experiment
	subset was requested, the children of old_term are re-linked to new_term too.

	Parameters
	----------
	ctx:
		context object whose obj mapping supplies 'con', 'cur' and 'log_file'
	old_term: str
		the term to replace
	new_term: str
		the replacement term
	experiments: iterable of experiment ids or None
		if not None/empty, only annotations from these experiments are updated
	add_if_not_exist: bool
		passed to _add_dbbact_term as create_if_not_exist for new_term
	ignore_no_annotations: bool
		True to continue even if no annotation contains old_term
	inplace: bool
		True to rename the term description instead of updating the annotations

	Raises
	------
	ValueError
		if inplace is requested together with experiments, old_term does not exist,
		new_term already exists (inplace), or no annotations contain old_term (unless ignored)
	'''
	con = ctx.obj['con']
	cur = ctx.obj['cur']
	log_file = ctx.obj['log_file']
	old_term, new_term = old_term.lower(), new_term.lower()

	debug(3, 'rename term %s to term %s' % (old_term, new_term))

	# an empty experiments collection is treated the same as None (all experiments)
	if experiments is not None and len(experiments) == 0:
		experiments = None
	partial = experiments is not None

	if partial and inplace:
		raise ValueError('Cannot replcae in place in a subset of experiments.')

	old_term_id = _get_term_id(con, cur, old_term, only_dbbact=False)
	if old_term_id is None:
		raise ValueError('Term %s does not exist' % old_term)

	if inplace:
		# the new name must not collide with an existing description or term_id
		collision_checks = (
			('SELECT * FROM OntologyTable WHERE description=%s', 'new term %s already exists as description'),
			('SELECT * FROM OntologyTable WHERE term_id=%s', 'new term %s already exists as term_id'),
		)
		for query, message in collision_checks:
			cur.execute(query, [new_term])
			if cur.rowcount > 0:
				raise ValueError(message % new_term)
		cur.execute('UPDATE OntologyTable SET description=%s WHERE id=%s', [new_term, old_term_id])
		_write_log(log_file, 'rename_term for old_term: %s (id: %s) to new_term: %s in place' % (old_term, old_term_id, new_term))
		con.commit()
		debug(3, 'done')
		return

	new_term_id = _add_dbbact_term(con, cur, new_term, create_if_not_exist=add_if_not_exist, only_dbbact=False)

	# get all annotations with the old term
	cur.execute('SELECT idannotation FROM AnnotationListTable WHERE idontology=%s', [old_term_id])
	if cur.rowcount == 0 and not ignore_no_annotations:
		raise ValueError('No annotations found containing term %s' % old_term)
	debug(3, 'found %d annotations with the term %s' % (cur.rowcount, old_term))

	if not partial:
		# all experiments: a single update covers every annotation
		cur.execute('UPDATE AnnotationListTable SET idontology=%s WHERE idontology=%s', [new_term_id, old_term_id])
	else:
		wanted = set(experiments)
		matched_exps = set()
		other_exps = set()
		num_match = 0
		num_non_match = 0
		# the rows of the SELECT above are fetched before the cursor is reused in the loop
		for row in cur.fetchall():
			annotation_id = row['idannotation']
			cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [annotation_id])
			if cur.rowcount == 0:
				debug(7, 'experiment ID %s not found! skipping' % annotation_id)
				continue
			exp_id = cur.fetchone()['idexp']
			if exp_id not in wanted:
				num_non_match += 1
				other_exps.add(exp_id)
				continue
			num_match += 1
			matched_exps.add(exp_id)
			cur.execute('UPDATE AnnotationListTable SET idontology=%s WHERE idontology=%s AND idannotation=%s', [new_term_id, old_term_id, annotation_id])
		debug(3, 'found %d annotations (%d experiments) with a matching expid, %d (%d) without' % (num_match, len(matched_exps), num_non_match, len(other_exps)))

	# update the ontology parents table - only if we did not do a partial update
	if not partial:
		cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s', [old_term_id])
		if cur.rowcount > 0:
			debug(3, 'Found %d terms with %s as parent term. Updating' % (cur.rowcount, old_term))
			for child_link in cur.fetchall():
				cur.execute('UPDATE OntologyTreeStructureTable SET ontologyparentid=%s WHERE uniqueid=%s', [new_term_id, child_link['uniqueid']])

	_write_log(log_file, 'rename_term for old_term: %s (id: %s) to new_term: %s (id: %s)' % (old_term, old_term_id, new_term, new_term_id))
	con.commit()
	debug(3, 'done')
def update_sequence_primer(con, cur, sequence, primer, commit=True):
    '''Update the primer region for the sequence.

    If the sequence already appears in dbBact with the requested primer region, that entry is kept.
    Otherwise the first matching entry is updated to the requested region. In both cases the
    annotations of all other matching entries are moved to the kept entry and these other
    entries are deleted from SequencesTable.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    sequence: str
        the exact sequence to update (acgt)
    primer: int or str
        the primer region id (int) or name (str - i.e. 'v4') to update
    commit: bool, optional
        if True, commit after update

    Returns
    -------
    error (str) or ''
        the error reported by GetSequenceId, a message if the sequence is not in SequencesTable,
        or '' when the update finished
    '''
    debug(
        2, 'update_sequence_primer for sequence %s new region %s' %
        (sequence, primer))
    # setup the primer to be the id
    if not isinstance(primer, int):
        primer = GetIdFromName(con, cur, primer)
    # get the sequence id. Note we use idprimer=None since we don't want to look for the new region
    err, seqids = GetSequenceId(con,
                                cur,
                                sequence=sequence,
                                idprimer=None,
                                no_shorter=True,
                                no_longer=True,
                                seq_translate_api=None)
    if err:
        return err
    debug(2, 'found %d total matches to the sequence' % len(seqids))
    if len(seqids) == 0:
        msg = 'trying to update sequence %s failed since it is not in SequencesTable' % sequence
        debug(4, msg)
        return msg
    # do we also have the same sequence with the correct primer?
    err, okid = GetSequenceId(con,
                              cur,
                              sequence=sequence,
                              idprimer=primer,
                              no_shorter=True,
                              no_longer=True,
                              seq_translate_api=None)
    if err:
        # a 'primer mismatch' error is tolerated here; any other error aborts the update
        # NOTE(review): presumably okid is an empty list when 'primer mismatch' is returned - confirm in GetSequenceId
        if err != 'primer mismatch':
            debug(5, err)
            return err
    # no region matches so choose the first, update it, and move all the others to it
    if len(okid) == 0:
        debug(
            1,
            'could not find sequence with good region. chose seqid %d and updating it'
            % seqids[0])
        # from here on okid is a single sequence id (not a list)
        okid = seqids[0]
        cur.execute('UPDATE SequencesTable SET idprimer=%s WHERE id=%s',
                    [primer, okid])
    else:
        debug(
            3,
            'found good sequence id %s. transferring annotations to id' % okid)
        if len(okid) > 1:
            debug(
                4,
                'strange. found %d exact matches including region' % len(okid))
        # keep only the first match; okid becomes a single sequence id
        okid = okid[0]
    # now transfer all annotations from the wrong region sequence to the ok (match) sequence and delete the wrong region sequences
    for cseqid in seqids:
        # the kept sequence itself must not be moved/deleted
        if cseqid == okid:
            continue
        debug(
            4,
            'moving seqid %d to ok sequence %d and deleting' % (cseqid, okid))
        cur.execute(
            'UPDATE SequencesAnnotationTable SET seqid=%s WHERE seqid=%s',
            [okid, cseqid])
        cur.execute('DELETE FROM SequencesTable WHERE id=%s', [cseqid])
    if commit:
        debug(3, 'committing')
        con.commit()
    debug(1, 'update finished')
    return ''
Exemple #15
0
def add_parent(ctx, term, parent, add_if_not_exist, old_parent, only_dbbact):
	'''Link a dbBact ontology term to a dbBact parent term.
	If the parent term does not exist, dbBact creates it

	Parameters
	----------
	ctx:
		context object whose obj mapping supplies 'con', 'cur', 'commit' and 'log_file'
	term: str
		the term to link to the parent
	parent: str
		the parent term
	add_if_not_exist: bool
		passed to _add_dbbact_term as create_if_not_exist for the parent term
	old_parent: str
		what to do if the term already has parents (other than "dbbact root", which is always removed):
		'replace': remove the (single) old parent
		'insert': put the new parent between the term and its (single) old parent
		'ignore': keep the old parents and add the new parent
		'fail': raise an error
	only_dbbact: bool
		passed to _add_dbbact_term as only_dbbact for the parent term

	Raises
	------
	ValueError
		if the dbbact ontology id is not 8, if old parents exist and old_parent is 'fail',
		or if more than one old parent exists and old_parent is 'replace' or 'insert'
	'''
	con = ctx.obj['con']
	cur = ctx.obj['cur']
	commit = ctx.obj['commit']
	log_file = ctx.obj['log_file']
	term = term.lower()
	parent = parent.lower()

	debug(3, 'add parent %s to term %s' % (parent, term))
	term_id = _get_term_id(con, cur, term)
	parent_term_id = _add_dbbact_term(con, cur, parent, create_if_not_exist=add_if_not_exist, only_dbbact=only_dbbact)

	# to be safe, get the dbBact ontology number
	cur.execute('SELECT id FROM ontologynamestable WHERE description=%s', ['dbbact'])
	ontology_database_id = cur.fetchone()[0]
	debug(2, 'dbBact database id=%s' % ontology_database_id)
	if ontology_database_id != 8:
		raise ValueError('dbbact id is not 8! it is %d' % ontology_database_id)

	# check if it had "dbbact root" (id 1811274) as parent - remove it
	cur.execute('DELETE FROM ontologytreestructuretable WHERE ontologynameid=8 AND ontologyparentid=1811274 AND ontologyid=%s', [term_id])

	cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s', [term_id])
	if cur.rowcount > 0:
		debug(3, 'old parents (%d) found for term' % cur.rowcount)
		if old_parent == 'replace':
			if cur.rowcount > 1:
				raise ValueError('More than 1 parent for term (%d). Cannot replace.' % cur.rowcount)
			# remove the old parent
			res = cur.fetchone()
			cur.execute('DELETE FROM ontologytreestructuretable WHERE uniqueid=%s', [res['uniqueid']])
		elif old_parent == 'insert':
			if cur.rowcount > 1:
				raise ValueError('More than 1 parent for term (%d). Cannot insert.' % cur.rowcount)
			# add our parent term in the middle
			res = cur.fetchone()
			cur.execute('SELECT term_id FROM ontologytable WHERE id=%s', [res['ontologyparentid']])
			idres = cur.fetchone()
			# NOTE(review): only_dbbact is not passed here - presumably ctx.invoke fills in the command default; confirm
			ctx.invoke(add_parent, term=parent, parent=idres['term_id'], add_if_not_exist=False, old_parent='fail')
			# and remove the old parent connection
			cur.execute('DELETE FROM ontologytreestructuretable WHERE uniqueid=%s', [res['uniqueid']])
		elif old_parent == 'ignore':
			# debug() takes the message level as its first argument
			debug(3, 'term already has parents (%d). Ignoring and adding new parent' % cur.rowcount)
		elif old_parent == 'fail':
			raise ValueError('Parents (%d) already exists for term. To override use the old-parent option' % cur.rowcount)

	# add to the OntologyTreeStructureTable
	cur.execute('INSERT INTO ontologytreestructuretable (ontologyid, ontologyparentid, ontologynameid) VALUES (%s, %s, %s)', [term_id, parent_term_id, ontology_database_id])
	debug(3, 'Inserted into ontologytreestructuretable')
	if commit:
		_write_log(log_file, 'add_parent for term: %s (id: %s) parent: %s (id: %s)' % (term, term_id, parent, parent_term_id))
		con.commit()
	else:
		debug(5, 'dry run - not commiting')
	debug(3, 'done')
Exemple #16
0
def term_info(ctx, term, partial, no_parent):
	'''Print information about a dbBact term: parents, children, annotations and experiments

	The term is searched by term_id first and, if that gives no rows, by description.

	Parameters
	----------
	ctx:
		context object whose obj mapping supplies 'con', 'cur' and 'log_file'
	term: str
		the term_id or description to look for (lower-cased before lookup)
	partial: bool
		True to match all descriptions starting with term, False for an exact description match
	no_parent: bool
		True to skip terms that have a parent other than the dbbact root term (dbbact:1811274)
	'''
	con = ctx.obj['con']
	cur = ctx.obj['cur']
	log_file = ctx.obj['log_file']
	term = term.lower()

	debug(3, 'term-info for term %s' % term)
	cur.execute('SELECT * FROM ontologytable WHERE term_id=%s', [term])
	if cur.rowcount == 0:
		if not partial:
			cur.execute('SELECT * FROM ontologytable WHERE description=%s', [term])
		else:
			cur.execute('SELECT * FROM ontologytable WHERE description LIKE %s', [term + '%'])

	for term_row in cur.fetchall():
		# collect the parents; a parent other than the dbbact root marks the term as "has a real parent"
		cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s', [term_row['id']])
		parent_links = cur.fetchall() if cur.rowcount > 0 else []
		parent_labels = []
		has_real_parent = False
		for link in parent_links:
			cur.execute('SELECT * FROM OntologyTable WHERE id=%s LIMIT 1', [link['ontologyparentid']])
			parent_info = cur.fetchone()
			parent_labels.append('%s (%s)' % (parent_info['description'], parent_info['term_id']))
			if parent_info['term_id'] != 'dbbact:1811274':
				has_real_parent = True
		if has_real_parent and no_parent:
			continue

		print('\n*******************')
		print('TERM: %s (TERM_ID: %s )' % (term_row['description'], term_row['term_id']))
		print(list(term_row.items()))
		print('===================')
		print('PARENTS:')
		for label in parent_labels:
			print(label)

		print('CHILDREN:')
		cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s', [term_row['id']])
		for child_link in cur.fetchall():
			cur.execute('SELECT * FROM ontologytable WHERE id=%s LIMIT 1', [child_link['ontologyid']])
			print(cur.fetchone()['description'])

		print('ANNOTATIONS:')
		cur.execute('SELECT idannotation FROM AnnotationListTable WHERE idontology = %s', [term_row['id']])
		annotation_ids = [anno_row['idannotation'] for anno_row in cur.fetchall()]
		print('total %d annotations' % len(annotation_ids))

		# gather the names of the experiments these annotations belong to
		exp_names = set()
		for annotation_id in annotation_ids:
			cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [annotation_id])
			exp_row = cur.fetchone()
			cur.execute('SELECT * FROM ExperimentsTable WHERE expid=%s', [exp_row['idexp']])
			for detail in cur.fetchall():
				if detail['type'] == 'name':
					exp_names.add('%s (expid: %s)' % (detail['value'], detail['expid']))
		print('----------------')
		print('Experiments:')
		for exp_name in exp_names:
			print(exp_name)
Exemple #17
0
def update_term_info_old(con, cur):
    '''Rebuild TermInfoTable by scanning the annotations of every term in OntologyTable

    For each term two rows are inserted: one for the term itself and one for '-' + term
    (built from the annotation list entries with idannotationdetail 2), each holding
    the number of distinct experiments and annotations. Commits at the end.

    Parameters
    ----------
    con, cur
        database connection and cursor
    '''
    # separate cursors so the iteration over cur (and over anno_cur) is not disturbed
    anno_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
    exp_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)

    debug(3, 'update_term_info started')
    debug(2, 'dropping old TermInfoTable')
    cur.execute('DELETE FROM TermInfoTable')
    debug(2, 'processing terms')
    cur.execute('SELECT id,description FROM OntologyTable')
    insert_sql = 'INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)'
    for idx, term_row in enumerate(cur):
        pos_exps, neg_exps = set(), set()
        pos_annos, neg_annos = set(), set()
        term_id = term_row['id']
        cterm = term_row['description']
        # get all the annotations containing this term
        anno_cur.execute(
            'SELECT idannotation,idannotationdetail FROM AnnotationListTable WHERE idontology=%s',
            [term_id])
        for entry in anno_cur:
            detail_type = entry['idannotationdetail']
            annotation_id = entry['idannotation']

            # find the experiment the annotation belongs to
            exp_cur.execute(
                'SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1',
                [annotation_id])
            exp_id = exp_cur.fetchone()['idexp']

            # detail type 2 ("LOWER IN cterm") goes to the negative sets, everything else to the positive
            if detail_type == 2:
                exp_bucket, anno_bucket = neg_exps, neg_annos
            else:
                exp_bucket, anno_bucket = pos_exps, pos_annos
            exp_bucket.add(exp_id)
            anno_bucket.add(annotation_id)

        anno_cur.execute(insert_sql,
                         [cterm, len(pos_exps), len(pos_annos), 'single'])
        anno_cur.execute(insert_sql,
                         ['-' + cterm, len(neg_exps), len(neg_annos), 'single'])

        # progress report every 1000 terms, and always for the term 'small village'
        periodic = idx % 1000 == 0
        special = cterm == 'small village'
        if periodic or special:
            summary = 'processed term %d: %s. pos exps %d, pos anno %d, neg exps %d, neg anno %d' % (
                idx, cterm, len(pos_exps), len(pos_annos), len(neg_exps), len(neg_annos))
            if periodic:
                debug(2, summary)
            if special:
                debug(2, summary)

    debug(2, 'committing')
    con.commit()
    debug(3, 'done')
Exemple #18
0
def _accumulate_term_info(cur, term, num_exps, num_annos):
    '''Store the counts for a term in TermInfoTable, merging with an existing row.

    If a row for the same term name is already present (the same term name can
    come from 2 different term ids, e.g. from 2 ontologies), its counts are
    added to the supplied ones and the old row is replaced.

    Parameters
    ----------
    cur: database cursor used for the queries
    term: str
        the term name as stored in TermInfoTable ('-' prefixed for LOWER IN)
    num_exps: int
        number of experiments to add for the term
    num_annos: int
        number of annotations to add for the term
    '''
    cur.execute(
        'SELECT TotalExperiments, TotalAnnotations FROM TermInfoTable WHERE term=%s LIMIT 1',
        [term])
    if cur.rowcount > 0:
        res = cur.fetchone()
        debug(2, 'already found %s' % term)
        num_exps += res[0]
        num_annos += res[1]
        cur.execute('DELETE FROM TermInfoTable WHERE term=%s', [term])
    cur.execute(
        'INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)',
        [term, num_exps, num_annos, 'single'])


def update_term_info(con, cur):
    '''Rebuild TermInfoTable from the annotations currently in the database.

    All rows of TermInfoTable are deleted, then every annotation in
    AnnotationsTable is scanned together with its AnnotationListTable entries.
    For each term the number of distinct experiments and annotations is counted.
    Entries with idannotationdetail == 2 (LOWER IN) are counted separately and
    stored under the term name prefixed with '-'. A row is written only for the
    directions (positive / negative) in which the term actually appears.
    The changes are committed at the end.

    Parameters
    ----------
    con, cur: dbbact psycopg2 database connection and cursor
    '''
    cur2 = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
    debug(3, 'update_term_info started')
    debug(2, 'dropping old TermInfoTable')
    cur.execute('DELETE FROM TermInfoTable')
    debug(2, 'processing annotations')
    term_pos_exps = defaultdict(set)
    term_neg_exps = defaultdict(set)
    term_pos_anno = defaultdict(set)
    term_neg_anno = defaultdict(set)
    all_term_ids = set()
    # iterate all annotations / annotation details
    cur.execute('SELECT id, idexp FROM AnnotationsTable')
    for idx, cres in enumerate(cur):
        cannoid = cres['id']
        cexp = cres['idexp']
        if idx % 1000 == 0:
            debug(2, 'processing annotation %d' % cannoid)
        cur2.execute(
            'SELECT idontology, idannotationdetail FROM AnnotationListTable WHERE idannotation=%s',
            [cannoid])
        for cdres in cur2:
            ctype = cdres['idannotationdetail']
            ctermid = cdres['idontology']
            all_term_ids.add(ctermid)
            # if LOWER IN
            if ctype == 2:
                term_neg_exps[ctermid].add(cexp)
                term_neg_anno[ctermid].add(cannoid)
            else:
                term_pos_exps[ctermid].add(cexp)
                term_pos_anno[ctermid].add(cannoid)

    debug(3, 'Found %d terms' % len(all_term_ids))
    debug(2, 'adding stats to TermInfoTable')
    for ctermid in all_term_ids:
        cur2.execute(
            'SELECT description FROM OntologyTable WHERE id=%s LIMIT 1',
            [ctermid])
        if cur2.rowcount == 0:
            debug(
                5, 'no term name in OntologyTable for termid %d. skipping' %
                ctermid)
            continue
        res = cur2.fetchone()
        cterm = res[0]
        # Membership must be tested BEFORE indexing the defaultdicts: indexing a
        # missing key inserts it, which would make these tests always true and
        # write zero-count rows for terms never seen in that direction.
        has_pos = ctermid in term_pos_exps
        has_neg = ctermid in term_neg_exps
        if has_pos:
            _accumulate_term_info(cur2, cterm,
                                  len(term_pos_exps[ctermid]),
                                  len(term_pos_anno[ctermid]))
        if has_neg:
            _accumulate_term_info(cur2, '-' + cterm,
                                  len(term_neg_exps[ctermid]),
                                  len(term_neg_anno[ctermid]))

    debug(2, 'committing')
    con.commit()
    debug(3, 'done')
Exemple #19
0
def update_obsolete_terms(con,
                          cur,
                          ontofilename,
                          ontology_name=None,
                          commit=True):
    '''Replace obsolete terms as indicated by "replaced_by" in the new ontology.

    This is done by updating the annotations - the old term is replaced by the new term.
    NOTE: when ontology_name is supplied, the replacement is done only if the obsolete term
    does not appear in the tree structure of any other ontology. The replaced_by column of
    OntologyTable is also updated only in that case.

    Parameters
    ----------
    con, cur: dbbact psycopg2 database connection and cursor
    ontofilename : str
        name of the .obo ontology file to read the obsolete terms from
    ontology_name : str or None, optional
        if not None, update only terms that appear only in this ontology tree (i.e. 'silva')
    commit : bool, optional
        True to commit the changes to the database

    Raises
    ------
    ValueError
        if ontology_name is supplied but is not found in OntologyNamesTable
    '''
    # we need 2 phases since in dbbact we store the name, whereas the replaced_by stores the id
    if ontology_name is not None:
        cur.execute(
            'SELECT id FROM OntologyNamesTable WHERE description=%s LIMIT 1',
            [ontology_name])
        if cur.rowcount == 0:
            raise ValueError(
                'ontology %s not found in OntologyNamesTable. stopping' %
                ontology_name)
        ontology_name_id = cur.fetchone()[0]
    else:
        ontology_name_id = None
    debug(3, 'phase 1: getting obsolete terms')
    # phase1 - map each replacement id to the names of the obsolete terms it replaces
    ids_to_get = defaultdict(list)
    num_obsolete = 0
    num_to_replace = 0
    # the with block closes the ontology file once the parser is exhausted
    with open(ontofilename) as obo_file:
        for citem in oboparse.Parser(obo_file):
            tags = citem.tags
            # just obsolete terms
            if "is_obsolete" not in tags:
                continue
            if tags["is_obsolete"][0].lower() != 'true':
                continue
            num_obsolete += 1
            # and we need the replaced_by field
            if "replaced_by" not in tags:
                continue
            replaced_id = tags['replaced_by'][0].lower()
            if replaced_id == 'false':
                continue
            if "name" not in tags:
                continue
            orig_name = tags['name'][0].lower()
            # strip the first 'obsolete ' marker from the term name
            orig_name = re.sub('obsolete ', '', orig_name, count=1)
            ids_to_get[replaced_id].append(orig_name)
            num_to_replace += 1
    debug(
        3, 'found %d obsolete terms. %d to replace, with %d new term ids' %
        (num_obsolete, num_to_replace, len(ids_to_get)))

    debug(3, 'phase2: replacing original terms in annotations')
    # phase2: go over all terms, and if in list, replace these new values instead of the old ones
    with open(ontofilename) as obo_file:
        for citem in oboparse.Parser(obo_file):
            tags = citem.tags
            cid = tags["id"][0].lower()
            if cid not in ids_to_get:
                continue
            if 'name' not in tags:
                debug(4, 'need to replace with term %s but no name supplied' % cid)
                continue
            cname = tags['name'][0]
            cur.execute(
                'SELECT id FROM OntologyTable WHERE description=%s LIMIT 1',
                [cname])
            if cur.rowcount == 0:
                debug(6, 'new term %s not found in ontology table' % cname)
                continue
            contoid = cur.fetchone()[0]
            for cobsolete_term in ids_to_get[cid]:
                cur.execute(
                    'SELECT id FROM OntologyTable WHERE description=%s LIMIT 1',
                    [cobsolete_term])
                if cur.rowcount == 0:
                    debug(
                        6,
                        'obsolete term %s for new term %s not found in ontology table'
                        % (cobsolete_term, cname))
                    continue
                cobsolete_id = cur.fetchone()[0]
                if ontology_name_id is not None:
                    # make sure the obsolete term does not participate in other ontologies
                    cur.execute(
                        'SELECT ontologynameid FROM OntologyTreeStructureTable WHERE ontologyid=%s AND ontologynameid!=%s',
                        [cobsolete_id, ontology_name_id])
                    if cur.rowcount > 0:
                        debug(
                            6,
                            'obsolete term %s participates in other ontologies. skipping'
                            % cobsolete_term)
                        continue
                    # NOTE(review): replaced_by is recorded only when ontology_name is given - confirm this is intended
                    cur.execute(
                        'UPDATE OntologyTable SET replaced_by=%s WHERE id=%s',
                        [contoid, cobsolete_id])

                debug(
                    3, 'for term %s (%d) replace with term %s (%d)' %
                    (cobsolete_term, cobsolete_id, cname, contoid))
                cur.execute(
                    'SELECT idannotation, idannotationdetail FROM AnnotationListTable WHERE idontology=%s',
                    [cobsolete_id])
                debug(3, 'got %d annotations with this term' % cur.rowcount)
                # fetch everything first since the same cursor is reused inside the loop
                res = cur.fetchall()
                for cres in res:
                    cidannotation = cres['idannotation']
                    cidannotationdetail = cres['idannotationdetail']
                    # add the new term to the annotation unless it is already there
                    cur.execute(
                        'SELECT * FROM AnnotationListTable WHERE idannotation=%s AND idannotationdetail=%s AND idontology=%s',
                        [cidannotation, cidannotationdetail, contoid])
                    if cur.rowcount == 0:
                        cur.execute(
                            'INSERT INTO AnnotationListTable (idannotation, idannotationdetail, idontology) VALUES (%s, %s, %s)',
                            [cidannotation, cidannotationdetail, contoid])
                    else:
                        debug(
                            5, 'entry already exists for annotation %d' %
                            cidannotation)
                    # and remove the obsolete term from the annotation
                    cur.execute(
                        'DELETE FROM AnnotationListTable WHERE idannotation=%s AND idannotationdetail=%s AND idontology=%s',
                        [cidannotation, cidannotationdetail, cobsolete_id])
                debug(
                    3, 'did it for term %s replace with term %s' %
                    (cobsolete_term, cname))
    if commit:
        con.commit()
    debug(3, 'done')
Exemple #20
0
def add_seq_counts(con, cur):
    '''Recalculate total_annotations and total_experiments for the sequences in SequencesTable.

    Every (seqid, annotationid) pair in SequencesAnnotationTable is looked up in
    AnnotationsTable to get its experiment. The number of distinct annotations and
    experiments per sequence is then written to SequencesTable and committed.
    Pairs whose annotation does not exist in AnnotationsTable are logged and ignored.

    Parameters
    ----------
    con, cur: dbbact psycopg2 database connection and cursor
    '''
    lookup_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)

    exps_per_seq = defaultdict(set)
    annos_per_seq = defaultdict(set)

    debug(3, 'add_seq_counts started')
    debug(2, 'processing sequences')
    cur.execute('SELECT seqid,annotationid FROM SequencesAnnotationTable')
    for row in cur:
        seq_id = row['seqid']
        anno_id = row['annotationid']
        lookup_cur.execute(
            'SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1',
            [anno_id])
        anno_row = lookup_cur.fetchone()
        # annotation is missing from AnnotationsTable - report it and move on
        if lookup_cur.rowcount == 0:
            debug(
                5,
                'sequence %s annotationid %s does not exist in annotationstable'
                % (seq_id, anno_id))
            continue
        exps_per_seq[seq_id].add(anno_row[0])
        seen_annos = annos_per_seq[seq_id]
        if anno_id in seen_annos:
            debug(
                5, 'sequence %s already associated with annotation %s' %
                (seq_id, anno_id))
        seen_annos.add(anno_id)

    debug(2, 'found data for %d sequences' % len(exps_per_seq))
    debug(2, 'adding total_annotations, total_experiments to SequencesTable')
    for seq_id, seen_annos in annos_per_seq.items():
        num_annos = len(seen_annos)
        num_exps = len(exps_per_seq[seq_id])
        cur.execute(
            'UPDATE SequencesTable SET total_annotations=%s, total_experiments=%s WHERE id=%s',
            [num_annos, num_exps, seq_id])
    con.commit()
    debug(3, 'done')
def delete_unused_terms(con, cur, commit=True):
	'''Remove from OntologyTable every term that is not referenced anywhere.

	A term is considered unused when it does not appear in any annotation
	(AnnotationListTable) and does not appear in OntologyTreeStructureTable,
	neither as a child nor as a parent. The synonyms of an unused term are
	removed from OntologySynonymTable before the term itself is deleted.

	Parameters
	----------
	con, cur: database connection and cursor
	commit: bool, optional
		True to commit the changes to the database. False to run without committing
	'''
	# a term is in use if any one of these queries returns a row
	usage_queries = (
		# do we use it in an annotation?
		'SELECT * FROM AnnotationListTable WHERE idontology=%s',
		# is it in the ontology tree as child?
		'SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s',
		# or as parent?
		'SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s',
	)
	debug(3, 'deleting unused terms')
	cur.execute('SELECT id, description FROM OntologyTable')
	all_terms = cur.fetchall()
	debug(3, 'found %d terms' % len(all_terms))
	num_deleted = 0
	for term_row in all_terms:
		term_id = term_row['id']
		for query in usage_queries:
			cur.execute(query, [term_id])
			if cur.rowcount > 0:
				# the term is used - keep it
				break
		else:
			# no query matched, so the term is not used. synonyms go first, then the term
			cur.execute('DELETE FROM OntologySynonymTable WHERE idontology=%s', [term_id])
			cur.execute('DELETE FROM OntologyTable WHERE id=%s', [term_id])
			num_deleted += 1
	debug(3, 'found %d unused terms to delete' % num_deleted)
	if not commit:
		debug(4, 'not committing changes. nothing was deleted')
	else:
		con.commit()
		debug(3, 'committed')
	debug(3, 'done')
def add_primer_to_annotations(con, cur, update_all=False, commit=True):
    '''Update the primerID field in the AnnotationsTable according to the sequences in the annotation

    For each annotation, the primer of every sequence in the annotation is looked up in
    SequencesTable. If all the sequences share the same primer, it is stored in the primerID
    field of the annotation. Annotations with no sequences, or with sequences from more than
    one primer region, are skipped.

    Parameters
    ----------
    con, cur: database connection and cursor
    update_all: bool, optional
        if True, update all annotations. If False, update only annotations with 'na' (primerID=0) in the primerId field)
    commit: bool, optional
        True to commit changes to database
    '''
    if update_all:
        cur.execute('SELECT id FROM AnnotationsTable')
    else:
        cur.execute('SELECT id FROM AnnotationsTable WHERE PrimerID=0')
    # fetch everything first since the same cursor is reused inside the loop
    res = cur.fetchall()
    for cres in res:
        cid = cres['id']
        cur.execute(
            'SELECT seqID from SequencesAnnotationTable WHERE annotationID=%s',
            [cid])
        res2 = cur.fetchall()
        cprimerid = None
        for cres2 in res2:
            cseqid = cres2['seqid']
            cur.execute(
                'SELECT idPrimer from SequencesTable WHERE id=%s LIMIT 1',
                [cseqid])
            # NOTE(review): fetchone() returns None when the sequence id is missing from
            # SequencesTable, which would raise TypeError below - confirm this cannot happen
            res3 = cur.fetchone()
            if cprimerid is None:
                cprimerid = res3['idprimer']
            if res3['idprimer'] != cprimerid:
                debug(
                    8,
                    'annotation %d contains sequences from two different regions'
                    % cid)
                # mixed regions - mark the annotation as undetermined
                cprimerid = None
                break
        if cprimerid is None:
            debug(
                7,
                "didn't find primer region for annotation %d. skipping" % cid)
            continue
        debug(2, 'annotation %d primer region %d' % (cid, cprimerid))
        cur.execute('UPDATE AnnotationsTable SET primerID=%s WHERE id=%s',
                    [cprimerid, cid])
    # report the real number of annotations (the last loop index is one less)
    debug(3, 'found %d annotations' % len(res))
    if commit:
        debug(3, 'committing changes to database')
        con.commit()
    debug(3, 'finished')