def _annotate(vcf_id): if vcf_id == False: return # An error must have occurred earlier. engine = sqlalchemy.create_engine(DATABASE_URI) # See if we have any guesses for a release version with tables(engine, 'vcfs') as (con, vcfs): vcf = vcfs.select().where(vcfs.c.id == vcf_id).execute().fetchone() release = vcf['vcf_release'] if not release: release = config.ENSEMBL_RELEASE # Only runs the first time for this release. EnsemblRelease(release).install() with tables(engine, 'genotypes') as (con, genotypes): metadata = sqlalchemy.MetaData(bind=con) metadata.reflect() annotations = get_varcode_annotations(genotypes, vcf_id, release) tmp_table = Table('gene_annotations', metadata, Column('contig', Text, nullable=False), Column('position', Integer, nullable=False), Column('reference', Text, nullable=False), Column('alternates', Text, nullable=False), Column('gene_name', Text, nullable=True), Column('transcript', Text, nullable=True), Column('notation', Text, nullable=True), Column('effect_type', Text, nullable=True), prefixes=['TEMPORARY']) try: tmp_table.create() write_to_table_via_csv(tmp_table, rows=annotations, connection=con) # Add gene names from temp table to genotypes. (genotypes.update() .where(genotypes.c.contig == tmp_table.c.contig) .where(genotypes.c.position == tmp_table.c.position) .where(genotypes.c.reference == tmp_table.c.reference) .where(genotypes.c.alternates == tmp_table.c.alternates) .where(genotypes.c.vcf_id == vcf_id) .values( { 'annotations:gene_name': tmp_table.c.gene_name, 'annotations:transcript': tmp_table.c.transcript, 'annotations:effect_notation': tmp_table.c.notation, 'annotations:effect_type': tmp_table.c.effect_type } )).execute() finally: con.execute("DISCARD TEMP") # We've added annotations:varcode_*, so update the columns to display. update_extant_columns(metadata, con, vcf_id) return vcf_id
def create_bam_with_name(project_id, name, uri='hdfs://testbam.bam'): with tables(db.engine, 'bams') as (con, bams): res = bams.insert( {'name': name, 'project_id': project_id, 'uri': uri}).returning(*bams.c).execute() return dict(res.fetchone())
def put(self, run_id): """Update the run by its ID.""" with tables(db.engine, 'vcfs') as (con, runs): q = runs.update(runs.c.id == run_id).values( **request.validated_body ).returning(*runs.c) return dict(_abort_if_none(q.execute().fetchone(), run_id))
def put(self, bam_id): """Update the BAM by its ID.""" with tables(db.engine, 'bams') as (con, bams): q = bams.update(bams.c.id == bam_id).values( **request.validated_body ).returning(*bams.c) return dict(_abort_if_none(q.execute().fetchone(), bam_id))
def get(self, run_id): """Return a vcf with a given ID.""" with tables(db.engine, 'vcfs') as (con, runs): q = select(runs.c).where(runs.c.id == run_id) run = dict(_abort_if_none(q.execute().fetchone(), run_id)) bams.attach_bams_to_vcfs([run]) return run
def delete(self, bam_id): """Delete a BAM by its ID.""" with tables(db.engine, 'bams') as (con, bams): q = bams.delete(bams.c.id == bam_id).returning(*bams.c) q = q.execute() res = q.fetchone() return dict(_abort_if_none(res, bam_id))
def get_vcf_comments(vcf_id): """Return all user comments in the following format: { "comments": { "<row_key STRING>": [<comment_fields>, ...], "<row_key STRING>": [...], ... } } """ def _row_key(comment, table): cols = ['contig', 'position', 'reference', 'alternates', 'sample_name'] return ':'.join([str(comment[col]) for col in cols]) with tables(db.engine, 'user_comments') as (conn, user_comments): cols = user_comments.c stmt = select(cols.values()).where(cols.vcf_id == vcf_id) results = conn.execute(stmt) results_map = defaultdict(list) for comment in (dict(c) for c in results): row_key = _row_key(comment, user_comments) comment['last_modified'] = to_epoch(comment['last_modified']) comment['created'] = to_epoch(comment['created']) comment = camelcase_dict(comment) results_map[row_key].append(comment) return {'comments': results_map}
def post(self): """Create a new run. This will import the VCF's genotypes into the database in a worker, as well as annotate it with gene names. """ run = request.validated_body try: expect_one_of(request.validated_body, 'project_name', 'project_id') except voluptuous.MultipleInvalid as e: errors = [str(err) for err in e.errors] abort(409, message='Validation error', errors=errors) try: projects.set_and_verify_project_id_on(run) except voluptuous.Invalid as e: abort(404, message='Project not found.', errors=[str(e)]) try: _set_or_verify_bam_id_on(run, bam_type='normal') _set_or_verify_bam_id_on(run, bam_type='tumor') except voluptuous.Invalid as e: abort(404, message='BAM not found.', errors=[str(e)]) with tables(db.engine, 'vcfs') as (con, runs): run = runs.insert( request.validated_body).returning(*runs.c).execute().fetchone() workers.runner.start_workers_for_vcf_id(run['id']) return dict(run), 201
def register(): """Register user from current request. Returns the user if successful, else raise.""" errors = None user = None try: data = RegisterUser(prepare_request_data(request)) if data.get('password1') != data.get('password2'): errors = 'Passwords must match.' except voluptuous.MultipleInvalid as err: errors = str(err) if not errors: with tables(db.engine, 'users') as (con, users): try: password_hash = bcrypt.generate_password_hash( request.form['password1']) user = users.insert({ 'username': request.form['username'], 'password': password_hash, 'email': request.form['email'] }).returning(*users.c).execute().fetchone() except exc.IntegrityError as e: errors = "Can\'t create user: " + str(e) if user: user = wrap_user(user) login_user(user) return redirect('/') return render_template('register.html', errors=errors)
def genotypes_for_records(vcf_id, query): """Return all genotypes which would appear on a row in a VCF (determined by CHROM/POS/REF/ALT) if just one genotype on that row passes the selections in `query'. This is used to generate the list of genotypes to be transformed into vcf.model._Records and then written to a VCF file. """ query = _annotate_query_with_types(query, spec(vcf_id)) with tables(db.engine, 'genotypes') as (con, gt): keyfunc = func.concat( gt.c.contig, ':', cast(gt.c.position, types.Unicode), '::', gt.c.reference, '->', gt.c.alternates) filtered_gts_q = select([keyfunc]).where(gt.c.vcf_id == vcf_id) filtered_gts_q = _add_filters(filtered_gts_q, gt, query.get('filters')) filtered_gts_q = _add_range(filtered_gts_q, gt, query.get('range')) filtered_gts_q = filtered_gts_q.cte('filtered_gts') records_q = select([gt]).where( keyfunc.in_(select([filtered_gts_q]))).where(gt.c.vcf_id == vcf_id) records_q = records_q.order_by(asc(func.length(gt.c.contig)), asc(gt.c.contig), asc(gt.c.position), asc(gt.c.reference), asc(gt.c.alternates), asc(gt.c.sample_name)) genotypes = [dict(g) for g in con.execute(records_q).fetchall()] return genotypes
def _extract(vcf_id): """Extract the genotypes from a VCF and insert into the DB. This also fills in a few fields in the vcfs table which aren't available until the entire VCF has been read, e.g. the variant count. Returns the vcf_id, or False if an error occurred. """ engine = sqlalchemy.create_engine(DATABASE_URI) with tables(engine, 'vcfs') as (con, vcfs_table): metadata = sqlalchemy.MetaData(bind=con) metadata.reflect() vcf = vcfs_table.select().where(vcfs_table.c.id == vcf_id).execute().fetchone() # Validate the contents of the VCF before modifying the database. reader, header_text, release = load_vcf(vcf['uri']) # Fill in VCF header text, which is now available. (vcfs_table.update() .where(vcfs_table.c.id == vcf_id) .values(vcf_header=header_text, vcf_release=release) .execute()) insert_genotypes_with_copy(reader, engine, default_values={'vcf_id': vcf_id}, temporary_dir=TEMPORARY_DIR) update_extant_columns(metadata, con, vcf_id) update_vcf_count(metadata, con, vcf_id) return vcf_id
def put(self, project_id): """Update a project by its ID.""" with tables(db.engine, 'projects') as (con, projects): q = projects.update(projects.c.id == project_id).values( **request.validated_body ).returning(*projects.c) return dict(_abort_if_none(q.execute().fetchone(), project_id))
def get_related_vcfs(run): """Return a list of vcfs in the same project as vcf.""" with tables(db.engine, 'vcfs') as (con, runs): q = select(runs.c).where( runs.c.project_id == run['project_id']).where( runs.c.id != run['id']) return [dict(r) for r in q.execute().fetchall()]
def post(self, run_id): """Create a comment.""" with tables(db.engine, 'user_comments') as (con, comments): q = comments.insert().values( vcf_id=run_id, **request.validated_body ).returning(*comments.c) return dict(q.execute().fetchone()), 201
def get_last_comments(n=5): """Return list of the last `n` comments.""" with tables(db.engine, 'user_comments') as (con, user_comments): cols = user_comments.c q = select(cols.values()).order_by( desc(cols.created)).limit(n) comments = [camelcase_dict(dict(c)) for c in con.execute(q).fetchall()] return epochify_comments(comments)
def comments(): with tables(db.engine, 'user_comments') as (con, comments): comments = comments.select().order_by(desc(comments.c.id)) comments = [camelcase_dict(dict(c)) for c in comments.execute().fetchall()] comments = cycledash.api.comments.epochify_comments(comments) comments = cycledash.api.comments.add_user_to_comments(comments) return render_template('comments.html', comments=comments)
def restart_failed_tasks_for(vcf_id): with tables(db.engine, 'task_states') as (con, tasks): q = (tasks.delete() .where(tasks.c.vcf_id == vcf_id) .where(tasks.c.state == 'FAILURE') .returning(tasks.c.type)) names = [r[0] for r in q.execute().fetchall()] workers.runner.restart_failed_tasks(names, vcf_id)
def delete_all_records(db): """Deletes all records in all tables in the given `db`.""" with tables(db.engine) as (connection,): metadata = sqlalchemy.MetaData(bind=connection) metadata.reflect() # We delete the tables in order of dependency, so that foreign-key # relationships don't prevent a table from being deleted. for tbl in reversed(metadata.sorted_tables): tbl.delete().execute()
def delete(self, run_id): """Delete a run by its ID.""" with tables(db.engine, 'vcfs') as (con, runs): q = (runs.delete() .where(runs.c.id == run_id) .returning(*runs.c)) # TODO: unattach BAMs and projects before deleting? # TODO: cascade tasks & genotypes deletion? return dict(_abort_if_none(q.execute().fetchone(), run_id))
def get(self, run_id): with tables(db.engine, 'task_states') as (con, tasks): q = (select([tasks.c.task_id, tasks.c.type, tasks.c.state]) .where(tasks.c.vcf_id == run_id)) return [{'type': _simplify_type(typ), 'state': state, # pylint: disable=too-many-function-args 'traceback': worker.AsyncResult(task_id).traceback} for task_id, typ, state in con.execute(q).fetchall()]
def contigs(vcf_id): """Return a sorted list of contig names found in the given vcf.""" with tables(db.engine, 'genotypes') as (con, genotypes): q = (select([genotypes.c.contig]) .where(genotypes.c.vcf_id == vcf_id) .group_by(genotypes.c.contig) .order_by(func.length(genotypes.c.contig), genotypes.c.contig)) results = con.execute(q).fetchall() return [contig for (contig,) in results]
def post(self): """Create a new project.""" with tables(db.engine, 'projects') as (con, projects): try: result = projects.insert( request.validated_body ).returning(*projects.c).execute() except exc.IntegrityError as e: abort(409, message='Project cannot be created.', errors=[str(e)]) return dict(result.fetchone()), 201
def tasks(run_id): with tables(db.engine, 'task_states') as (con, tasks): tasks = select([tasks.c.task_id, tasks.c.type, tasks.c.state]).where( tasks.c.vcf_id == run_id) tasks = [{'type': cycledash.api.tasks._simplify_type(typ), 'state': state, # pylint: disable=too-many-function-args 'traceback': worker.AsyncResult(task_id).traceback} for task_id, typ, state in tasks.execute().fetchall()] return render_template('tasks.html', tasks=tasks, run_id=run_id)
def register_running_task(task, vcf_id): """Record the existence of a Celery task in the database.""" record = { 'task_id': task.request.id, 'type': task.name, 'state': 'STARTED', 'vcf_id': vcf_id } with tables(create_engine(DATABASE_URI), 'task_states') as (con, tasks): tasks.insert(record).execute()
def get(run_id, query, with_stats=True): """Return a list of genotypes in a vcf conforming to the given query, as well as a dict of stats calculated on them. If a truth_vcf is associated with this VCF, stats include true/false, positive/negative stats, as well as precision, recall, and f1score. Stats also include the number of records, and the number of records once filters are applied. A query is a dictionary which specifies the range, filters, limit, offset and ordering which should be applied against genotypes before genotypes and stats are returned. It has structure: {range: {contig: "X", start: 0, end: 250000000}, filters: [{columnName: 'info:DP', filterValue: '50', type: '<'}, ...], sortBy: [{columnName: 'contig', order: 'asc'}, {columnName: 'position', order: 'asc'}, ...], page: 10, limit: 250 } """ query = _annotate_query_with_types(query, spec(run_id)) compare_to_run_id = query.get('compareToVcfId') with tables(db.engine, 'genotypes') as (con, g): if compare_to_run_id: # We consider a genotype validated if a truth genotype exists at its # location (contig/position) with the same ref/alts. This isn't # entirely accurate: for example, it handles SVs very poorly. gt = g.alias() joined_q = outerjoin(g, gt, and_( gt.c.vcf_id == compare_to_run_id, g.c.contig == gt.c.contig, g.c.position == gt.c.position, g.c.reference == gt.c.reference, g.c.alternates == gt.c.alternates, g.c.sample_name == gt.c.sample_name)) valid_column = label('tag:true-positive', gt.c.contig != None) q = (select(g.c + [valid_column]) .select_from(joined_q) .where(g.c.vcf_id == run_id)) else: q = select(g.c).where(g.c.vcf_id == run_id) q = _add_range(q, g, query.get('range')) q = _add_filters(q, g, query.get('filters')) q = _add_orderings(q, g, query.get('sortBy')) q = _add_paging(q, g, query.get('limit'), query.get('page')) q = _add_ordering(q, g, 'String', 'contig', 'asc') q = _add_ordering(q, g, 'Integer', 'position', 'asc') genotypes = [dict(g) for g in con.execute(q).fetchall()] stats = calculate_stats(run_id, compare_to_run_id, query) if with_stats else {} return {'records': genotypes, 'stats': stats}
def samples(vcf_id): """Return a sorted list of sample names found in the given vcf.""" query = """SELECT sample_name FROM genotypes WHERE vcf_id = %s GROUP BY sample_name ORDER BY sample_name """ with tables(db.engine, 'genotypes') as (con, genotypes): samples = (select([func.count(distinct(genotypes.c.sample_name))]) .where(genotypes.c.vcf_id == vcf_id)) samples = [sample_name for (sample_name,) in samples.execute().fetchall()] return samples
def delete(self, run_id, comment_id): """Delete the comment with the given ID. Must include `last_modified_timestamp`: if it's different than the comment's current `last_modified_timestamp`, the deletion is rejected. """ last_modified = request.validated_body['last_modified'] with tables(db.engine, 'user_comments') as (conn, comments), \ conn.begin() as trans: comment = _get_comment(comments, id=comment_id, vcf_id=run_id) _ensure_not_out_of_date(comment, last_modified) comments.delete(comments.c.id == comment_id).execute() return comment
def check_login(username, password): """Returns the user if it exists and the password is correct, else None.""" if not (username and password): return None user = None with tables(db.engine, 'users') as (con, users): user = users.select(users.c.username == username).execute().fetchone() if user: pw_hash = str(user['password']) candidate = str(password) if bcrypt.check_password_hash(pw_hash, candidate): return wrap_user(user) return None
def all_non_success_tasks(): """Returns a map from vcf_id -> [list of unique states of its tasks].""" update_tasks_table() with tables(db.engine, 'task_states') as (con, tasks): q = (select([tasks.c.vcf_id, tasks.c.state]) .where(tasks.c.state != 'SUCCESS') .distinct()) # maps vcf_id to current states ids_to_states = defaultdict(set) for vcf_id, state in con.execute(q).fetchall(): ids_to_states[vcf_id].add(state) return {k: list(v) for k, v in ids_to_states.iteritems()}
def attach_bams_to_vcfs(vcfs): """Attach tumor_bam and normal_bam to all VCFs.""" with tables(db.engine, 'bams') as (con, bams): q = select(bams.c) bams = [dict(b) for b in con.execute(q).fetchall()] for vcf in vcfs: normal_bam_id = vcf.get('normal_bam_id') tumor_bam_id = vcf.get('tumor_bam_id') vcf['tumor_bam'] = ( dict(find(bams, lambda x: x.get('id') == tumor_bam_id) or {})) vcf['normal_bam'] = ( dict(find(bams, lambda x: x.get('id') == normal_bam_id) or {}))