Example #1
0
def _annotate(vcf_id):
    if vcf_id == False:
        return  # An error must have occurred earlier.

    engine = sqlalchemy.create_engine(DATABASE_URI)
    # See if we have any guesses for a release version
    with tables(engine, 'vcfs') as (con, vcfs):
        vcf = vcfs.select().where(vcfs.c.id == vcf_id).execute().fetchone()
        release = vcf['vcf_release']

    if not release:
        release = config.ENSEMBL_RELEASE

    # Only runs the first time for this release.
    EnsemblRelease(release).install()

    with tables(engine, 'genotypes') as (con, genotypes):
        metadata = sqlalchemy.MetaData(bind=con)
        metadata.reflect()
        annotations = get_varcode_annotations(genotypes, vcf_id, release)

        tmp_table = Table('gene_annotations',
                          metadata,
                          Column('contig', Text, nullable=False),
                          Column('position', Integer, nullable=False),
                          Column('reference', Text, nullable=False),
                          Column('alternates', Text, nullable=False),
                          Column('gene_name', Text, nullable=True),
                          Column('transcript', Text, nullable=True),
                          Column('notation', Text, nullable=True),
                          Column('effect_type', Text, nullable=True),
                          prefixes=['TEMPORARY'])

        try:
            tmp_table.create()
            write_to_table_via_csv(tmp_table, rows=annotations, connection=con)
            # Add gene names from temp table to genotypes.
            (genotypes.update()
             .where(genotypes.c.contig == tmp_table.c.contig)
             .where(genotypes.c.position == tmp_table.c.position)
             .where(genotypes.c.reference == tmp_table.c.reference)
             .where(genotypes.c.alternates == tmp_table.c.alternates)
             .where(genotypes.c.vcf_id == vcf_id)
             .values(
                 {
                    'annotations:gene_name': tmp_table.c.gene_name,
                    'annotations:transcript': tmp_table.c.transcript,
                    'annotations:effect_notation': tmp_table.c.notation,
                    'annotations:effect_type': tmp_table.c.effect_type
                 }
             )).execute()
        finally:
            con.execute("DISCARD TEMP")

        # We've added annotations:varcode_*, so update the columns to display.
        update_extant_columns(metadata, con, vcf_id)
        return vcf_id
Example #2
0
def create_bam_with_name(project_id, name, uri='hdfs://testbam.bam'):
    with tables(db.engine, 'bams') as (con, bams):
        res = bams.insert(
            {'name': name,
             'project_id': project_id,
             'uri': uri}).returning(*bams.c).execute()
        return dict(res.fetchone())
Example #3
0
 def put(self, run_id):
     """Update the run by its ID."""
     with tables(db.engine, 'vcfs') as (con, runs):
         q = runs.update(runs.c.id == run_id).values(
             **request.validated_body
         ).returning(*runs.c)
         return dict(_abort_if_none(q.execute().fetchone(), run_id))
Example #4
0
 def put(self, bam_id):
     """Update the BAM by its ID."""
     with tables(db.engine, 'bams') as (con, bams):
         q = bams.update(bams.c.id == bam_id).values(
             **request.validated_body
         ).returning(*bams.c)
         return dict(_abort_if_none(q.execute().fetchone(), bam_id))
Example #5
0
 def get(self, run_id):
     """Return a vcf with a given ID."""
     with tables(db.engine, 'vcfs') as (con, runs):
         q = select(runs.c).where(runs.c.id == run_id)
         run = dict(_abort_if_none(q.execute().fetchone(), run_id))
     bams.attach_bams_to_vcfs([run])
     return run
Example #6
0
 def delete(self, bam_id):
     """Delete a BAM by its ID."""
     with tables(db.engine, 'bams') as (con, bams):
         q = bams.delete(bams.c.id == bam_id).returning(*bams.c)
         q = q.execute()
         res = q.fetchone()
         return dict(_abort_if_none(res, bam_id))
Example #7
0
def get_vcf_comments(vcf_id):
    """Return all user comments in the following format:

    {
      "comments": {
        "<row_key STRING>": [<comment_fields>, ...],
        "<row_key STRING>": [...], ...
      }
    }
    """
    def _row_key(comment, table):
        cols = ['contig', 'position', 'reference', 'alternates', 'sample_name']
        return ':'.join([str(comment[col]) for col in cols])

    with tables(db.engine, 'user_comments') as (conn, user_comments):
        cols = user_comments.c
        stmt = select(cols.values()).where(cols.vcf_id == vcf_id)
        results = conn.execute(stmt)
        results_map = defaultdict(list)
        for comment in (dict(c) for c in results):
            row_key = _row_key(comment, user_comments)
            comment['last_modified'] = to_epoch(comment['last_modified'])
            comment['created'] = to_epoch(comment['created'])
            comment = camelcase_dict(comment)
            results_map[row_key].append(comment)
    return {'comments': results_map}
Example #8
0
    def post(self):
        """Create a new run.

        This will import the VCF's genotypes into the database in a worker, as
        well as annotate it with gene names.
        """
        run = request.validated_body
        try:
            expect_one_of(request.validated_body, 'project_name', 'project_id')
        except voluptuous.MultipleInvalid as e:
            errors = [str(err) for err in e.errors]
            abort(409, message='Validation error', errors=errors)
        try:
            projects.set_and_verify_project_id_on(run)
        except voluptuous.Invalid as e:
            abort(404, message='Project not found.', errors=[str(e)])
        try:
            _set_or_verify_bam_id_on(run, bam_type='normal')
            _set_or_verify_bam_id_on(run, bam_type='tumor')
        except voluptuous.Invalid as e:
            abort(404, message='BAM not found.', errors=[str(e)])

        with tables(db.engine, 'vcfs') as (con, runs):
            run = runs.insert(
                request.validated_body).returning(*runs.c).execute().fetchone()
        workers.runner.start_workers_for_vcf_id(run['id'])
        return dict(run), 201
Example #9
0
def register():
    """Register user from current request.

    Returns the user if successful, else raise."""
    errors = None
    user = None
    try:
        data = RegisterUser(prepare_request_data(request))
        if data.get('password1') != data.get('password2'):
            errors = 'Passwords must match.'
    except voluptuous.MultipleInvalid as err:
        errors = str(err)
    if not errors:
        with tables(db.engine, 'users') as (con, users):
            try:
                password_hash = bcrypt.generate_password_hash(
                    request.form['password1'])
                user = users.insert({
                    'username': request.form['username'],
                    'password': password_hash,
                    'email': request.form['email']
                }).returning(*users.c).execute().fetchone()
            except exc.IntegrityError as e:
                errors = "Can\'t create user: " + str(e)
        if user:
            user = wrap_user(user)
            login_user(user)
            return redirect('/')
    return render_template('register.html', errors=errors)
Example #10
0
def genotypes_for_records(vcf_id, query):
    """Return all genotypes which would appear on a row in a VCF (determined by
    CHROM/POS/REF/ALT) if just one genotype on that row passes the selections in
    `query'.

    This is used to generate the list of genotypes to be transformed into
    vcf.model._Records and then written to a VCF file.
    """
    query = _annotate_query_with_types(query, spec(vcf_id))
    with tables(db.engine, 'genotypes') as (con, gt):
        keyfunc = func.concat(
            gt.c.contig, ':', cast(gt.c.position, types.Unicode), '::',
            gt.c.reference, '->', gt.c.alternates)
        filtered_gts_q = select([keyfunc]).where(gt.c.vcf_id == vcf_id)
        filtered_gts_q = _add_filters(filtered_gts_q, gt, query.get('filters'))
        filtered_gts_q = _add_range(filtered_gts_q, gt, query.get('range'))
        filtered_gts_q = filtered_gts_q.cte('filtered_gts')

        records_q = select([gt]).where(
            keyfunc.in_(select([filtered_gts_q]))).where(gt.c.vcf_id == vcf_id)
        records_q = records_q.order_by(asc(func.length(gt.c.contig)),
                                       asc(gt.c.contig),
                                       asc(gt.c.position),
                                       asc(gt.c.reference),
                                       asc(gt.c.alternates),
                                       asc(gt.c.sample_name))
        genotypes = [dict(g) for g in con.execute(records_q).fetchall()]
    return genotypes
Example #11
0
def _extract(vcf_id):
    """Extract the genotypes from a VCF and insert into the DB.

    This also fills in a few fields in the vcfs table which aren't available
    until the entire VCF has been read, e.g. the variant count.

    Returns the vcf_id, or False if an error occurred.
    """
    engine = sqlalchemy.create_engine(DATABASE_URI)
    with tables(engine, 'vcfs') as (con, vcfs_table):
        metadata = sqlalchemy.MetaData(bind=con)
        metadata.reflect()
        vcf = vcfs_table.select().where(vcfs_table.c.id == vcf_id).execute().fetchone()

        # Validate the contents of the VCF before modifying the database.
        reader, header_text, release = load_vcf(vcf['uri'])

        # Fill in VCF header text, which is now available.
        (vcfs_table.update()
         .where(vcfs_table.c.id == vcf_id)
         .values(vcf_header=header_text, vcf_release=release)
         .execute())

        insert_genotypes_with_copy(reader, engine,
                                   default_values={'vcf_id': vcf_id},
                                   temporary_dir=TEMPORARY_DIR)

        update_extant_columns(metadata, con, vcf_id)
        update_vcf_count(metadata, con, vcf_id)
    return vcf_id
Example #12
0
 def put(self, project_id):
     """Update a project by its ID."""
     with tables(db.engine, 'projects') as (con, projects):
         q = projects.update(projects.c.id == project_id).values(
             **request.validated_body
         ).returning(*projects.c)
         return dict(_abort_if_none(q.execute().fetchone(), project_id))
Example #13
0
def get_related_vcfs(run):
    """Return a list of vcfs in the same project as vcf."""
    with tables(db.engine, 'vcfs') as (con, runs):
        q = select(runs.c).where(
            runs.c.project_id == run['project_id']).where(
                runs.c.id != run['id'])
        return [dict(r) for r in q.execute().fetchall()]
Example #14
0
 def post(self, run_id):
     """Create a comment."""
     with tables(db.engine, 'user_comments') as (con, comments):
         q = comments.insert().values(
             vcf_id=run_id,
             **request.validated_body
         ).returning(*comments.c)
         return dict(q.execute().fetchone()), 201
Example #15
0
def get_last_comments(n=5):
    """Return list of the last `n` comments."""
    with tables(db.engine, 'user_comments') as (con, user_comments):
        cols = user_comments.c
        q = select(cols.values()).order_by(
            desc(cols.created)).limit(n)
        comments = [camelcase_dict(dict(c)) for c in con.execute(q).fetchall()]
    return epochify_comments(comments)
Example #16
0
def comments():
    with tables(db.engine, 'user_comments') as (con, comments):
        comments = comments.select().order_by(desc(comments.c.id))
        comments = [camelcase_dict(dict(c))
                    for c in comments.execute().fetchall()]
        comments = cycledash.api.comments.epochify_comments(comments)
        comments = cycledash.api.comments.add_user_to_comments(comments)
    return render_template('comments.html', comments=comments)
Example #17
0
def restart_failed_tasks_for(vcf_id):
    with tables(db.engine, 'task_states') as (con, tasks):
        q = (tasks.delete()
             .where(tasks.c.vcf_id == vcf_id)
             .where(tasks.c.state == 'FAILURE')
             .returning(tasks.c.type))
        names = [r[0] for r in q.execute().fetchall()]
    workers.runner.restart_failed_tasks(names, vcf_id)
Example #18
0
def delete_all_records(db):
    """Deletes all records in all tables in the given `db`."""
    with tables(db.engine) as (connection,):
        metadata = sqlalchemy.MetaData(bind=connection)
        metadata.reflect()
        # We delete the tables in order of dependency, so that foreign-key
        # relationships don't prevent a table from being deleted.
        for tbl in reversed(metadata.sorted_tables):
            tbl.delete().execute()
Example #19
0
 def delete(self, run_id):
     """Delete a run by its ID."""
     with tables(db.engine, 'vcfs') as (con, runs):
         q = (runs.delete()
              .where(runs.c.id == run_id)
              .returning(*runs.c))
         # TODO: unattach BAMs and projects before deleting?
         # TODO: cascade tasks & genotypes deletion?
         return dict(_abort_if_none(q.execute().fetchone(), run_id))
Example #20
0
 def get(self, run_id):
     with tables(db.engine, 'task_states') as (con, tasks):
         q = (select([tasks.c.task_id, tasks.c.type, tasks.c.state])
              .where(tasks.c.vcf_id == run_id))
         return [{'type': _simplify_type(typ),
                  'state': state,
                  # pylint: disable=too-many-function-args
                  'traceback': worker.AsyncResult(task_id).traceback}
                 for task_id, typ, state in con.execute(q).fetchall()]
Example #21
0
def contigs(vcf_id):
    """Return a sorted list of contig names found in the given vcf."""
    with tables(db.engine, 'genotypes') as (con, genotypes):
        q = (select([genotypes.c.contig])
             .where(genotypes.c.vcf_id == vcf_id)
             .group_by(genotypes.c.contig)
             .order_by(func.length(genotypes.c.contig), genotypes.c.contig))
        results = con.execute(q).fetchall()
    return [contig for (contig,) in results]
Example #22
0
 def post(self):
     """Create a new project."""
     with tables(db.engine, 'projects') as (con, projects):
         try:
             result = projects.insert(
                 request.validated_body
             ).returning(*projects.c).execute()
         except exc.IntegrityError as e:
             abort(409, message='Project cannot be created.', errors=[str(e)])
         return dict(result.fetchone()), 201
Example #23
0
def tasks(run_id):
    with tables(db.engine, 'task_states') as (con, tasks):
        tasks = select([tasks.c.task_id, tasks.c.type, tasks.c.state]).where(
            tasks.c.vcf_id == run_id)
        tasks = [{'type': cycledash.api.tasks._simplify_type(typ),
                  'state': state,
                  # pylint: disable=too-many-function-args
                  'traceback': worker.AsyncResult(task_id).traceback}
                 for task_id, typ, state in tasks.execute().fetchall()]
    return render_template('tasks.html', tasks=tasks, run_id=run_id)
Example #24
0
def register_running_task(task, vcf_id):
    """Record the existence of a Celery task in the database."""
    record = {
        'task_id': task.request.id,
        'type': task.name,
        'state': 'STARTED',
        'vcf_id': vcf_id
    }
    with tables(create_engine(DATABASE_URI), 'task_states') as (con, tasks):
        tasks.insert(record).execute()
Example #25
0
def get(run_id, query, with_stats=True):
    """Return a list of genotypes in a vcf conforming to the given query, as
    well as a dict of stats calculated on them.

    If a truth_vcf is associated with this VCF, stats include true/false,
    positive/negative stats, as well as precision, recall, and f1score. Stats
    also include the number of records, and the number of records once filters
    are applied.

    A query is a dictionary which specifies the range, filters, limit, offset
    and ordering which should be applied against genotypes before genotypes and
    stats are returned.

    It has structure:

    {range: {contig: "X", start: 0, end: 250000000},
     filters: [{columnName: 'info:DP', filterValue: '50', type: '<'}, ...],
     sortBy: [{columnName: 'contig', order: 'asc'},
              {columnName: 'position', order: 'asc'}, ...],
     page: 10,
     limit: 250
    }
    """
    query = _annotate_query_with_types(query, spec(run_id))
    compare_to_run_id = query.get('compareToVcfId')
    with tables(db.engine, 'genotypes') as (con, g):
        if compare_to_run_id:
            # We consider a genotype validated if a truth genotype exists at its
            # location (contig/position) with the same ref/alts.  This isn't
            # entirely accurate: for example, it handles SVs very poorly.
            gt = g.alias()
            joined_q = outerjoin(g, gt, and_(
                gt.c.vcf_id == compare_to_run_id,
                g.c.contig == gt.c.contig,
                g.c.position == gt.c.position,
                g.c.reference == gt.c.reference,
                g.c.alternates == gt.c.alternates,
                g.c.sample_name == gt.c.sample_name))
            valid_column = label('tag:true-positive', gt.c.contig != None)
            q = (select(g.c + [valid_column])
                 .select_from(joined_q)
                 .where(g.c.vcf_id == run_id))
        else:
            q = select(g.c).where(g.c.vcf_id == run_id)

        q = _add_range(q, g, query.get('range'))
        q = _add_filters(q, g, query.get('filters'))
        q = _add_orderings(q, g, query.get('sortBy'))
        q = _add_paging(q, g, query.get('limit'), query.get('page'))

        q = _add_ordering(q, g, 'String', 'contig', 'asc')
        q = _add_ordering(q, g, 'Integer', 'position', 'asc')
        genotypes = [dict(g) for g in con.execute(q).fetchall()]
    stats = calculate_stats(run_id, compare_to_run_id, query) if with_stats else {}
    return {'records': genotypes, 'stats': stats}
Example #26
0
def samples(vcf_id):
    """Return a sorted list of sample names found in the given vcf."""
    query = """SELECT sample_name FROM genotypes WHERE vcf_id = %s
    GROUP BY sample_name ORDER BY sample_name
    """
    with tables(db.engine, 'genotypes') as (con, genotypes):
        samples = (select([func.count(distinct(genotypes.c.sample_name))])
                   .where(genotypes.c.vcf_id == vcf_id))
        samples = [sample_name for (sample_name,)
                   in samples.execute().fetchall()]
    return samples
Example #27
0
    def delete(self, run_id, comment_id):
        """Delete the comment with the given ID.

        Must include `last_modified_timestamp`: if it's different than the
        comment's current `last_modified_timestamp`, the deletion is rejected.
        """
        last_modified = request.validated_body['last_modified']
        with tables(db.engine, 'user_comments') as (conn, comments), \
             conn.begin() as trans:
            comment = _get_comment(comments, id=comment_id, vcf_id=run_id)
            _ensure_not_out_of_date(comment, last_modified)
            comments.delete(comments.c.id == comment_id).execute()
        return comment
Example #28
0
def check_login(username, password):
    """Returns the user if it exists and the password is correct, else None."""
    if not (username and password):
        return None
    user = None
    with tables(db.engine, 'users') as (con, users):
        user = users.select(users.c.username == username).execute().fetchone()
    if user:
        pw_hash = str(user['password'])
        candidate = str(password)
        if bcrypt.check_password_hash(pw_hash, candidate):
            return wrap_user(user)
    return None
Example #29
0
def all_non_success_tasks():
    """Returns a map from vcf_id -> [list of unique states of its tasks]."""
    update_tasks_table()
    with tables(db.engine, 'task_states') as (con, tasks):
        q = (select([tasks.c.vcf_id, tasks.c.state])
                .where(tasks.c.state != 'SUCCESS')
                .distinct())

        # maps vcf_id to current states
        ids_to_states = defaultdict(set)
        for vcf_id, state in con.execute(q).fetchall():
            ids_to_states[vcf_id].add(state)
        return {k: list(v) for k, v in ids_to_states.iteritems()}
Example #30
0
def attach_bams_to_vcfs(vcfs):
    """Attach tumor_bam and normal_bam to all VCFs."""
    with tables(db.engine, 'bams') as (con, bams):
        q = select(bams.c)
        bams = [dict(b) for b in con.execute(q).fetchall()]
    for vcf in vcfs:
        normal_bam_id = vcf.get('normal_bam_id')
        tumor_bam_id = vcf.get('tumor_bam_id')

        vcf['tumor_bam'] = (
            dict(find(bams, lambda x: x.get('id') == tumor_bam_id) or {}))
        vcf['normal_bam'] = (
            dict(find(bams, lambda x: x.get('id') == normal_bam_id) or {}))