Example #1
def write_pooled_clones(session,
                        out_format,
                        sample_ids=None,
                        pool_on=('sample', ),
                        zipped=False,
                        **kwargs):
    # Samples and subjects can't be combined with other features
    exclusives = set(pool_on).intersection(set(('sample', 'subject')))
    if len(pool_on) > 1 and exclusives:
        pool_on = (list(exclusives)[0], )
        logger.warning('You specified pooling on {feat} which '
                       'cannot be combined with other features.'
                       '  Using only {feat}.'.format(feat=pool_on[0]))

    logger.info('Writing clones pooled by {} in {} format'.format(
        ','.join(pool_on), out_format))

    sample_ids = sample_ids or [s.id for s in session.query(Sample)]
    aggregated = get_pooled_samples(session, sample_ids, pool_on)

    output_func = {
        'immunedb': get_immunedb_output,
        'vdjtools': get_vdjtools_output
    }[out_format]
    with ExportWriter(zipped=zipped) as fh:
        for (subject, feature_value), clones in aggregated.items():
            logger.info('Pooling subject {} for feature(s) {}'.format(
                subject, ','.join(feature_value)))
            fh.set_filename(get_filename(subject, pool_on, feature_value))
            fh.write(output_func(session, clones))
        return fh.get_zip_value()
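A hedged usage sketch for the example above: assuming an open SQLAlchemy session and that samples 1 and 2 exist, the call below pools clones by subject and returns the zipped export bytes.

# Hypothetical invocation; `session` must already be connected to an ImmuneDB database.
zip_bytes = write_pooled_clones(session, 'vdjtools',
                                sample_ids=[1, 2],
                                pool_on=('subject',),
                                zipped=True)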
Example #2
def parse_metadata(session, fh, warn_existing, warn_missing, path):
    reader = csv.DictReader(fh, delimiter='\t')
    provided_fields = set(reader.fieldnames)
    for field in provided_fields:
        if not re.match('^[a-z][a-z0-9_]*$', field):
            raise MetadataException(
                'Metadata headers must only contain numbers, letters, and '
                'underscores, and cannot start with a number: {}'.format(
                    field))

    missing_fields = set(REQUIRED_FIELDS) - provided_fields
    if len(missing_fields) > 0:
        raise MetadataException(
            'Metadata is missing the following headers: {}'.format(
                ','.join(missing_fields)))

    metadata = {}
    for row in reader:
        row = {
            k: v
            for k, v in row.items()
            if v is not None and len(v) > 0 and v.lower() not in NA_VALUES
        }
        if len(row) == 0:
            continue
        check_populated(row)
        # Check if the sample name is unique
        if row['sample_name'] in metadata:
            raise MetadataException(
                'Duplicate sample name {} in metadata.'.format(
                    row['sample_name']))

        # Check if a sample with the same name is in the database
        sample_in_db = session.query(Sample).filter(
            Sample.name == row['sample_name'],
            exists().where(Sequence.sample_id == Sample.id)).first()
        if sample_in_db:
            message = 'Sample {} already exists. {}'.format(
                row['sample_name'],
                'Skipping.' if warn_existing else 'Cannot continue.')
            if warn_existing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        # Check if specified file exists
        if not os.path.isfile(os.path.join(path, row['file_name'])):
            message = ('File {} for sample {} does not exist. {}'.format(
                row['file_name'], row['sample_name'],
                'Skipping.' if warn_missing else 'Cannot continue.'))
            if warn_missing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        metadata[row['sample_name']] = row

    return metadata
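A minimal sketch of feeding the parser above from an in-memory handle. The column set shown is an assumption; the actual required headers come from REQUIRED_FIELDS, and path must point at the directory holding the sequence files.

import io

# Hypothetical tab-delimited metadata with one sample per row.
tsv = ('sample_name\tfile_name\tsubject\n'
       'S1\tS1.fastq\tdonor1\n')
metadata = parse_metadata(session, io.StringIO(tsv),
                          warn_existing=True, warn_missing=True,
                          path='/path/to/reads')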
Example #3
def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, numbers, '
                          'dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            db_pass = ''.join(
                random.choice(string.ascii_uppercase + string.ascii_lowercase +
                              string.digits) for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%', db_user,
                                                           db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists.  To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user)
                    )
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))

            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir, '{}.json'.format(
            args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False
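A sketch of driving create() directly, assuming a reachable MySQL server and the attribute names used in the body above; in the real CLI these come from argparse.

import argparse

parser = argparse.ArgumentParser()
args = argparse.Namespace(
    db_name='my_study', db_host='localhost',
    admin_user='root', admin_pass='admin-password',
    db_user=None, db_pass=None,  # let create() derive the user and generate a password
    config_dir='/etc/immunedb')
create(parser, args)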
Example #4
def parse_metadata(session, fh, warn_existing, path):
    reader = csv.DictReader(fh, delimiter='\t')
    provided_fields = set(reader.fieldnames)
    missing_fields = set(REQUIRED_FIELDS) - provided_fields
    if len(missing_fields) > 0:
        raise MetadataException(
            'Metadata is missing the following headers: {}'.format(
                ','.join(missing_fields)))
    unknown_fields = provided_fields - (set(REQUIRED_FIELDS).union(
        set(OPTIONAL_FIELDS)))
    if len(unknown_fields) > 0:
        logger.warning('Ignoring unknown headers in metadata: {}'.format(
            ','.join(unknown_fields)))

    metadata = {}
    for row in reader:
        row = {
            k: v
            for k, v in row.items() if v is not None and len(v) > 0
        }
        if len(row) == 0:
            continue
        check_populated(row)
        # Check if the sample name is unique
        if row['sample_name'] in metadata:
            logger.error('Duplicate sample name {} in metadata.'.format(
                row['sample_name']))

        # Check if a sample with the same name is in the database
        sample_in_db = session.query(Sample).filter(
            Sample.name == row['sample_name'],
            exists().where(Sequence.sample_id == Sample.id)).first()
        if sample_in_db:
            message = 'Sample {} already exists. {}'.format(
                row['sample_name'],
                'Skipping.' if warn_existing else 'Cannot continue.')
            if warn_existing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        # Check if specified file exists
        if not os.path.isfile(os.path.join(path, row['file_name'])):
            raise MetadataException(
                'File {} for sample {} does not exist'.format(
                    row['file_name'], row['sample_name']))

        metadata[row['sample_name']] = row

    return metadata
Example #5
def restore(main_parser, args):
    with open(args.db_config) as fh:
        db_config = json.load(fh)
    with open(args.backup_path, 'r') as fh:
        cmd = shlex.split(
            'mysql -h {} -u {} -p{} {}'.format(db_config['host'],
                                               db_config['username'],
                                               db_config['password'],
                                               db_config['database']))
        proc = subprocess.Popen(cmd, stdin=fh, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if stderr:
            logger.warning(stderr)
    return True
Example #6
def restore(main_parser, args):
    with open(args.db_config) as fh:
        db_config = json.load(fh)
    with open(args.backup_path, 'r') as fh:
        cmd = shlex.split('mysql -h {} -u {} -p{} {}'.format(
            db_config['host'], db_config['username'], db_config['password'],
            db_config['database']))
        proc = subprocess.Popen(cmd,
                                stdin=fh,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if stderr:
            logger.warning(stderr)
    return True
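A hedged invocation of restore(), assuming the JSON config written by create() and a plain SQL dump; the mysql client must be on the PATH, and main_parser is unused by this function.

import argparse

args = argparse.Namespace(db_config='/etc/immunedb/my_study.json',
                          backup_path='/backups/my_study.sql')
restore(None, args)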
Example #7
def write_clone_overlap(session,
                        sample_ids=None,
                        pool_on=('sample', ),
                        size_metric='copies',
                        sim_func='cosine',
                        agg_func='median',
                        zipped=False,
                        **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    sample_instances = {s.id: s for s in samples}

    with ExportWriter(zipped=zipped) as writer:
        for subject in set([s.subject for s in sample_instances.values()]):
            logger.info('Calculating overlap for {}'.format(
                subject.identifier))
            sub_samples = [
                s.id for s in sample_instances.values() if s.subject == subject
            ]
            sdf = get_sample_df(session, sub_samples, pool_on, size_metric,
                                getattr(distance, sim_func))
            if sdf.empty:
                logger.warning(
                    'Subject {} had no clones for calculation'.format(
                        subject.identifier))
                continue

            sdf = collapse_df_features(sdf, pool_on, sample_instances,
                                       getattr(np, agg_func))
            name = '{}.overlap'.format(subject.identifier)

            with writer.get_handle(name + '.tsv') as fh:
                sdf.to_csv(fh, sep='\t')

            title_fmt = 'Subject {}\npooled by={}, similarity metric={}'
            if 'sample' not in pool_on:
                title_fmt += ', aggregation function={}'
            fig, ax = plt.subplots(figsize=(20, 20))
            ax = sns.heatmap(sdf, annot=True, linewidths=.25, vmin=0, vmax=1)
            ax.set_title(
                title_fmt.format(subject.identifier, ' & '.join(pool_on),
                                 sim_func, agg_func))

            with writer.get_handle(name + '.pdf', 'wb+') as fh:
                plt.savefig(fh, bbox_inches='tight', format='pdf')

        return writer.get_zip_value()
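A hedged call to the overlap exporter above. The argument values mirror the defaults in the signature; 'cosine' must name a function in scipy.spatial.distance and 'median' a NumPy aggregation, since the body looks both up with getattr.

zip_bytes = write_clone_overlap(session, sample_ids=[1, 2, 3],
                                pool_on=('subject',),
                                size_metric='copies',
                                sim_func='cosine',
                                agg_func='median',
                                zipped=True)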
Example #8
    def do_task(self, clone_id):
        clone_inst = self.session.query(Clone).filter(
            Clone.id == clone_id).first()
        if not clone_inst:
            return

        self.info('Running clone {}'.format(clone_inst.id))

        sequences = self.session.query(
            Sequence
        ).join(SequenceCollapse).filter(
            Sequence.clone_id == clone_id,
            SequenceCollapse.copy_number_in_subject >= self.min_seq_copies,
            SequenceCollapse.samples_in_subject >= self.min_seq_samples,
        )

        if self.exclude_stops:
            sequences = sequences.filter(Sequence.stop == 0)

        sequences = sequences.order_by(Sequence.v_length)

        try:
            tree = self.get_tree(clone_inst, sequences)
            if not tree:
                logger.warning('No sequences to make tree for clone {}'.format(
                    clone_id))
                return
        except Exception as e:
            logger.error('Error running clone {}: {}'.format(clone_id, e))
            return

        for node_id, node in enumerate(tree.traverse()):
            node.add_feature('node_id', node_id)
        final = {
            'info': {
                'min_mut_copies': self.min_mut_copies,
                'min_mut_samples': self.min_mut_samples,
                'min_seq_copies': self.min_seq_copies,
                'min_seq_samples': self.min_seq_samples,
                'exclude_stops': self.exclude_stops,
                'full_seq': self.full_seq,
            },
            'tree': tree_as_dict(tree)
        }
        clone_inst.tree = json.dumps(final)
        self.session.add(clone_inst)
        self.session.commit()
Example #9
    def do_task(self, clone_id):
        clone_inst = self.session.query(Clone).filter(
            Clone.id == clone_id).first()
        if not clone_inst:
            return

        self.info('Running clone {}'.format(clone_inst.id))

        sequences = self.session.query(Sequence).join(SequenceCollapse).filter(
            Sequence.clone_id == clone_id,
            SequenceCollapse.copy_number_in_subject >= self.min_seq_copies,
            SequenceCollapse.samples_in_subject >= self.min_seq_samples,
        )

        if self.exclude_stops:
            sequences = sequences.filter(Sequence.stop == 0)

        sequences = sequences.order_by(Sequence.v_length)

        try:
            tree = self.get_tree(clone_inst, sequences)
            if not tree:
                logger.warning(
                    'No sequences to make tree for clone {}'.format(clone_id))
                return
        except Exception as e:
            logger.error('Error running clone {}: {}'.format(clone_id, e))
            return

        for node_id, node in enumerate(tree.traverse()):
            node.add_feature('node_id', node_id)
        final = {
            'info': {
                'min_mut_copies': self.min_mut_copies,
                'min_mut_samples': self.min_mut_samples,
                'min_seq_copies': self.min_seq_copies,
                'min_seq_samples': self.min_seq_samples,
                'exclude_stops': self.exclude_stops,
                'full_seq': self.full_seq,
            },
            'tree': tree_as_dict(tree)
        }
        clone_inst.tree = json.dumps(final)
        self.session.add(clone_inst)
        self.session.commit()
Example #10
def collapse_duplicate_alignments(bucket):
    uniques = []
    while bucket:
        alignment = bucket.pop()
        for i, other_alignment in enumerate(bucket):
            if (len(alignment.sequence.sequence) != len(
                    other_alignment.sequence.sequence)):
                logger.warning('Sequence lengths differ {} {}'.format(
                    alignment.sequence.seq_id,
                    other_alignment.sequence.seq_id))
                continue
            if dnautils.equal(alignment.sequence.sequence,
                              other_alignment.sequence.sequence):
                alignment.sequence.copy_number += (
                    other_alignment.sequence.copy_number)
                bucket.pop(i)
        uniques.append(alignment)
    return uniques
Example #11
def write_clone_overlap(session, sample_ids=None, pool_on=('sample',),
                        size_metric='copies', sim_func='cosine',
                        agg_func='median', zipped=False, **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    sample_instances = {s.id: s for s in samples}

    with ExportWriter(zipped=zipped) as writer:
        for subject in set([s.subject for s in sample_instances.values()]):
            logger.info('Calculating overlap for {}'.format(
                subject.identifier))
            sub_samples = [
                s.id for s in sample_instances.values() if s.subject == subject
            ]
            sdf = get_sample_df(session, sub_samples, pool_on, size_metric,
                                getattr(distance, sim_func))
            if sdf.empty:
                logger.warning(
                    'Subject {} had no clones for calculation'.format(
                        subject.identifier))
                continue

            sdf = collapse_df_features(sdf, pool_on, sample_instances,
                                       getattr(np, agg_func))
            name = '{}.overlap'.format(subject.identifier)

            with writer.get_handle(name + '.tsv') as fh:
                sdf.to_csv(fh, sep='\t')

            title_fmt = 'Subject {}\npooled by={}, similarity metric={}'
            if 'sample' not in pool_on:
                title_fmt += ', aggregation function={}'
            fig, ax = plt.subplots(figsize=(20, 20))
            ax = sns.heatmap(sdf, annot=True, linewidths=.25, vmin=0, vmax=1)
            ax.set_title(title_fmt.format(
                subject.identifier, ' & '.join(pool_on), sim_func, agg_func
            ))

            with writer.get_handle(name + '.pdf', 'wb+') as fh:
                plt.savefig(fh, bbox_inches='tight', format='pdf')

        return writer.get_zip_value()
Example #12
def delete_samples(session, args):
    for sample_id in args.sample_ids:
        sample = session.query(Sample).get(sample_id)
        if not sample:
            logger.warning('Sample #{} does not exist'.format(sample_id))
            continue

        logger.info('Deleting sample #{}'.format(sample_id))
        sample.subject.reset()
        session.query(Sequence).filter(Sequence.sample == sample).delete()
        session.delete(sample)
    session.commit()

    for subject in session.query(Subject):
        if not subject.samples:
            logger.info('Deleting orphan subject "{}"'.format(
                subject.identifier))
            session.delete(subject)
    session.commit()
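A minimal sketch of calling delete_samples() outside the CLI; the sample IDs are hypothetical and args only needs a sample_ids attribute.

import argparse

delete_samples(session, argparse.Namespace(sample_ids=[3, 4]))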
Example #13
def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {l['name']: l for l in reader}

    # delete existing metadata
    sample_ids = {
        s.name: s.id
        for s in session.query(Sample).filter(Sample.name.in_(new_meta))
    }

    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(
            sample_ids.values())).delete(synchronize_session='fetch')

    ignore_fields = ['name', 'new_name', 'subject', 'file_name']
    for sample_name, row in new_meta.items():
        if sample_name not in sample_ids:
            logger.warning(
                'No sample {} in database.  Ignoring.'.format(sample_name))
            continue
        sample_id = sample_ids[sample_name]
        logger.info('Updating metadata for {}'.format(row['name']))
        session.add_all([
            SampleMetadata(sample_id=sample_id, key=k, value=v)
            for k, v in row.items()
            if k not in ignore_fields and v not in NA_VALUES
        ])
        if row['new_name'] != row['name']:
            logger.info('  Updating sample name to {}'.format(row['new_name']))
            session.query(Sample).filter(Sample.name == row['name']).update(
                {Sample.name: row['new_name'] + SENTINEL})

    logger.info('Verifying uniqueness')
    for sample in session.query(Sample).filter(Sample.name.like('%' +
                                                                SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed.  All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()
Example #14
def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {l['name']: l for l in reader}

    # delete existing metadata
    sample_ids = {s.name: s.id for s in session.query(Sample).filter(
        Sample.name.in_(new_meta))}

    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(sample_ids.values())
    ).delete(synchronize_session='fetch')

    ignore_fields = ['name', 'new_name', 'subject', 'file_name']
    for sample_name, row in new_meta.items():
        if sample_name not in sample_ids:
            logger.warning('No sample {} in database.  Ignoring.'.format(
                sample_name))
            continue
        sample_id = sample_ids[sample_name]
        logger.info('Updating metadata for {}'.format(row['name']))
        session.add_all([
            SampleMetadata(sample_id=sample_id, key=k, value=v)
            for k, v in row.items() if k not in ignore_fields and v not in
            NA_VALUES
        ])
        if row['new_name'] != row['name']:
            logger.info('  Updating sample name to {}'.format(row['new_name']))
            session.query(Sample).filter(Sample.name == row['name']).update({
                Sample.name: row['new_name'] + SENTINEL
            })

    logger.info('Verifying uniqueness')
    for sample in session.query(Sample).filter(
            Sample.name.like('%' + SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed.  All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()
Example #15
def collapse_duplicates(bucket):
    uniques = []
    while bucket:
        alignment = bucket.pop()
        for i, other_alignment in enumerate(bucket):
            if (len(alignment.sequence.sequence) !=
                    len(other_alignment.sequence.sequence)):
                logger.warning('Sequence lengths differ {} {}'.format(
                    alignment.sequence.seq_id,
                    other_alignment.sequence.seq_id)
                )
                continue
            if dnautils.equal(alignment.sequence.sequence,
                              other_alignment.sequence.sequence):
                alignment.sequence.copy_number += (
                    other_alignment.sequence.copy_number
                )
                bucket.pop(i)
        uniques.append(alignment)
    return uniques
Example #16
def _queue_tasks(session, sample_id, force, tasks):
    logger.info('Creating task queue to generate stats for sample {}.'.format(
        sample_id))
    existing_seq = session.query(Sequence).filter(
        Sequence.sample_id == sample_id)
    existing_nores = session.query(NoResult).filter(
        NoResult.sample_id == sample_id)
    if existing_seq.first() is None and existing_nores.first() is None:
        logger.warning('\tSKIPPING since there are no sequences in the '
                       'sample')
        return

    existing = session.query(SampleStats.sample_id).filter(
        SampleStats.sample_id == sample_id).first() is not None
    if force and existing:
        logger.warning('\tFORCING regeneration of stats')
    elif not force and existing:
        logger.warning('\tSKIPPING stats since they already exist and the '
                       '--force flag was not specified.')
        return

    min_cdr3, max_cdr3 = _get_cdr3_bounds(session, sample_id)
    for include_outliers in [True, False]:
        for only_full_reads in [True, False]:
            tasks.add_task({
                'func': 'seq',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
            tasks.add_task({
                'func': 'clone',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
Example #18
def shutdown():
    if app.config['allow_shutdown']:
        logger.warning('Shutting down from remote request')
        sys.exit()
    return create_response(code=404)
Example #19
def parse_metadata(session, fh, warn_existing, warn_missing, path):
    reader = csv.DictReader(fh, delimiter='\t')
    provided_fields = set(reader.fieldnames)
    for field in provided_fields:
        if not re.match('^[a-z][a-z0-9_]*$', field):
            raise MetadataException(
                'Metadata headers must only contain numbers, letters, and '
                'underscores, and cannot start with a number: {}'.format(field)
            )

    missing_fields = set(REQUIRED_FIELDS) - provided_fields
    if len(missing_fields) > 0:
        raise MetadataException(
            'Metadata is missing the following headers: {}'.format(
                ','.join(missing_fields)))

    metadata = {}
    for row in reader:
        row = {k: v for k, v in row.items()
               if v is not None and len(v) > 0 and v.lower() not in NA_VALUES}
        if len(row) == 0:
            continue
        check_populated(row)
        # Check if the sample name is unique
        if row['sample_name'] in metadata:
            raise MetadataException(
                'Duplicate sample name {} in metadata.'.format(
                    row['sample_name']))

        # Check if a sample with the same name is in the database
        sample_in_db = session.query(Sample).filter(
            Sample.name == row['sample_name'],
            exists().where(
                Sequence.sample_id == Sample.id
            )).first()
        if sample_in_db:
            message = 'Sample {} already exists. {}'.format(
                row['sample_name'],
                'Skipping.' if warn_existing else 'Cannot continue.'
            )
            if warn_existing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        # Check if specified file exists
        if not os.path.isfile(os.path.join(path, row['file_name'])):
            message = (
                'File {} for sample {} does not exist. {}'.format(
                    row['file_name'], row['sample_name'],
                    'Skipping.' if warn_missing else 'Cannot continue.'))
            if warn_missing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        metadata[row['sample_name']] = row

    return metadata
Example #20
def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    indels = session.query(
        Sequence.ai,
        Sequence.seq_id,
        Sequence.sample_id,
        Sequence.sequence
    ).filter(
        Sequence.sample_id == sample.id,
        Sequence.probable_indel_or_misalign == 1
    ).order_by(Sequence.seq_id)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(
        NoResult.sample_id == sample.id).order_by(NoResult.seq_id)

    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(
            sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
                sample.id, indels.count(), noresults.count()))

    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''),
                            len_bucket)
    sample_v_germlines = get_formatted_ties(v_germlines.all_ties(
            sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(j_germlines.all_ties(
        sample.v_ties_len, sample.v_ties_mutations))
    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(get_fasta({'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                r.ai, r.sample_id, r.seq_id): r.sequence for r in indels}))
        fh.write(get_fasta({'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
            r.pk, r.sample_id, r.seq_id): r.sequence for r in noresults}))

    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(align_reference(temp, 'v_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        try:
            ref, seq, rem_seqs = create_seqs(
                ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
                min_size=CDR3_OFFSET, **line)
        except KeyError as e:
            logger.warning('bowtie got invalid V: ' + str(e))
            continue
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene], ref,
                                            seq, line['ref_offset'])
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        seqs = {k: v['v_rem_seq'] for k, v in alignments.items() if
                len(v['v_rem_seq']) > 0}
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(align_reference(temp, 'j_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3, **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        for i in range(j_germlines.upstream_of_cdr3):
            if ref[-i] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end

        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)]
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-'))
        )
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        gaps_in_seq = alignment.sequence.sequence[
            alignment.seq_start:alignments[line['seq_id']]['cdr3_start']
        ].count('-')
        alignment.v_length = (
            alignments[line['seq_id']]['cdr3_start'] -
            alignment.seq_offset
        ) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             alignment.v_length)
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks
Example #21
def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    indels = session.query(
        Sequence.ai, Sequence.seq_id,
        Sequence.sample_id, Sequence.sequence).filter(
            Sequence.sample_id == sample.id,
            Sequence.probable_indel_or_misalign == 1).order_by(Sequence.seq_id)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(
        NoResult.sample_id == sample.id).order_by(NoResult.seq_id)

    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
        sample.id, indels.count(), noresults.count()))

    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''), len_bucket)
    sample_v_germlines = get_formatted_ties(
        v_germlines.all_ties(sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(
        j_germlines.all_ties(sample.v_ties_len, sample.v_ties_mutations))
    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(
            get_fasta({
                'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                    r.ai, r.sample_id, r.seq_id): r.sequence
                for r in indels
            }))
        fh.write(
            get_fasta({
                'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
                    r.pk, r.sample_id, r.seq_id): r.sequence
                for r in noresults
            }))

    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(
            align_reference(temp, 'v_genes_{}'.format(bucket), seq_path,
                            nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        try:
            ref, seq, rem_seqs = create_seqs(
                ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
                min_size=CDR3_OFFSET,
                **line)
        except KeyError as e:
            logger.warning('bowtie got invalid V: ' + str(e))
            continue
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene], ref,
                                            seq, line['ref_offset'])
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        seqs = {
            k: v['v_rem_seq']
            for k, v in alignments.items() if len(v['v_rem_seq']) > 0
        }
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(
            align_reference(temp, 'j_genes_{}'.format(bucket), seq_path,
                            nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3,
            **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        for i in range(j_germlines.upstream_of_cdr3):
            if ref[-i] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end

        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)
        ]
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-')))
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        gaps_in_seq = alignment.sequence.sequence[
            alignment.seq_start:alignments[line['seq_id']]['cdr3_start']
        ].count('-')
        alignment.v_length = (alignments[line['seq_id']]['cdr3_start'] -
                              alignment.seq_offset) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             alignment.v_length)
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks
Example #22
    def cleanup(self):
        for fn in self.files:
            try:
                os.remove(fn)
            except OSError as e:
                logger.warning('Could not remove {}: {}'.format(fn, e))
Example #23
def add_noresults_for_vdj(session, vdj, sample, reason):
    try:
        session.add(get_noresult_from_vdj(session, vdj, sample, reason))
    except ValueError:
        logger.warning('Unable to add noresult')
Example #24
def shutdown():
    if allow_shutdown:
        logger.warning('Shutting down from remote request')
        os.kill(os.getppid(), signal.SIGINT)
    return create_response(code=404)
Example #25
def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    IGNORE_FIELDS = ['id', 'name', 'subject']
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {int(l['id']): l for l in reader}

    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(
            new_meta.keys())).delete(synchronize_session='fetch')

    old_subjects = set()
    for sample_id, row in new_meta.items():
        logger.info('Updating metadata for #{id}: {name}'.format(**row))
        sample = session.query(Sample).get(sample_id)
        # Update subject
        if sample.subject.identifier != row['subject']:
            logger.info('Subject for sample "{}" changed from {} -> {}'.format(
                sample.name, sample.subject.identifier, row['subject']))
            old_subjects.add(sample.subject)
            new_subject = session.query(Subject).filter(
                Subject.study == sample.study,
                Subject.identifier == row['subject']).first()
            if not new_subject:
                new_subject = Subject(study=sample.study,
                                      identifier=row['subject'])
                session.add(new_subject)
                session.flush()
                logger.info('\tNew subject found')
            else:
                old_subjects.add(new_subject)

            sample.subject = new_subject
            assert new_subject.id is not None
            session.query(Sequence).filter(Sequence.sample == sample).update(
                {'subject_id': new_subject.id})

        for subject in session.query(Subject):
            if not subject.samples:
                logger.info('Deleting orphan subject "{}"'.format(
                    subject.identifier))
                session.delete(subject)
            elif subject in old_subjects:
                logger.info('Resetting subject "{}"'.format(
                    subject.identifier))
                subject.reset()

        # Update metadata
        session.add_all([
            SampleMetadata(sample=sample, key=k, value=v)
            for k, v in row.items()
            if k not in IGNORE_FIELDS and v not in NA_VALUES
        ])
        # Update name
        sample.name = row['name'] + SENTINEL

    session.commit()

    for sample in session.query(Sample).filter(Sample.name.like('%' +
                                                                SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed.  All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()