def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    """Import aligned sequences from a tab-delimited file into ``sample``.

    Each row is turned into a ``VDJSequence`` and aligned with
    ``create_alignment``.  Alignments whose sequences have the same length and
    are considered equal by ``dnautils.equal`` are collapsed into one entry
    whose ``copy_number`` is the sum of the duplicates.  The surviving
    alignments are validated with ``props.validate`` and stored with
    ``add_sequences``; anything that raises ``AlignmentException`` is recorded
    through ``add_noresults_for_vdj`` instead.

    :param session: database session; committed once at the end
    :param fmt: input format; ``'adaptive'`` rows are first converted with
        ``extract_adaptive_sequence``
    :param handle: open file-like object containing the tab-delimited rows
    :param sample: sample that receives the sequences and the
        ``v_ties_len`` / ``v_ties_mutations`` averages
    :param v_germlines: V germlines passed through to the alignment helpers
    :param j_germlines: J germlines passed through to the alignment helpers
    :param props: object whose ``validate(alignment)`` raises
        ``AlignmentException`` for unacceptable alignments
    """
    reader = csv.DictReader(handle, delimiter='\t')
    # Alignments grouped by aligned-sequence length so that only sequences of
    # equal length are compared against each other.
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                # The row could not be converted; record a placeholder
                # sequence so the failure is still tracked.
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            same_length = uniques.setdefault(
                len(alignment.sequence.sequence), [])
            for other in same_length:
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    # Duplicate: fold its copies into the existing entry.
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                same_length.append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            # Report the sequence that actually failed validation, not
            # whichever row happened to be read last from the file.
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if lens:
        # float() keeps the averages fractional under Python 2 as well.
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))

    session.commit()
Esempio n. 2
0
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    """Load tab-delimited alignment rows from ``handle`` into ``sample``.

    Rows are converted to ``VDJSequence`` objects and aligned via
    ``create_alignment``.  Alignments of identical length whose sequences
    ``dnautils.equal`` reports as equal are merged by summing their
    ``copy_number``.  Every remaining alignment is checked with
    ``props.validate`` and saved with ``add_sequences``.  Rows or alignments
    raising ``AlignmentException`` are stored with ``add_noresults_for_vdj``.

    :param session: database session, committed once before returning
    :param fmt: input format; for ``'adaptive'`` each row is first passed
        through ``extract_adaptive_sequence``
    :param handle: file-like object with tab-separated rows and a header
    :param sample: sample receiving the sequences and the averaged
        ``v_ties_len`` / ``v_ties_mutations`` values
    :param v_germlines: V germlines forwarded to the alignment helpers
    :param j_germlines: J germlines forwarded to the alignment helpers
    :param props: validator exposing ``validate(alignment)``
    """
    reader = csv.DictReader(handle, delimiter='\t')
    # Maps aligned-sequence length -> list of distinct alignments.
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                # Conversion failed; log a placeholder for this row.
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            bucket = uniques.setdefault(len(alignment.sequence.sequence), [])
            for other in bucket:
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    # Already seen: accumulate the copy number only.
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                bucket.append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            # Use the failing alignment's own sequence; ``seq`` at this point
            # is just the last row read by the loop above.
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if lens:
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))

    session.commit()
Esempio n. 3
0
def read_input(path):
    """Read every record of a FASTA/FASTQ file into ``VDJSequence`` objects.

    The file is parsed as FASTA when ``path`` ends in ``.fasta`` and as FASTQ
    otherwise.  One ``VDJSequence`` is created per record (no de-duplication
    happens here); records for which construction raises ``ValueError`` are
    skipped silently.

    :param path: path of the sequence file to parse
    :returns: list of ``VDJSequence`` objects in file order
    """
    file_format = 'fasta' if path.endswith('.fasta') else 'fastq'
    records = SeqIO.parse(path, file_format)

    logger.info('Parsing input')
    sequences = []
    for rec in records:
        try:
            identifier = rec.description
            bases = str(rec.seq)
            # FASTA records carry no 'phred_quality' annotation, in which
            # case .get() hands None to ord_to_quality.
            scores = funcs.ord_to_quality(
                rec.letter_annotations.get('phred_quality'))
            parsed = VDJSequence(seq_id=identifier, sequence=bases,
                                 quality=scores)
        except ValueError:
            continue
        sequences.append(parsed)

    logger.info('There are {} sequences'.format(len(sequences)))
    return sequences
Esempio n. 4
0
def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    """Locally re-align a sample's probable-indel and unidentified sequences.

    Sequences of ``sample`` flagged with ``probable_indel_or_misalign == 1``
    and all of its ``NoResult`` rows are written to FASTA files under
    ``temp``, aligned against V-gene and then J-gene references through
    ``align_reference`` (bowtie2 according to the log messages), and turned
    into ``VDJAlignment`` objects.

    :param session: database session used for the queries
    :param sample: sample to process; its ``v_ties_len`` and
        ``v_ties_mutations`` select the germline ties
    :param indexes: collection of bucket names whose indexes were already
        built; a newly built bucket is added to it via ``indexes.add``
    :param temp: directory for the index and FASTA files
    :param v_germlines: V germlines providing ``mut_bucket``,
        ``length_bucket`` and ``all_ties``
    :param j_germlines: J germlines providing ``all_ties`` and
        ``upstream_of_cdr3``
    :param nproc: passed through to ``align_reference``
    :returns: ``None`` when the sample has nothing to re-align, otherwise a
        list of dicts with keys ``r_type``, ``pk``, ``sample_id`` and
        ``alignment``
    """
    # Sequences previously flagged as probable indels / misalignments
    indels = session.query(Sequence.ai, Sequence.seq_id, Sequence.sample_id,
                           Sequence.sequence).filter(
                               Sequence.sample_id == sample.id,
                               Sequence.probable_indel_or_misalign == 1)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(NoResult.sample_id == sample.id)

    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
        sample.id, indels.count(), noresults.count()))

    # Samples falling in the same mutation/length bucket share one index, so
    # the bucket name is used both as the cache key and in the index paths.
    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''), len_bucket)
    sample_v_germlines = get_formatted_ties(
        v_germlines.all_ties(sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(
        j_germlines.all_ties(sample.v_ties_len, sample.v_ties_mutations))
    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    # Write both kinds of sequences to one FASTA file.  The header encodes the
    # record type, its key, the sample id and the seq_id so that they can be
    # recovered from the aligner output further down.
    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(
            get_fasta({
                'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                    r.ai, r.sample_id, r.seq_id): r.sequence
                for r in indels
            }))
        fh.write(
            get_fasta({
                'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
                    r.pk, r.sample_id, r.seq_id): r.sequence
                for r in noresults
            }))

    # Per-sequence V alignment details, keyed by the FASTA header
    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(
            align_reference(temp, 'v_genes_{}'.format(bucket), seq_path,
                            nproc)):
        # presumably converts a 1-based offset to 0-based -- TODO confirm
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
            min_size=CDR3_OFFSET,
            **line)
        # Without a remaining portion there is nothing left to align to a J
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene], ref,
                                            seq, line['ref_offset'])
        # Skip alignments whose gapped reference is shorter than CDR3_OFFSET
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    # Second pass: align what remained after the V portion against the J genes
    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        seqs = {
            k: v['v_rem_seq']
            for k, v in alignments.iteritems() if len(v['v_rem_seq']) > 0
        }
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(
            align_reference(temp, 'j_genes_{}'.format(bucket), seq_path,
                            nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3,
            **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        # Full sequence = aligned V portion + remainder, minus whatever is
        # left over after the J alignment.
        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        # Walk the CDR3 end back by one for every non-gap reference position
        # inspected.
        # NOTE(review): for i == 0, ref[-i] is ref[0] (the first character),
        # not the last one -- confirm whether ref[-(i + 1)] was intended.
        for i in range(j_germlines.upstream_of_cdr3):
            if ref[-i] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end

        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        # Germline = V germline + placeholder CDR3 + tail of the J reference
        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        # Recover the fields encoded in the FASTA header.  maxsplit=3 keeps
        # any '|' characters inside the original seq_id intact.
        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)
        ]
        # Gaps in the germline are recorded as insertions and gaps in the
        # sequence as deletions.
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-')))
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        # Germline and sequence must line up position by position
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        # V length excludes gap characters between the start of the sequence
        # and the start of the CDR3.
        gaps_in_seq = alignment.sequence.sequence[
            alignment.
            seq_start:alignments[line['seq_id']]['cdr3_start']].count('-')
        alignment.v_length = (alignments[line['seq_id']]['cdr3_start'] -
                              alignment.seq_offset) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             float(alignment.v_length))
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks
Esempio n. 5
0
def parse_airr(line, v_germlines, j_germlines):
    """Build an alignment object from one AIRR-formatted record.

    :param line: mapping with the record's fields.  The keys read here are
        ``sequence_id``, ``sequence_alignment``, ``rev_comp``, ``v_call``,
        ``j_call``, ``junction_aa``, ``v_germline_start``,
        ``germline_alignment``, ``j_germline_end``, ``cdr3_start``,
        ``v_sequence_start``, ``cdr3``, ``v_germline_alignment``,
        ``v_alignment_end``, ``j_alignment_start``, ``j_alignment_end``,
        ``v_identity`` and ``j_identity``
    :param v_germlines: V germlines; ``get_ties`` is called with the
        comma-separated ``v_call`` names
    :param j_germlines: J germlines; ``get_ties`` is called with the
        comma-separated ``j_call`` names
    :returns: a ``VDJAlignment`` wrapped in ``funcs.ClassProxy``
    :raises AlignmentException: if ``v_call``, ``j_call`` or ``junction_aa``
        is empty, if the V gene lookup raises ``KeyError``, or if the computed
        CDR3 start differs from ``CDR3_OFFSET`` plus the number of gaps in
        ``v_germline_alignment``
    """
    seq = VDJSequence(
        seq_id=line['sequence_id'].replace('reversed|', ''),
        sequence=line['sequence_alignment'],
        rev_comp=line['rev_comp'] == 'T',
    )
    if not all([line['v_call'], line['j_call'], line['junction_aa']]):
        raise AlignmentException(seq, 'Missing v_gene, j_gene, or junction_aa')

    # Pad the sequence by the number of germline positions that precede the
    # start of the alignment.
    seq.pad(int(line['v_germline_start']) - 1)
    try:
        v_germ_seq = v_germlines.get_ties(line['v_call'].split(','))
    except KeyError:
        raise AlignmentException(
            seq,
            'V-gene {} not in germline database'.format(line['v_call'])
        )

    # Prefix the aligned germline with the ungapped V germline bases that
    # precede the alignment start.
    aligned_germ = ''.join([
        v_germ_seq.replace('-', '')[:int(line['v_germline_start']) - 1],
        line['germline_alignment']
    ])
    # Append the missing portion, if any, of the J to the germline
    # NOTE(review): unlike the V lookup above, a KeyError raised by this
    # get_ties call is not converted to AlignmentException -- confirm that
    # callers handle it.
    j_germ_seq = j_germlines.get_ties(line['j_call'].split(','))
    append_j = len(j_germ_seq) - int(line['j_germline_end'])
    if append_j > 0:
        aligned_germ += j_germ_seq[-append_j:]
        seq.pad_right(append_j)

    # Insert IMGT gaps into both the sequence and the germline
    aligned_seq, gaps_added = add_imgt_gaps(v_germ_seq, seq)
    aligned_germ = add_imgt_gaps(
        v_germ_seq, VDJSequence('', aligned_germ)
    )[0].sequence
    cdr3_start = int(line['cdr3_start']) - int(line['v_sequence_start'])
    # Push the start of the CDR3 based on number of IMGT gaps added.  Then
    # move it back by 3 because IgBLAST's CDR3 excludes the preserved Cysteine
    cdr3_start += gaps_added - 3
    cdr3_start += aligned_seq.sequence[:cdr3_start].count('-')
    cdr3_start += int(line['v_germline_start']) - 1
    # The reported CDR3 is extended by 6 positions in total
    # (presumably one codon on each side -- TODO confirm)
    cdr3_end = cdr3_start + len(line['cdr3']) + 6
    # If there is an insertion in the CDR3 but not junction, increase CDR3
    # length
    junction_insertions = aligned_germ[cdr3_end - 3:cdr3_end].count('-')
    cdr3_end += junction_insertions
    cdr3_seq = aligned_seq.sequence[cdr3_start:cdr3_end]

    germline_cdr3 = aligned_germ[cdr3_start:cdr3_end]
    # Mask the CDR3 region of the germline with '.'; these are turned into
    # '-' when the germline is stored on the alignment below.
    aligned_germ = ''.join([
        aligned_germ[:cdr3_start],
        '.' * (cdr3_end - cdr3_start),
        aligned_germ[cdr3_end:]
    ])
    # Rebinds aligned_seq from the sequence object to a plain string made of
    # the pre-CDR3 slice, the CDR3 slice and the post-CDR3 slice.
    aligned_seq = ''.join([
        aligned_seq.sequence[:cdr3_start],
        cdr3_seq,
        aligned_seq.sequence[cdr3_end:]
    ])

    # Sanity check: the CDR3 must start at the fixed offset, shifted by the
    # number of gap characters in the V germline alignment.
    total_insertions = line['v_germline_alignment'].count('-')
    correct_cdr3_start = CDR3_OFFSET + total_insertions
    if cdr3_start != correct_cdr3_start:
        raise AlignmentException(
            seq, 'CDR3 starts at {} instead of {} ({} insertions)'.format(
                cdr3_start, correct_cdr3_start, total_insertions))

    # NOTE(review): the stored sequence id is the raw line['sequence_id'],
    # without the 'reversed|' stripping applied to ``seq`` above -- confirm
    # this is intended.
    alignment = funcs.ClassProxy(VDJAlignment(
        VDJSequence(line['sequence_id'], aligned_seq.replace('.', '-'))
    ))
    alignment.germline = aligned_germ.replace('.', '-')
    alignment.v_gene = set([GeneName(c) for c in line['v_call'].split(',')])
    alignment.j_gene = set([GeneName(c) for c in line['j_call'].split(',')])
    alignment.cdr3_start = cdr3_start
    alignment.cdr3_num_nts = len(cdr3_seq)
    alignment.locally_aligned = True
    alignment.germline_cdr3 = germline_cdr3
    alignment.seq_offset = int(line['v_germline_start']) - 1
    alignment.v_length = int(line['v_alignment_end'])
    alignment.j_length = (int(line['j_alignment_end']) -
                          int(line['j_alignment_start']))
    # v_identity is a percentage; store the mutated fraction in [0, 1]
    alignment.v_mutation_fraction = (100 - float(line['v_identity'])) / 100
    # Skipping the germline_cdr3 field and instead populating its dependencies
    # via the proxy
    alignment.j_match = float(line['j_identity']) * alignment.j_length / 100
    alignment.post_cdr3_length = len(alignment.sequence.sequence) - cdr3_end
    alignment.insertions = funcs.gap_positions(aligned_germ)
    alignment.deletions = funcs.gap_positions(aligned_seq)

    return alignment
Esempio n. 6
0
def read_file(session, handle, sample, v_germlines, j_germlines, columns,
              remaps):
    """Align the sequences of a tab-delimited file using their listed genes.

    The rows are first collapsed with ``_collapse_seqs``.  For each collapsed
    entry the V and J gene names are extracted from the configured columns,
    J names are optionally remapped, and the sequence is aligned with only the
    genes that exist in the germline sets.  Sequences that align to the same
    string are merged by concatenating their ids; failures are stored with
    ``add_as_noresult``.  Finally the sample's average V length and mutation
    fraction are set and the unique alignments are added via ``add_uniques``.

    :param session: database session; committed every 1000 sequences and once
        at the end
    :param handle: file-like object with tab-separated rows and a header
    :param sample: sample receiving the sequences and averages
    :param v_germlines: V germlines (supports ``in`` for gene names)
    :param j_germlines: J germlines (supports ``in`` for gene names)
    :param columns: column configuration providing ``v_gene``, ``j_gene``,
        ``full_sequence``, ``ties``, ``trim_to`` and ``max_padding``
    :param remaps: optional mapping of J-gene name prefix to replacement
        name, or ``None`` to disable remapping
    """
    seqs = _collapse_seqs(session, sample,
                          csv.DictReader(handle, delimiter='\t'), columns)

    aligned_seqs = {}
    missed = 0
    # Number of sequences seen so far.  Tracked separately from the loop
    # index so that the final summary is correct, including for empty input.
    processed = 0
    for index, seq in enumerate(seqs):
        if index > 0 and index % 1000 == 0:
            logger.info('Finished {}'.format(index))
            session.commit()
        processed = index + 1

        orig_v_genes = set(
            re.findall('IGHV[^ ,]+', seq['record'][columns.v_gene]))
        orig_j_genes = set(
            re.findall('IGHJ[^ ,]+', seq['record'][columns.j_gene]))
        if remaps is not None:
            remapped_j_genes = set()
            for j in orig_j_genes:
                # The first prefix that matches wins; unmatched names are
                # kept unchanged.
                for remap_from, remap_to in remaps.items():
                    if j.startswith(remap_from):
                        remapped_j_genes.add(remap_to)
                        break
                else:
                    remapped_j_genes.add(j)
            orig_j_genes = remapped_j_genes

        # Real lists (not lazy filter objects) so len() below works on both
        # Python 2 and Python 3.
        v_genes = [v for v in orig_v_genes if v in v_germlines]
        j_genes = [j for j in orig_j_genes if j in j_germlines]

        # Built before the gene checks because a failure still needs the
        # object to record the no-result entry.
        vdj = VDJSequence(seq['seq_ids'],
                          seq['record'][columns.full_sequence],
                          v_germlines,
                          j_germlines,
                          force_vs=v_genes,
                          force_js=j_genes)
        try:
            if len(v_genes) == 0:
                raise AlignmentException('No valid V germline for {}'.format(
                    ','.join(sorted(orig_v_genes))))
            if len(j_genes) == 0:
                raise AlignmentException('No valid J germline for {}'.format(
                    ','.join(sorted(orig_j_genes))))
            vdj.analyze()

            if vdj.sequence in aligned_seqs:
                aligned_seqs[vdj.sequence].ids += vdj.ids
            else:
                aligned_seqs[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
            missed += 1
    logger.info('Aligned {} / {} sequences'.format(processed - missed,
                                                   processed))

    logger.info('Collapsing ambiguous character sequences')
    if len(aligned_seqs) > 0:
        avg_mut = sum([v.mutation_fraction for v in aligned_seqs.values()
                       ]) / float(len(aligned_seqs))
        avg_len = sum([v.v_length for v in aligned_seqs.values()]) / float(
            len(aligned_seqs))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        if columns.ties:
            add_uniques(session,
                        sample,
                        aligned_seqs.values(),
                        realign_mut=avg_mut,
                        realign_len=avg_len,
                        trim_to=columns.trim_to,
                        max_padding=columns.max_padding)
        else:
            add_uniques(session, sample, aligned_seqs.values())
    session.commit()
Esempio n. 7
0
    def do_task(self, args):
        """Align every read of one sample file and store the results.

        ``args`` must provide ``meta`` (sample metadata containing
        ``sample_name``) and ``path`` (a FASTA file if it ends in ``.fasta``,
        otherwise FASTQ).  Identical reads are collapsed, each unique read is
        aligned with an ``AnchorAligner``, reads that align to the same
        sequence are merged, and failures are stored with ``add_as_noresult``.
        When at least one read aligned, the sample's average V length and
        mutation fraction are set and the alignments are handed to
        ``add_uniques``.
        """
        meta = args['meta']
        self.info('Starting sample {}'.format(meta['sample_name']))
        _, sample = self._setup_sample(meta)

        file_format = 'fasta' if args['path'].endswith('.fasta') else 'fastq'
        records = SeqIO.parse(args['path'], file_format)

        # Group reads with identical bases, remembering every description
        self.info('\tCollapsing identical sequences')
        unique_reads = {}
        for rec in records:
            try:
                bases = str(rec.seq)
                entry = unique_reads.get(bases)
                if entry is None:
                    scores = funcs.ord_to_quality(
                        rec.letter_annotations.get('phred_quality'))
                    entry = VDJSequence(ids=[], sequence=bases,
                                        quality=scores)
                    unique_reads[bases] = entry
                entry.ids.append(rec.description)
            except ValueError:
                continue

        aligned = {}
        aligner = AnchorAligner(self._v_germlines, self._j_germlines)
        self.info('\tAligning {} unique sequences'.format(len(unique_reads)))
        # Try to align each unique read, releasing it from the input dict as
        # soon as it has been picked up.
        for bases in funcs.periodic_commit(self._session,
                                           sorted(unique_reads)):
            vdj = unique_reads.pop(bases)
            try:
                # On success either merge the ids into an alignment with the
                # same aligned sequence or register a new one.
                alignment = aligner.get_alignment(vdj)
                key = alignment.sequence.sequence
                existing = aligned.get(key)
                if existing is None:
                    aligned[key] = alignment
                else:
                    existing.sequence.ids.extend(alignment.sequence.ids)
            except AlignmentException as e:
                add_as_noresult(self._session, vdj, sample, str(e))
            except Exception:
                self.error(
                    '\tUnexpected error processing sequence {}\n\t{}'.format(
                        vdj.ids[0], traceback.format_exc()))

        if aligned:
            total = float(len(aligned))
            avg_len = sum(a.v_length for a in aligned.values()) / total
            avg_mut = (
                sum(a.v_mutation_fraction for a in aligned.values()) / total)
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len

            self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                      'Length={}'.format(len(aligned), round(avg_mut, 2),
                                         round(avg_len, 2)))
            add_uniques(self._session, sample, aligned.values(),
                        self._props, aligner, avg_len, avg_mut)

        self._session.commit()
        self.info('Completed sample {}'.format(sample.name))
Esempio n. 8
0
    def do_task(self, args):
        """Align every read of one sample file and store the results.

        ``args`` must provide ``meta`` (sample metadata), ``path`` (directory)
        and ``fn`` (file name; parsed as FASTA if it ends in ``.fasta``,
        otherwise as FASTQ).  Identical reads are collapsed, each unique read
        is analyzed, reads whose analyzed sequence is identical are merged,
        and reads raising ``AlignmentException`` are stored with
        ``add_as_noresult``.  Any other exception is logged and the read is
        dropped.  When at least one read aligned, the sample's average V
        length and mutation fraction are set and the aligned reads are handed
        to ``add_uniques``.
        """
        meta = args['meta']
        self.info('Starting sample {}'.format(meta.get('sample_name')))
        study, sample = self._setup_sample(meta)

        vdjs = {}
        parser = SeqIO.parse(
            os.path.join(args['path'], args['fn']),
            'fasta' if args['fn'].endswith('.fasta') else 'fastq')

        # Collapse identical sequences
        self.info('\tCollapsing identical sequences')
        for record in parser:
            seq = str(record.seq)
            if seq not in vdjs:
                vdjs[seq] = VDJSequence(
                    ids=[],
                    seq=seq,
                    v_germlines=self._v_germlines,
                    j_germlines=self._j_germlines,
                    quality=funcs.ord_to_quality(
                        record.letter_annotations.get('phred_quality')))
            vdjs[seq].ids.append(record.description)

        self.info('\tAligning {} unique sequences'.format(len(vdjs)))
        # Aligned reads are kept in their own dict.  Storing them back into
        # ``vdjs`` would let an aligned sequence collide with a raw read that
        # has not been analyzed yet and merge into that unanalyzed entry.
        aligned = {}
        # Attempt to align all unique sequences
        for sequence in funcs.periodic_commit(self._session,
                                              sorted(vdjs.keys())):
            # Release the raw read as soon as it is picked up
            vdj = vdjs.pop(sequence)
            try:
                # The alignment was successful.  If the aligned sequence
                # already exists, append the seq_ids.  Otherwise add it as a
                # new unique sequence.
                vdj.analyze()
                if vdj.sequence in aligned:
                    aligned[vdj.sequence].ids += vdj.ids
                else:
                    aligned[vdj.sequence] = vdj
            except AlignmentException as e:
                add_as_noresult(self._session, vdj, sample, str(e))
            except Exception:
                # Deliberately best-effort: log and continue with the next
                # read.  ``Exception`` (rather than a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                self.error(
                    '\tUnexpected error processing sequence {}\n\t{}'.format(
                        vdj.ids[0], traceback.format_exc()))
        if aligned:
            total = float(len(aligned))
            avg_len = sum(v.v_length for v in aligned.values()) / total
            avg_mut = sum(v.mutation_fraction
                          for v in aligned.values()) / total
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len

            self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                      'Length={}'.format(len(aligned), round(avg_mut, 2),
                                         round(avg_len, 2)))
            add_uniques(self._session, sample, aligned.values(), avg_len,
                        avg_mut, self._min_similarity, self._max_vties,
                        self._trim_to, self._max_padding)

        self._session.commit()
        self.info('Completed sample {}'.format(sample.name))