Esempio n. 1
0
def read_input(path):
    """Parse a sequence file into a list of ``VDJSequence`` objects.

    The file is read as FASTA when ``path`` ends with ``.fasta`` and as FASTQ
    otherwise.  One ``VDJSequence`` is created per record; identical sequences
    are NOT merged here -- every successfully built record is kept.

    Records for which building the ``VDJSequence`` raises ``ValueError`` are
    skipped silently.

    :param path: Path of the FASTA/FASTQ file to read.
    :returns: List of ``VDJSequence`` objects in file order.
    """
    if path.endswith('.fasta'):
        file_format = 'fasta'
    else:
        file_format = 'fastq'
    records = SeqIO.parse(path, file_format)

    logger.info('Parsing input')
    parsed = []
    for rec in records:
        try:
            identifier = rec.description
            bases = str(rec.seq)
            # FASTA records carry no 'phred_quality' annotation, so .get()
            # hands None to the conversion helper in that case.
            phred = rec.letter_annotations.get('phred_quality')
            quality = funcs.ord_to_quality(phred)
            parsed.append(
                VDJSequence(seq_id=identifier, sequence=bases,
                            quality=quality))
        except ValueError:
            # Best-effort parsing: drop the offending record and move on.
            continue

    logger.info('There are {} sequences'.format(len(parsed)))
    return parsed
def read_input(path):
    """Read every record of a FASTA/FASTQ file as a ``VDJSequence``.

    The format handed to ``SeqIO.parse`` is ``'fasta'`` when ``path`` ends
    with ``.fasta`` and ``'fastq'`` for anything else.  No de-duplication is
    performed: each record that can be converted yields one list entry.

    :param path: Location of the sequence file.
    :returns: List of ``VDJSequence`` objects, in the order they were read.
        Records whose conversion raises ``ValueError`` are left out.
    """
    def _convert(record):
        # Returns None for records that cannot be converted so the caller
        # can filter them out.
        try:
            return VDJSequence(
                seq_id=record.description,
                sequence=str(record.seq),
                quality=funcs.ord_to_quality(
                    record.letter_annotations.get('phred_quality')
                )
            )
        except ValueError:
            return None

    fmt = 'fasta' if path.endswith('.fasta') else 'fastq'
    parser = SeqIO.parse(path, fmt)

    logger.info('Parsing input')
    converted = (_convert(record) for record in parser)
    vdjs = [vdj for vdj in converted if vdj is not None]

    logger.info('There are {} sequences'.format(len(vdjs)))
    return vdjs
Esempio n. 3
0
    def do_task(self, args):
        """Process one sample file: collapse, align and store its sequences.

        Steps, in order:

        1. Read ``args['path']`` (FASTA if the path ends in ``.fasta``,
           otherwise FASTQ) and group records by their exact sequence string,
           collecting the descriptions of all records sharing a sequence.
        2. Align each unique sequence with an ``AnchorAligner``.  Alignments
           that produce the same aligned sequence are merged by extending the
           id list of the first one seen.  ``AlignmentException`` results are
           recorded via ``add_as_noresult``; any other exception is logged
           and the sequence is dropped.
        3. If anything aligned, store the mean V length and mean V mutation
           fraction on the sample and pass the alignments to ``add_uniques``.
        4. Commit the session.

        :param args: Dict with a ``'meta'`` entry (passed to
            ``self._setup_sample``; must contain ``'sample_name'``) and a
            ``'path'`` entry naming the input file.
        """
        meta = args['meta']
        self.info('Starting sample {}'.format(meta['sample_name']))
        _study, sample = self._setup_sample(meta)

        in_path = args['path']
        if in_path.endswith('.fasta'):
            in_format = 'fasta'
        else:
            in_format = 'fastq'
        unique_reads = {}
        reader = SeqIO.parse(in_path, in_format)

        # Group records that share the exact same raw sequence.
        self.info('\tCollapsing identical sequences')
        for rec in reader:
            try:
                raw = str(rec.seq)
                entry = unique_reads.get(raw)
                if entry is None:
                    # The quality of the first record seen for a sequence is
                    # the one that is kept.
                    entry = VDJSequence(
                        ids=[],
                        sequence=raw,
                        quality=funcs.ord_to_quality(
                            rec.letter_annotations.get('phred_quality')))
                    unique_reads[raw] = entry
                entry.ids.append(rec.description)
            except ValueError:
                continue

        aligned = {}
        aligner = AnchorAligner(self._v_germlines, self._j_germlines)
        self.info('\tAligning {} unique sequences'.format(len(unique_reads)))
        # Walk the unique sequences in sorted order, removing each from the
        # pending dict as it is taken.
        pending_keys = sorted(unique_reads)
        for raw in funcs.periodic_commit(self._session, pending_keys):
            vdj = unique_reads.pop(raw)
            try:
                result = aligner.get_alignment(vdj)
                aligned_seq = result.sequence.sequence
                previous = aligned.get(aligned_seq)
                if previous is None:
                    # First time this aligned sequence is seen.
                    aligned[aligned_seq] = result
                else:
                    # Same aligned sequence as an earlier read: merge ids.
                    previous.sequence.ids.extend(result.sequence.ids)
            except AlignmentException as e:
                add_as_noresult(self._session, vdj, sample, str(e))
            except Exception:
                self.error(
                    '\tUnexpected error processing sequence {}\n\t{}'.format(
                        vdj.ids[0], traceback.format_exc()))

        if aligned:
            total = float(len(aligned))
            avg_len = sum(a.v_length for a in aligned.values()) / total
            avg_mut = sum(
                a.v_mutation_fraction for a in aligned.values()) / total
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len

            self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                      'Length={}'.format(len(aligned), round(avg_mut, 2),
                                         round(avg_len, 2)))
            add_uniques(self._session, sample, aligned.values(),
                        self._props, aligner, avg_len, avg_mut)

        self._session.commit()
        self.info('Completed sample {}'.format(sample.name))
Esempio n. 4
0
    def do_task(self, args):
        """Process one sample file: collapse, analyze and store its sequences.

        Steps, in order:

        1. Read the file ``args['fn']`` inside directory ``args['path']``
           (FASTA if the file name ends in ``.fasta``, otherwise FASTQ) and
           group records by their exact sequence string, collecting the
           descriptions of all records sharing a sequence.
        2. Call ``analyze()`` on each unique ``VDJSequence``.  Results whose
           ``sequence`` attribute is equal after analysis are merged by
           concatenating their id lists.  ``AlignmentException`` results are
           recorded via ``add_as_noresult``; any other ``Exception`` is
           logged and the sequence is dropped.
        3. If anything was analyzed successfully, store the mean ``v_length``
           and mean ``mutation_fraction`` on the sample and pass the results
           to ``add_uniques``.
        4. Commit the session.

        :param args: Dict with a ``'meta'`` entry (passed to
            ``self._setup_sample``), a ``'path'`` entry (directory) and a
            ``'fn'`` entry (file name).
        """
        meta = args['meta']
        self.info('Starting sample {}'.format(meta.get('sample_name')))
        _study, sample = self._setup_sample(meta)

        vdjs = {}
        parser = SeqIO.parse(
            os.path.join(args['path'], args['fn']),
            'fasta' if args['fn'].endswith('.fasta') else 'fastq')

        # Collapse identical sequences
        self.info('\tCollapsing identical sequences')
        for record in parser:
            seq = str(record.seq)
            if seq not in vdjs:
                # The quality of the first record seen for a sequence is the
                # one that is kept.
                vdjs[seq] = VDJSequence(
                    ids=[],
                    seq=seq,
                    v_germlines=self._v_germlines,
                    j_germlines=self._j_germlines,
                    quality=funcs.ord_to_quality(
                        record.letter_annotations.get('phred_quality')))
            vdjs[seq].ids.append(record.description)

        self.info('\tAligning {} unique sequences'.format(len(vdjs)))
        # Analyzed results live in their own dict.  Storing them back into
        # `vdjs` would mix them with raw reads still waiting to be processed,
        # so an analyzed sequence equal to a pending raw key would be merged
        # into the un-analyzed entry instead of being kept.
        aligned = {}
        # Attempt to align all unique sequences
        for sequence in funcs.periodic_commit(self._session, sorted(vdjs)):
            # Drop the raw entry as soon as it is taken.
            vdj = vdjs.pop(sequence)
            try:
                # The alignment was successful.  If the aligned sequence
                # already exists, append the seq_ids.  Otherwise add it as a
                # new unique sequence.
                vdj.analyze()
                if vdj.sequence in aligned:
                    aligned[vdj.sequence].ids += vdj.ids
                else:
                    aligned[vdj.sequence] = vdj
            except AlignmentException as e:
                add_as_noresult(self._session, vdj, sample, str(e))
            except Exception:
                # Deliberately not a bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate and can stop the worker.
                self.error(
                    '\tUnexpected error processing sequence {}\n\t{}'.format(
                        vdj.ids[0], traceback.format_exc()))
        if aligned:
            total = float(len(aligned))
            avg_len = sum(v.v_length for v in aligned.values()) / total
            avg_mut = sum(v.mutation_fraction
                          for v in aligned.values()) / total
            sample.v_ties_mutations = avg_mut
            sample.v_ties_len = avg_len

            self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                      'Length={}'.format(len(aligned), round(avg_mut, 2),
                                         round(avg_len, 2)))
            add_uniques(self._session, sample, aligned.values(), avg_len,
                        avg_mut, self._min_similarity, self._max_vties,
                        self._trim_to, self._max_padding)

        self._session.commit()
        self.info('Completed sample {}'.format(sample.name))