Ejemplo n.º 1
0
def import_template(session, in_file, regen):
    """Assign sequences to clones as described by a tab-delimited file.

    Every row of ``in_file`` is read with ``csv.DictReader``.  Rows whose
    ``clone_id`` is ``'None'``, ``'0'`` or empty are skipped.  For each
    remaining distinct ``clone_id`` a new ``Clone`` is created from the
    ``subject_id``, ``v_gene``, ``j_gene`` and ``cdr3_num_nts`` columns of
    the first row carrying that id, and every ``Sequence`` identified by
    (``sample_id``, ``ai``) in those rows is updated to point at it.
    Afterwards ``generate_consensus`` is run on the new clone ids and
    ``push_clone_ids`` is called.

    :param session: database session used for all queries and updates
    :param in_file: path of the tab-delimited input file
    :param regen: when true, all ``Clone``, ``CloneStats`` and
        ``SampleStats`` rows are deleted (and committed) first
    :raises ImportException: if the file has no ``ai``/``clone_id`` header
        (including an empty file), or if rows sharing a ``clone_id``
        disagree on subject, V gene, J gene or CDR3 length
    """
    if regen:
        session.query(Clone).delete()
        session.query(CloneStats).delete()
        session.query(SampleStats).delete()
        session.commit()

    seen_clones = {}
    with open(in_file) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        # ``fieldnames`` is None when the file is empty; treat that like a
        # missing header instead of failing with a TypeError on ``in``.
        fieldnames = reader.fieldnames or ()
        if 'ai' not in fieldnames or 'clone_id' not in fieldnames:
            raise ImportException(
                'Input file must have "ai" and "clone_id" fields')
        for line in reader:
            clone_id = line['clone_id']
            if clone_id in ('None', '0', ''):
                continue

            subject_id = int(line['subject_id'])
            v_gene = line['v_gene']
            j_gene = line['j_gene']
            cdr3_num_nts = int(line['cdr3_num_nts'])

            # Only build a Clone the first time an id is encountered rather
            # than constructing (and discarding) one for every row.
            clone_info = seen_clones.get(clone_id)
            if clone_info is None:
                clone_info = seen_clones[clone_id] = {
                    'clone': Clone(subject_id=subject_id,
                                   v_gene=v_gene,
                                   j_gene=j_gene,
                                   cdr3_num_nts=cdr3_num_nts),
                    'seqs': []
                }

            clone_inst = clone_info['clone']
            if (clone_inst.v_gene != v_gene
                    or clone_inst.j_gene != j_gene
                    or clone_inst.cdr3_num_nts != cdr3_num_nts
                    or clone_inst.subject_id != subject_id):
                raise ImportException(
                    'Sequence with ai {} was assigned to clone {} with '
                    'mismatching v_gene, j_gene, cdr3_num_nts, or '
                    'subject.'.format(line['ai'], line['clone_id']))
            clone_info['seqs'].append({
                'ai': int(line['ai']),
                'sample_id': int(line['sample_id'])
            })
        session.commit()

    db_clone_ids = set()
    # Clones are inserted in sorted order of their (string) file ids.
    for _, clone_info in sorted(seen_clones.items()):
        clone_inst = clone_info['clone']
        session.add(clone_inst)
        # Flush so the database assigns ``clone_inst.id`` before it is used.
        session.flush()
        db_clone_ids.add(clone_inst.id)

        to_update = [{
            'sample_id': s['sample_id'],
            'ai': s['ai'],
            'clone_id': clone_inst.id
        } for s in clone_info['seqs']]
        session.bulk_update_mappings(Sequence, to_update)
    session.commit()
    generate_consensus(session, db_clone_ids)
    push_clone_ids(session)
Ejemplo n.º 2
0
 def assign_clones(self, df):
     """Create one clone for ``df`` and attach all of its rows to it.

     The clone's subject, V gene, J gene and CDR3 length are taken from the
     first row of ``df``.  Every row's (``sample_id``, ``ai``) pair is then
     used to set ``clone_id`` on the matching ``Sequence``.

     :param df: data frame with ``subject_id``, ``v_gene``, ``j_gene``,
         ``cdr3_num_nts``, ``sample_id`` and ``ai`` columns
     :returns: the id of the newly created clone as an ``int``
     """
     first_row = df.iloc[0]
     clone = Clone(subject_id=int(first_row.subject_id),
                   v_gene=first_row.v_gene,
                   j_gene=first_row.j_gene,
                   cdr3_num_nts=int(first_row.cdr3_num_nts))

     session = self.session
     session.add(clone)
     # Flush so that the clone's id is populated before it is referenced.
     session.flush()
     clone_id = clone.id

     mappings = []
     for _, row in df.iterrows():
         mappings.append({
             'sample_id': row.sample_id,
             'ai': row.ai,
             'clone_id': clone_id
         })
     session.bulk_update_mappings(Sequence, mappings)
     return int(clone_id)
Ejemplo n.º 3
0
    def bcell_clones(self, bucket):
        """Assign the unassigned sequences of ``bucket`` to clones.

        Sequences returned by ``self.get_query(bucket, True)`` are grouped
        by their current ``clone_id``.  Each sequence without a clone is
        added to the first existing group for which ``similar_to_all``
        holds (using ``self._min_similarity``); if none matches, a new
        ``Clone`` is created for it.  Newly assigned sequences are written
        back with a bulk update and ``generate_consensus`` is run on every
        clone that received sequences.  The session is committed and
        progress is logged after every 100th call.

        :param bucket: bucket passed through to ``self.get_query``
        """
        clones = OrderedDict()
        consensus_needed = set()
        query = self.get_query(bucket, True)

        if query.count() > 0:
            for seq in query:
                clones.setdefault(seq.clone_id, []).append(seq)

            if None in clones:
                for seq_to_add in clones[None]:
                    # ``items()`` rather than the Python 2-only
                    # ``iteritems()``; the dict is not modified while this
                    # inner loop is running.
                    for clone_id, existing_seqs in clones.items():
                        if clone_id is None:
                            continue
                        if similar_to_all(seq_to_add, existing_seqs,
                                          self._min_similarity):
                            existing_seqs.append(seq_to_add)
                            break
                    else:
                        # No existing clone matched.  Build the clone from
                        # the sequence being placed, not from the leftover
                        # ``seq`` variable of the grouping loop above.
                        new_clone = Clone(
                            subject_id=seq_to_add.subject_id,
                            v_gene=seq_to_add.v_gene,
                            j_gene=seq_to_add.j_gene,
                            cdr3_num_nts=seq_to_add.cdr3_num_nts,
                            _insertions=seq_to_add._insertions,
                            _deletions=seq_to_add._deletions)
                        self._session.add(new_clone)
                        # Flush so ``new_clone.id`` is available as a key.
                        self._session.flush()
                        clones[new_clone.id] = [seq_to_add]
                del clones[None]

            for clone_id, seqs in clones.items():
                # Only sequences that had no clone need to be written back.
                to_update = [
                    {
                        'sample_id': s.sample_id,
                        'ai': s.ai,
                        'clone_id': clone_id
                    } for s in seqs if s.clone_id is None
                ]
                if to_update:
                    self._session.bulk_update_mappings(Sequence, to_update)
                    consensus_needed.add(clone_id)
        generate_consensus(self._session, consensus_needed)
        self._tasks += 1
        if self._tasks % 100 == 0:
            self._session.commit()
            self.info('Collapsed {} buckets'.format(self._tasks))
Ejemplo n.º 4
0
    def run_bucket(self, bucket):
        """Split the sequences of ``bucket`` into clones via a tree cut.

        A ``PhylogeneticTree`` is built from the bucket's sequences and a
        germline whose CDR3 region is replaced with dashes.  Each subtree
        yielded by ``cut_tree(phylo.tree, self.mut_cutoff)`` becomes a new
        ``Clone`` and the sequences reported by ``get_seq_pks`` for that
        subtree are assigned to it.  ``generate_consensus`` is always called
        at the end with the ids of the clones created here.

        :param bucket: object providing ``subject_id``, ``v_gene``,
            ``j_gene``, ``cdr3_num_nts``, ``_insertions`` and ``_deletions``
        """
        pending_updates = []
        created_clone_ids = set()

        bucket_seqs = self.get_bucket_seqs(bucket, sort=False)
        if bucket_seqs.count() > 0:
            # The CDR3 start is shifted by the summed second element of
            # every insertion entry on the bucket.
            cdr3_begin = CDR3_OFFSET
            if bucket._insertions:
                cdr3_begin += sum(ins[1] for ins in bucket._insertions)
            cdr3_end = cdr3_begin + bucket.cdr3_num_nts

            # Germline is taken from the first sequence; its CDR3 is masked.
            base_germline = bucket_seqs[0].germline
            masked_germline = ''.join([
                base_germline[:cdr3_begin],
                '-' * bucket.cdr3_num_nts,
                base_germline[cdr3_end:],
            ])

            phylo = PhylogeneticTree(
                masked_germline, bucket_seqs,
                min_mut_occurrence=self.min_mut_occurrence,
                min_mut_samples=self.min_mut_samples,
            )
            phylo.run(self.session, self.clearcut_path)

            for subtree in cut_tree(phylo.tree, self.mut_cutoff):
                clone = Clone(
                    subject_id=bucket.subject_id,
                    v_gene=bucket.v_gene,
                    j_gene=bucket.j_gene,
                    cdr3_num_nts=bucket.cdr3_num_nts,
                    _insertions=bucket._insertions,
                    _deletions=bucket._deletions
                )
                self.session.add(clone)
                # Flush so the clone id exists before it is referenced.
                self.session.flush()
                clone_id = clone.id
                created_clone_ids.add(clone_id)
                for pk in get_seq_pks(subtree):
                    pending_updates.append({
                        'sample_id': pk[0],
                        'ai': pk[1],
                        'clone_id': clone_id
                    })

        if pending_updates:
            self.session.bulk_update_mappings(Sequence, pending_updates)
        generate_consensus(self.session, created_clone_ids)
Ejemplo n.º 5
0
    def run_bucket(self, bucket):
        """Assign the unassigned sequences of ``bucket`` to clones.

        Sequences returned by ``self.get_bucket_seqs(bucket, sort=True)``
        are grouped by their current ``clone_id``.  Each sequence without a
        clone joins the first existing group for which ``similar_to_all``
        holds (using ``self.level`` and ``self.min_similarity``); otherwise
        a new ``Clone`` is created for it.  Newly assigned sequences are
        written back with a bulk update and ``generate_consensus`` is run on
        every clone that received sequences.

        :param bucket: bucket passed through to ``self.get_bucket_seqs``
        """
        clones = OrderedDict()
        consensus_needed = set()
        query = self.get_bucket_seqs(bucket, sort=True)

        if query.count() > 0:
            for seq in query:
                clones.setdefault(seq.clone_id, []).append(seq)

            if None in clones:
                for seq_to_add in clones[None]:
                    for clone_id, existing_seqs in clones.items():
                        if clone_id is None:
                            continue
                        if similar_to_all(seq_to_add, existing_seqs,
                                          self.level, self.min_similarity):
                            existing_seqs.append(seq_to_add)
                            break
                    else:
                        # No existing clone matched.  Build the clone from
                        # the sequence being placed, not from the leftover
                        # ``seq`` variable of the grouping loop above.
                        new_clone = Clone(
                            subject_id=seq_to_add.subject_id,
                            v_gene=seq_to_add.v_gene,
                            j_gene=seq_to_add.j_gene,
                            cdr3_num_nts=seq_to_add.cdr3_num_nts)
                        self.session.add(new_clone)
                        # Flush so ``new_clone.id`` is available as a key.
                        self.session.flush()
                        clones[new_clone.id] = [seq_to_add]
                del clones[None]

            for clone_id, seqs in clones.items():
                # Only sequences that had no clone need to be written back.
                to_update = [
                    {
                        'sample_id': s.sample_id,
                        'ai': s.ai,
                        'clone_id': clone_id
                    } for s in seqs if s.clone_id is None
                ]
                if to_update:
                    self.session.bulk_update_mappings(Sequence, to_update)
                    consensus_needed.add(clone_id)
        generate_consensus(self.session, consensus_needed)
Ejemplo n.º 6
0
    def run_bucket(self, bucket):
        """Split the sequences of ``bucket`` into clones via a lineage tree.

        A tree is obtained from ``LineageWorker.get_tree`` using the
        bucket's sequences and a germline whose CDR3 region is replaced with
        dashes.  Each subtree yielded by ``cut_tree(tree, self.mut_cutoff)``
        becomes a new ``Clone`` and the sequences reported by
        ``get_seq_pks`` for that subtree are assigned to it.
        ``generate_consensus`` is always called at the end with the ids of
        the clones created here.

        :param bucket: object providing ``subject_id``, ``v_gene``,
            ``j_gene``, ``cdr3_num_nts``, ``_insertions`` and ``_deletions``
        """
        pending_updates = []
        created_clone_ids = set()

        bucket_seqs = self.get_bucket_seqs(bucket, sort=False)
        if bucket_seqs.count() > 0:
            # The CDR3 start is shifted by the summed second element of
            # every insertion entry on the bucket.
            cdr3_begin = CDR3_OFFSET
            if bucket._insertions:
                cdr3_begin += sum(ins[1] for ins in bucket._insertions)
            cdr3_end = cdr3_begin + bucket.cdr3_num_nts

            # Germline is taken from the first sequence; its CDR3 is masked.
            base_germline = bucket_seqs[0].germline
            masked_germline = ''.join([
                base_germline[:cdr3_begin],
                '-' * bucket.cdr3_num_nts,
                base_germline[cdr3_end:],
            ])

            worker = LineageWorker(self.session,
                                   clearcut.get_newick,
                                   self.min_mut_copies,
                                   self.min_mut_samples,
                                   exclude_stops=False,
                                   post_tree_hook=clearcut.minimize_tree)
            tree = worker.get_tree(masked_germline, bucket_seqs)

            for subtree in cut_tree(tree, self.mut_cutoff):
                clone = Clone(subject_id=bucket.subject_id,
                              v_gene=bucket.v_gene,
                              j_gene=bucket.j_gene,
                              cdr3_num_nts=bucket.cdr3_num_nts,
                              _insertions=bucket._insertions,
                              _deletions=bucket._deletions)
                self.session.add(clone)
                # Flush so the clone id exists before it is referenced.
                self.session.flush()
                clone_id = clone.id
                created_clone_ids.add(clone_id)
                for pk in get_seq_pks(subtree):
                    pending_updates.append({
                        'sample_id': pk[0],
                        'ai': pk[1],
                        'clone_id': clone_id
                    })

        if pending_updates:
            self.session.bulk_update_mappings(Sequence, pending_updates)
        generate_consensus(self.session, created_clone_ids)
Ejemplo n.º 7
0
    def tcell_clones(self, bucket):
        """Group the sequences of ``bucket`` into clones by V, J and CDR3.

        Each sequence from ``self.get_query(bucket, False)`` is assigned to
        the clone already recorded for its exact
        (``v_gene``, ``j_gene``, ``cdr3_nt``) key.  Failing that, it joins
        the first recorded clone with the same V gene, J gene and CDR3
        length whose ``cdr3_nt`` compares equal under ``dnautils.equal``.
        If no such clone exists a new ``Clone`` is created.  All sequences
        are then updated in bulk and ``generate_consensus`` is run on the
        ids of the clones created here.

        :param bucket: bucket passed through to ``self.get_query``
        """
        updates = []
        clones = OrderedDict()
        consensus_needed = set()

        for seq in self.get_query(bucket, False):
            key = (seq.v_gene, seq.j_gene, seq.cdr3_nt)
            clone = clones.get(key)

            if clone is None:
                # No exact key match: look for a compatible recorded clone.
                v_gene, j_gene, cdr3_nt = key
                clone = next(
                    (known for known in clones.values()
                     if known.v_gene == v_gene
                     and known.j_gene == j_gene
                     and known.cdr3_num_nts == len(cdr3_nt)
                     and dnautils.equal(known.cdr3_nt, cdr3_nt)),
                    None)

            if clone is None:
                clone = Clone(subject_id=seq.subject_id,
                              v_gene=seq.v_gene,
                              j_gene=seq.j_gene,
                              cdr3_nt=seq.cdr3_nt,
                              cdr3_num_nts=seq.cdr3_num_nts,
                              _insertions=seq._insertions,
                              _deletions=seq._deletions)
                clones[key] = clone
                self._session.add(clone)
                # Flush so the clone id exists before it is referenced.
                self._session.flush()
                consensus_needed.add(clone.id)

            updates.append({
                'sample_id': seq.sample_id,
                'ai': seq.ai,
                'clone_id': clone.id
            })

        if updates:
            self._session.bulk_update_mappings(Sequence, updates)
        generate_consensus(self._session, consensus_needed)