コード例 #1
0
 def OutputStats(self, output_fname):
     """Write a tab-separated summary of all clonal lineages to a file.

     A header row is written first, followed by one row per lineage in
     ``self.clonal_lineages``, ordered by decreasing ``len(lineage)``.
     Each row holds, in order: the lineage id, its length, the number of
     non-trivial sequences, the highest multiplicity, the abundant V and J
     base gene names, the root sequence id, the root sequence, the root
     CDR3 and the number of V plus J SHMs outside CDR3 in the root.

     Args:
         output_fname: path of the file to create or overwrite.
     """
     header = ('LineageID\tLineageSizeBeforeCleaning\tNumNonTrivialSeqs\t'
               'MaxMultiplicity\tClosestV\tClosestJ\tRootId\tRootSeq\t'
               'RootCDR3\tRootDistanceFromGermline\n')
     # The context manager closes the file even when one of the per-lineage
     # computations below raises, so no handle is leaked.
     with open(output_fname, 'w') as fh:
         fh.write(header)
         for lineage in sorted(self.clonal_lineages, key=len, reverse=True):
             vj_ann = vj_annotator.VJGeneAnnotator(lineage)
             abundant_v = utils.GetBaseGeneName(
                 vj_ann.GetAbundantGene(dataset.AnnotatedGene.V))
             abundant_j = utils.GetBaseGeneName(
                 vj_ann.GetAbundantGene(dataset.AnnotatedGene.J))
             root_seq = self._GetRootSeq(lineage, abundant_v)
             num_shms_in_root = len(
                 self.dataset.GetVSHMsOutsideCDR3(root_seq.id)) + len(
                     self.dataset.GetJSHMsOutsideCDR3(root_seq.id))
             fields = [
                 lineage.id(),
                 len(lineage),
                 self._GetNumberNonTrivialSequences(lineage),
                 self._GetHighestMultiplicity(lineage),
                 abundant_v,
                 abundant_j,
                 root_seq.id,
                 root_seq.seq,
                 self.dataset.GetCDR3BySeqName(root_seq.id),
                 num_shms_in_root,
             ]
             # Every field goes through str() so the row is always plain
             # text, whatever type the individual accessors return.
             fh.write('\t'.join(str(field) for field in fields) + '\n')
コード例 #2
0
 def SequenceIsGood(self, seq):
     """Check a sequence's V and J gene hits against the stored genes.

     The V and J gene hits of ``seq.id`` are looked up in the dataset of
     ``self.full_length_lineage``; their base gene names are compared with
     ``self.most_freq_v`` and ``self.most_freq_j`` respectively.

     Args:
         seq: object exposing an ``id`` attribute known to the dataset.

     Returns:
         A truthy value only when both base gene names match.
     """
     lineage = self.full_length_lineage
     v_hit = lineage.Dataset().GetGeneHitBySeqName(seq.id,
                                                   dataset.AnnotatedGene.V)
     j_hit = lineage.Dataset().GetGeneHitBySeqName(seq.id,
                                                   dataset.AnnotatedGene.J)
     v_matches = utils.GetBaseGeneName(v_hit) == self.most_freq_v
     if not v_matches:
         # The J base name is not computed when V already disagrees.
         return v_matches
     return utils.GetBaseGeneName(j_hit) == self.most_freq_j
コード例 #3
0
 def _GetRootSeq(self, lineage, abundant_v):
     """Pick the lineage sequence with the fewest SHMs outside CDR3.

     Only sequences whose V gene hit has base name ``abundant_v`` are
     considered. The SHM count of a sequence is
     ``len(GetVSHMsOutsideCDR3) + len(GetJSHMsOutsideCDR3)`` from
     ``self.dataset``. A sequence whose translation contains no ``'*'`` is
     preferred over one that does, even if the latter has fewer SHMs. On
     ties the sequence seen first during iteration wins.

     Args:
         lineage: object providing ``FullLengthSeqIdIter()``.
         abundant_v: base V gene name the candidates must match.

     Returns:
         The best candidate without ``'*'`` in its translation if one
         exists, otherwise the best candidate overall, otherwise an empty
         ``SeqRecord`` whose ``id`` is ``''``.
     """
     # float('inf') compares greater than any count and, unlike
     # sys.maxint, is available on both Python 2 and Python 3.
     num_shms_all = float('inf')
     root_seq_all = SeqRecord(Seq(''), id='')
     num_shms_prod = float('inf')
     root_seq_prod = SeqRecord(Seq(''), id='')
     for seq in lineage.FullLengthSeqIdIter():
         v_base = utils.GetBaseGeneName(
             self.dataset.GetGeneHitBySeqName(seq.id,
                                              dataset.AnnotatedGene.V))
         if v_base != abundant_v:
             continue
         cur_num_shms = len(self.dataset.GetVSHMsOutsideCDR3(seq.id)) + len(
             self.dataset.GetJSHMsOutsideCDR3(seq.id))
         if cur_num_shms < num_shms_all:
             num_shms_all = cur_num_shms
             root_seq_all = seq
         # The whole sequence is translated; a '*' in the result excludes
         # the sequence from the preferred candidates.
         aa_seq = str(Seq(seq.seq).translate())
         if '*' in aa_seq:
             continue
         if cur_num_shms < num_shms_prod:
             num_shms_prod = cur_num_shms
             root_seq_prod = seq
     if root_seq_prod.id != '':
         return root_seq_prod
     return root_seq_all
コード例 #4
0
ファイル: vj_annotator.py プロジェクト: mona7aq/immunotools
 def _UpdateGeneDict(self, gene_type, lineage):
     """Record the SHMs of a lineage's root sequence under its base gene.

     The gene hit of the lineage root (for ``gene_type``) is reduced to a
     base gene name, and the root's SHMs for that gene type are appended
     to the list kept in ``self.gene_usage[gene_type][base_name]``.

     Args:
         gene_type: key into ``self.gene_usage`` and the gene type passed
             to the dataset lookups.
         lineage: object providing ``RootSeqId()`` and ``Dataset()``.
     """
     root_id = lineage.RootSeqId()
     root_hit = lineage.Dataset().GetGeneHitBySeqName(root_id, gene_type)
     base_name = utils.GetBaseGeneName(root_hit)
     # setdefault creates the per-gene list on first use, before the SHMs
     # are fetched.
     self.gene_usage[gene_type].setdefault(base_name, []).append(
         lineage.Dataset().GetSHMsBySeqName(root_id, gene_type))
コード例 #5
0
ファイル: vj_annotator.py プロジェクト: mona7aq/immunotools
 def _FindMostAbundantGene(self, gene_type):
     """Count full-length sequences per base gene name for one gene type.

     For every sequence yielded by
     ``self.full_length_lineage.FullLengthSeqIdIter()`` the gene hit of
     ``gene_type`` is reduced to its base gene name. The resulting
     ``Counter`` (base gene name -> number of sequences) is stored in
     ``self.gene_type_mults[gene_type]``; nothing is returned.

     Args:
         gene_type: gene type passed to the dataset lookup and used as the
             key in ``self.gene_type_mults``.
     """
     lineage = self.full_length_lineage
     # Feed the Counter directly instead of building a throwaway list.
     self.gene_type_mults[gene_type] = Counter(
         utils.GetBaseGeneName(
             lineage.Dataset().GetGeneHitBySeqName(seq.id, gene_type))
         for seq in lineage.FullLengthSeqIdIter())
コード例 #6
0
 def GetIdentifiedGenes(self):
     """Report which D genes were identified and which were not.

     A base gene name counts as found when some entry of
     ``self.classifications`` has exactly one gene id that reduces to it.
     Every base name derived from ``self.d_genes`` that was not found is
     reported as missing. Both groups are printed in sorted order.

     Returns:
         A pair ``(found_d_genes, missing_d_genes)`` of sets of base gene
         names.
     """
     found_d_genes = set()
     for c in self.classifications:
         # Classifications with zero or several gene ids are skipped.
         if len(c.gene_ids) == 1:
             found_d_genes.add(utils.GetBaseGeneName(c.gene_ids[0]))
     all_d_bases = set(utils.GetBaseGeneName(d.id) for d in self.d_genes)
     missing_d_genes = all_d_bases - found_d_genes
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print(str(len(found_d_genes)) + " D genes are identified: " +
           ','.join(sorted(found_d_genes)))
     # Report the size of the set that is actually listed. The previous
     # len(self.d_genes) - len(found_d_genes) counted entries of d_genes
     # rather than distinct base names, so the number could disagree with
     # the names printed next to it.
     print(str(len(missing_d_genes)) + ' D genes are missing: ' +
           ','.join(sorted(missing_d_genes)))
     return found_d_genes, missing_d_genes
コード例 #7
0
ファイル: vj_annotator.py プロジェクト: shulp2211/immunotools
 def _FindMostAbundantGene(self, gene_type):
     """Tally full-length sequences by base gene name for one gene type.

     The tally (a plain dict mapping base gene name to number of
     sequences) is stored in ``self.gene_type_mults[gene_type]``; nothing
     is returned.

     Args:
         gene_type: gene type passed to the dataset lookup and used as the
             key in ``self.gene_type_mults``.
     """
     counts = {}
     for record in self.full_length_lineage.FullLengthSeqIdIter():
         hit = self.full_length_lineage.Dataset().GetGeneHitBySeqName(
             record.id, gene_type)
         name = utils.GetBaseGeneName(hit)
         counts[name] = counts.get(name, 0) + 1
     self.gene_type_mults[gene_type] = counts
コード例 #8
0
    def _FindMostAbundantGene(self, gene_type):
        """Return the base gene name shared by the most full-length sequences.

        Each sequence yielded by
        ``self.full_length_lineage.FullLengthSeqIdIter()`` contributes one
        count to the base name of its ``gene_type`` gene hit. When several
        names share the highest count, the one that comes first in the
        dict's iteration order is returned.

        Args:
            gene_type: gene type passed to the dataset lookup.

        Returns:
            The base gene name with the highest count.

        Raises:
            ValueError: if the lineage yields no sequences, because
                ``max`` is called on an empty dict.
        """
        gene_dict = dict()  # base gene name -> num sequences
        for seq in self.full_length_lineage.FullLengthSeqIdIter():
            gene_name = self.full_length_lineage.Dataset().GetGeneHitBySeqName(
                seq.id, gene_type)
            base_name = utils.GetBaseGeneName(gene_name)
            gene_dict[base_name] = gene_dict.get(base_name, 0) + 1
        # One pass over the keys; unlike iteritems() this also works on
        # Python 3, and it picks the same winner.
        return max(gene_dict, key=gene_dict.get)
コード例 #9
0
 def _CreateSubstringSegmentClassification(self, segment_type, segment,
                                           d_indices):
     """Build a DSegmentClassification for a segment and its D gene indices.

     For ``SegmentType.AMBIGUOUS_GENE`` the segment is passed through
     unchanged with an empty string as third argument, and the base gene
     names of all indexed D genes as gene ids. For any other type, the
     first indexed D gene is used: the segment is replaced by the result
     of ``self._GetAlignmentSeqForSubstring`` against that gene's
     sequence, and the gene's full id is the only gene id.

     Args:
         segment_type: a ``SegmentType`` value.
         segment: the segment to classify.
         d_indices: indices into ``self.d_genes``.

     Returns:
         A ``DSegmentClassification`` instance.
     """
     if segment_type == SegmentType.AMBIGUOUS_GENE:
         candidate_names = []
         for ind in d_indices:
             candidate_names.append(
                 utils.GetBaseGeneName(self.d_genes[ind].id))
         return DSegmentClassification(segment_type, segment, '',
                                       candidate_names)
     primary_gene = self.d_genes[d_indices[0]]
     aligned_segment = self._GetAlignmentSeqForSubstring(
         segment, primary_gene.seq)
     return DSegmentClassification(segment_type, aligned_segment,
                                   primary_gene.seq, [primary_gene.id])
コード例 #10
0
 def _GetRootSeq(self, lineage, abundant_v):
     """Pick the lineage sequence with the fewest SHMs outside CDR3.

     Only sequences whose V gene hit has base name ``abundant_v`` are
     considered. The SHM count of a sequence is
     ``len(GetVSHMsOutsideCDR3) + len(GetJSHMsOutsideCDR3)`` from
     ``self.dataset``. On ties the sequence seen first during iteration
     wins.

     Args:
         lineage: object providing ``FullLengthSeqIdIter()``.
         abundant_v: base V gene name the candidates must match.

     Returns:
         The selected sequence object, or the empty string ``''`` when no
         sequence matches ``abundant_v``.
         NOTE(review): the ``''`` fallback has no ``id`` attribute, so
         callers that read ``.id`` on the result need to handle it.
     """
     # float('inf') compares greater than any count and, unlike
     # sys.maxint, is available on both Python 2 and Python 3.
     num_shms = float('inf')
     root_seq = ''
     for seq in lineage.FullLengthSeqIdIter():
         v_base = utils.GetBaseGeneName(
             self.dataset.GetGeneHitBySeqName(seq.id,
                                              dataset.AnnotatedGene.V))
         if v_base != abundant_v:
             continue
         cur_num_shms = len(self.dataset.GetVSHMsOutsideCDR3(seq.id)) + len(
             self.dataset.GetJSHMsOutsideCDR3(seq.id))
         if cur_num_shms < num_shms:
             num_shms = cur_num_shms
             root_seq = seq
     return root_seq
コード例 #11
0
 def _SegmentIsAmbiguousGene(self, d_indices):
     d_base_names = set(
         [utils.GetBaseGeneName(self.d_genes[ind].id) for ind in d_indices])
     return len(d_base_names) > 1