Exemple #1
0
def set_fwr4_columns(record, database):
    """
    Fill in the FR4 columns of *record* in place.

    Nothing is changed when the record has no J call, when its locus is not
    in ALLOWED_LOCI, when the database knows no CDR3 end for the J gene, or
    when the record has no CDR3 end.

    Otherwise, these keys are written: "fwr4_start", "fwr4_end", "fwr4",
    "fwr4_aa", "FR4_SHM" (nucleotide-level mutation rate in percent) and
    "J_aa_mut" (amino-acid-level mutation rate in percent). A rate is set to
    None when the germline FR4 (or its translation) is empty, because no rate
    can be computed in that case.

    record -- mutable mapping with AIRR-style keys; modified in place
    database -- object providing j_cdr3_end(j_call, locus) and a mapping j
        from J gene name to nucleotide sequence
    """
    j_call = record["j_call"]
    if not j_call or record["locus"] not in ALLOWED_LOCI:
        return

    cdr3_ref_end = database.j_cdr3_end(j_call, record["locus"])
    cdr3_query_end = record["cdr3_end"]
    if cdr3_ref_end is None or not cdr3_query_end:
        return

    j_sequence_end = record["j_sequence_end"]
    fwr4_nt = record["sequence"][cdr3_query_end:j_sequence_end]

    # This overwrites some existing columns
    record["fwr4_start"] = cdr3_query_end + 1
    record["fwr4_end"] = j_sequence_end
    record["fwr4"] = fwr4_nt
    record["fwr4_aa"] = nt_to_aa(fwr4_nt)

    # The query FR4 starts right after the CDR3, so the germline it is
    # compared against must also start at the end of the CDR3 within the J
    # reference, not at the start of the J alignment.
    germline = database.j[j_call][cdr3_ref_end:record["j_germline_end"]]

    # Compute FR4 mutation rate on nucleotide level
    if germline:
        dist = edit_distance(germline, fwr4_nt)
        record["FR4_SHM"] = 100.0 * dist / len(germline)
    else:
        record["FR4_SHM"] = None

    # Compute FR4 amino acid mutation rate
    germline_aa = nt_to_aa(germline)
    if germline_aa:
        dist = edit_distance(germline_aa, record["fwr4_aa"])
        record["J_aa_mut"] = 100.0 * dist / len(germline_aa)
    else:
        record["J_aa_mut"] = None
Exemple #2
0
def assert_banded(s, t, maxdiff):
    """
    Assert that the banded edit distance agrees with the unbanded one.

    If the true distance fits within *maxdiff*, both computations must give
    the same value. Otherwise the banded result only needs to exceed
    *maxdiff*.
    """
    banded = edit_distance(s, t, maxdiff=maxdiff)
    exact = edit_distance(s, t)
    if exact <= maxdiff:
        assert banded == exact
    else:
        assert banded > maxdiff
Exemple #3
0
def distances(sequences, band=0.2):
    """
    Return a square matrix of all pairwise edit distances.

    The value at [i, j] is the edit distance between sequences[i] and
    sequences[j]. Distances are computed with a band of
    max(int(len(s) * band)) over all sequences and are capped at that
    band plus one. The diagonal and entries for identical sequences are 0.
    """
    # TODO double-check this
    maxdiff = max((int(len(seq) * band) for seq in sequences), default=0)
    cap = maxdiff + 1

    # Each distinct pair is computed once and stored under both key orders
    distinct = list(set(sequences))
    pair_distance = {}
    for pos, first in enumerate(distinct):
        for second in distinct[pos + 1:]:
            value = min(cap, edit_distance(first, second, maxdiff=maxdiff))
            pair_distance[(second, first)] = value
            pair_distance[(first, second)] = value

    # The matrix starts out as all zeros, so only entries for pairs of
    # differing sequences need to be written
    size = len(sequences)
    matrix = np.zeros((size, size), dtype=float)
    for row, first in enumerate(sequences):
        for col, second in enumerate(sequences):
            if col <= row or first == second:
                continue
            value = pair_distance[(first, second)]
            matrix[row, col] = value
            matrix[col, row] = value
    return matrix
Exemple #4
0
    def fr4_aa_mutation_rate(self):
        """
        Return the amino-acid-level mutation rate of the FR4 in percent.

        None is returned when there is no J hit, the chain is not one of
        self.CHAINS, the CDR3 end is unknown in the reference or cannot be
        mapped to the query, translation of either sequence raises
        ValueError, or the translated reference is empty.
        """
        if 'J' not in self.hits:
            return None
        j_name = self.hits['J'].subject_id
        if self.chain not in self.CHAINS:
            return None

        ref_cdr3_end = self._database.j_cdr3_end(j_name, self.CHAINS[self.chain])
        if ref_cdr3_end is None:
            return None
        query_cdr3_end = self.hits['J'].query_position(
            reference_position=ref_cdr3_end)
        if query_cdr3_end is None:
            return None

        # FR4 of the query: from the end of the CDR3 to the end of the J hit
        query_nt = self.full_sequence[query_cdr3_end:self.hits['J'].query_end]
        try:
            query_aa = nt_to_aa(query_nt)
        except ValueError:
            return None

        # Corresponding part of the J reference sequence
        ref_nt = self._database.j[j_name][ref_cdr3_end:self.hits['J'].subject_end]
        try:
            ref_aa = nt_to_aa(ref_nt)
        except ValueError:
            return None

        if ref_aa:
            return 100. * edit_distance(ref_aa, query_aa) / len(ref_aa)
        return None
Exemple #5
0
def augment_group(table, v_shm_threshold=5, suffix='_mindiffrate'):
    """
    Add "difference to root" columns for CDR3_nt, CDR3_aa, VDJ_nt and VDJ_aa.

    For each of these columns, a new column named <column><suffix> is
    inserted in front of the 'barcode' column. The root is the row with the
    smallest V_SHM. If the table is empty or the root's V_SHM is above
    v_shm_threshold, the new columns stay filled with None. Otherwise each
    new column holds the edit distance to the root's sequence as a
    percentage of the root sequence length, rounded to one decimal. The
    edit distance is computed with maxdiff set to 20% of the root length.

    The table is modified in place and also returned.
    """
    source_columns = ('CDR3_nt', 'CDR3_aa', 'VDJ_nt', 'VDJ_aa')

    # Inserting in reverse at a fixed position yields the forward order
    insert_at = table.columns.get_loc('barcode')
    for name in reversed(source_columns):
        table.insert(insert_at, name + suffix, None)

    if table.empty:
        return table

    # Find row whose V is least mutated
    root = table.loc[table['V_SHM'].idxmin()]
    if root['V_SHM'] > v_shm_threshold:
        return table

    for name in source_columns:
        root_seq = root[name]
        root_length = len(root_seq)
        maxdiff = int(0.2 * root_length)

        def diff_rate(seq, root_seq=root_seq, root_length=root_length, maxdiff=maxdiff):
            dist = edit_distance(root_seq, seq, maxdiff=maxdiff)
            return round(dist / root_length * 100., 1)

        table[name + suffix] = table[name].apply(diff_rate)

    return table
Exemple #6
0
    def should_discard(self, reference: Candidate, candidate: Candidate,
                       _same_gene: bool):
        """
        Decide whether a candidate is a cross-mapping artifact of a reference.

        :param reference: The reference candidate. The decision this function makes is not about that one.
        :param candidate: The candidate on which to decide.
        :param _same_gene: Not used by this check.
        :return: A falsy value (None or False) if the candidate should be kept. Otherwise, a
        non-empty string describing why the candidate should be discarded.
        """
        # Only the part before the CDR3 is compared
        ref_head = reference.sequence[:reference.cdr3_start]
        cand_head = candidate.sequence[:candidate.cdr3_start]
        if len(ref_head) == len(cand_head):
            head_distance = edit_distance(ref_head, cand_head, 1)
        else:
            # Lengths differ: try both a prefix and a suffix of the candidate
            # TODO prefix and suffix?
            width = len(ref_head)
            prefix_distance = edit_distance(ref_head, cand_head[:width], 1)
            suffix_distance = edit_distance(ref_head, cand_head[-width:], 1)
            head_distance = min(prefix_distance, suffix_distance)

        # Cross-mapping is unlikely if the edit distance is larger than 1,
        # and it can only occur if both sequences are in the database
        if head_distance > 1 or not reference.is_database or not candidate.is_database:
            return None

        combined = reference.cluster_size + candidate.cluster_size
        if combined == 0:
            return False
        ratio = candidate.cluster_size / combined
        if not candidate.cluster_size_is_accurate or not ratio < self._ratio:
            return False
        # candidate is probably a cross-mapping artifact of the higher-expressed ref
        return f'xmap_ratio={ratio:.4f},other={reference.name}'
Exemple #7
0
    def merged(self, s, t):
        """
        Apply the allele-ratio and cross-mapping checks to two candidates.

        If one of the two is judged to be an artifact of the other, the
        other one is returned. In all other cases, None is returned.
        """
        # TODO copy-and-pasted from germlinefilter
        #
        # Allele ratio check: applies only to candidates whose names say
        # they belong to the same gene. The ratio is taken between the
        # unique_CDR3 values, in both directions.
        if self._allele_ratio and is_same_gene(s.name, t.name):
            for minor, major in ((s, t), (t, s)):
                if major.unique_CDR3 == 0:
                    continue
                if minor.unique_CDR3 / major.unique_CDR3 < self._allele_ratio:
                    return major

        if not self._cross_mapping_ratio:
            return None

        # When checking for cross mapping, ignore overhanging bases in the 5' end.
        # Example:
        # ---ACTACGACTA...
        # XXX|||||X||||
        # ATTACTACTACTA...
        if len(t.sequence) < len(s.sequence):
            shorter, longer = t, s
        else:
            shorter, longer = s, t
        overhang = len(longer.sequence) - len(shorter.sequence)
        if edit_distance(shorter.sequence, longer.sequence[overhang:], 1) > 1:
            return None

        occurrences = shorter.exact_occ + longer.exact_occ
        if occurrences == 0:
            return None
        for minor, major in ((shorter, longer), (longer, shorter)):
            ratio = minor.exact_occ / occurrences
            if ratio < self._cross_mapping_ratio:
                # minor is probably a cross-mapping artifact of the higher-expressed major
                logger.info(
                    '%r is a cross-mapping artifact of %r (ratio %.4f)',
                    minor.name, major.name, ratio)
                return major

        return None
Exemple #8
0
    def closest(self, sequence):
        """
        Find the whitelist entry with the smallest edit distance to *sequence*.

        Return tuple (distance, name). An exact match gives distance 0.
        """
        exact_name = self._sequences.get(sequence) if sequence in self._sequences else None
        if sequence in self._sequences:
            return 0, exact_name

        # The band shrinks as better matches are found
        bound = len(sequence)
        scored = []
        for whitelisted, name in self._sequences.items():
            dist = edit_distance(whitelisted, sequence, maxdiff=bound)
            scored.append((dist, name))
            if dist == 1:
                # Distance 0 was already ruled out by the membership test
                # above, so nothing can beat this
                return dist, name
            if dist < bound:
                bound = dist
        return min(scored)
Exemple #9
0
def set_shm_columns(record, database):
    """
    Compute SHM (actually mutation rate on nucleotide level) for
    all regions on V and store them in *record* in place.

    For each of FR1, CDR1, FR2, CDR2 and FR3, the key "<region>_SHM" is set
    to the edit distance between the germline region and the query region as
    a percentage of the germline region length. The value is None when the
    record lacks the region's start or end coordinate, or when the database
    has no (or an empty) germline sequence for the region, since no rate can
    be computed then.

    record -- mutable mapping with AIRR-style keys ("fwr1_start", ...,
        "sequence", "v_call"); modified in place
    database -- object with a mapping v_regions_nt from V gene name to a
        mapping from region name to nucleotide sequence
    """
    for airr_col, region in (
        ("fwr1", "FR1"),
        ("cdr1", "CDR1"),
        ("fwr2", "FR2"),
        ("cdr2", "CDR2"),
        ("fwr3", "FR3"),
    ):
        shm_column = region + "_SHM"
        start = record[airr_col + "_start"]
        end = record[airr_col + "_end"]
        if start is None or end is None:
            record[shm_column] = None
            continue

        # .get() yields None for a region missing from the database; an
        # empty germline would lead to a division by zero below
        germline = database.v_regions_nt[record["v_call"]].get(region)
        if not germline:
            record[shm_column] = None
            continue

        # Coordinates are 1-based and inclusive
        sequence = record["sequence"][start - 1 : end]
        dist = edit_distance(germline, sequence)
        record[shm_column] = 100.0 * dist / len(germline)
Exemple #10
0
def main(args):
    """
    Discover candidate gene sequences of type args.gene ('V', 'D' or 'J')
    from an assignment table, print them as a table and optionally write
    them to a FASTA file.

    The steps are: read the optional database and the table, filter table
    rows, collect candidate sequences, add whitelisted sequences, remove
    substrings and candidates containing "N", optionally merge overlapping
    candidates, name the candidates, count occurrences, filter by allele
    and cross-mapping ratio, filter by minimum count, and write the output.

    args -- parsed command-line arguments. Attributes read here: database,
        gene, table, j_coverage, perfect_matches, merge, min_count,
        d_core_length, d_core, allele_ratio, cross_mapping_ratio, fasta.
        j_coverage, merge and min_count are overwritten with gene-specific
        defaults when they are None.
    """
    if args.database:
        with dnaio.open(args.database) as fr:
            database = list(fr)
        logger.info('Read %d sequences from %r', len(database), args.database)
    else:
        database = None
    # Table column that holds the sequence of the gene type being discovered
    column = {'V': 'V_nt', 'J': 'J_nt', 'D': 'D_region'}[args.gene]
    # The "other" gene type is V when discovering D or J, and J when
    # discovering V. Its call and error columns are used for filtering.
    other = 'V' if args.gene in ('D', 'J') else 'J'
    other_gene = other.lower() + '_call'
    other_errors = other + '_errors'
    table = read_table(args.table,
                       usecols=[
                           'count', 'v_call', 'd_call', 'j_call', 'V_errors',
                           'J_errors', 'J_covered', column, 'cdr3'
                       ])
    logger.info('Table with %s rows read', len(table))

    # J discovery uses a default J coverage threshold of 90
    if args.j_coverage is None and args.gene == 'J':
        args.j_coverage = 90
    if args.j_coverage:
        table = table[table['J_covered'] >= args.j_coverage]
        logger.info('Keeping %s rows that have J_covered >= %s', len(table),
                    args.j_coverage)
    if args.perfect_matches:
        table = table[table[other_errors] == 0]
        logger.info('Keeping %s rows that have no %s mismatches', len(table),
                    other)

    # Merging defaults to on for D only
    if args.merge is None:
        args.merge = args.gene == 'D'
    if args.min_count is None:
        args.min_count = {
            'J': 1,
            'D': 10,
            'V': 100
        }[args.gene]  # TODO J is fine, but are D and V?

    # NOTE(review): the 'J' branch and the else branch below make the same call
    if args.gene == 'D':
        candidates = sequence_candidates(table,
                                         column,
                                         minimum_length=args.d_core_length,
                                         core=args.d_core)
    elif args.gene == 'J':
        candidates = sequence_candidates(
            table, column, minimum_length=MINIMUM_CANDIDATE_LENGTH)
    else:
        candidates = sequence_candidates(
            table, column, minimum_length=MINIMUM_CANDIDATE_LENGTH)

    candidates = list(candidates)
    logger.info('Collected %s unique %s sequences', len(candidates), args.gene)

    # Add whitelisted sequences that are not yet among the candidates
    if database:
        whitelist = make_whitelist(table, database, args.gene,
                                   args.allele_ratio)
        missing_whitelisted = set(whitelist) - set(c.sequence
                                                   for c in candidates)
        for sequence in missing_whitelisted:
            candidates.append(Candidate(None, sequence))
        logger.info('Added %d whitelisted sequence%s',
                    len(missing_whitelisted),
                    's' if len(missing_whitelisted) != 1 else '')

    candidates = list(discard_substring_occurrences(candidates))
    logger.info(
        'Removing candidate sequences that occur within others results in %s candidates',
        len(candidates))
    candidates = [
        candidate for candidate in candidates if 'N' not in candidate.sequence
    ]
    logger.info('Removing candidates containing "N" results in %s candidates',
                len(candidates))

    if args.merge:
        logger.info('Merging overlapping sequences ...')
        # Merge candidate sequences that overlap. If one candidate is longer than
        # another, this is typically a sign that IgBLAST has not extended the
        # alignment long enough.
        merger = OverlappingSequenceMerger()
        for candidate in candidates:
            merger.add(candidate)
        logger.info('After merging overlapping %s sequences, %s remain',
                    args.gene, len(merger))
        candidates = list(merger)
        logger.info('%d candidates', len(candidates))
    # The table is not used below this point
    del table

    # Assign names etc.
    if database:
        for candidate in candidates:
            # Find the database record with the smallest edit distance.
            # On ties, min() keeps the first such record.
            distances = [(edit_distance(db.sequence, candidate.sequence), db)
                         for db in database]
            candidate.db_distance, closest = min(distances, key=lambda x: x[0])
            candidate.db_name = closest.name

            if candidate.db_distance == 0:
                candidate.name = closest.name
            else:
                # Exact db sequence not found, is there one that contains
                # this candidate as a substring?
                for db_record in database:
                    index = db_record.sequence.find(candidate.sequence)
                    if index == -1:
                        continue
                    if args.gene == 'D':
                        # Record which parts of the database sequence are
                        # missing on either side of the candidate.
                        # NOTE(review): start repeats the find() call above
                        # and has the same value as index.
                        start = db_record.sequence.find(candidate.sequence)
                        prefix = db_record.sequence[:start]
                        suffix = db_record.sequence[start +
                                                    len(candidate.sequence):]
                        candidate.missing = '{}...{}'.format(prefix, suffix)
                    else:
                        # Replace this record with the full-length version
                        candidate.sequence = db_record.sequence
                        candidate.db_distance = 0
                    candidate.name = db_record.name
                    break
                else:
                    # No database sequence contains the candidate: derive a
                    # new name from the closest database record
                    candidate.name = unique_name(closest.name,
                                                 candidate.sequence)
    else:
        # Without a database, names are derived from the gene type
        for candidate in candidates:
            candidate.name = unique_name(args.gene, candidate.sequence)

    logger.info('Counting occurrences ...')
    # Columns in which candidate occurrences are searched
    if args.gene == 'D':
        search_columns = ['np1', 'D_region', 'np2']
    elif args.gene == 'J':
        search_columns = ['np2', 'J_nt']
    else:
        search_columns = ['sequence']
    candidates = count_occurrences(candidates, args.table, search_columns,
                                   other_gene, other_errors, args.merge,
                                   args.perfect_matches)

    # Filter by allele ratio
    if args.allele_ratio or args.cross_mapping_ratio:
        arm = AlleleRatioMerger(args.allele_ratio, args.cross_mapping_ratio)
        arm.extend(candidates)
        candidates = list(arm)
        logger.info(
            'After filtering by allele ratio and/or cross-mapping ratio, %d candidates remain',
            len(candidates))

    # Keep candidates that occur often enough or have db_distance 0
    candidates = sorted(candidates, key=lambda c: c.name)
    candidates = [
        c for c in candidates
        if c.exact_occ >= args.min_count or c.db_distance == 0
    ]
    print_table(candidates, other_gene, missing=args.gene == 'D')

    if args.fasta:
        with open(args.fasta, 'w') as f:
            for candidate in sorted(candidates, key=lambda r: r.name):
                print('>{}\n{}'.format(candidate.name, candidate.sequence),
                      file=f)

    logger.info('Wrote %d genes', len(candidates))
Exemple #11
0
 def _percent_identity(self):
     """
     Return the identity between reference and query nucleotide sequence
     in percent, or None if either of the two is missing or empty.

     The value is 100 minus the edit distance as a percentage of the
     reference length.
     """
     # FIXME This is not quite how IgBLAST computes percent identity
     reference = self.nt_reference
     if not reference:
         return None
     query = self.nt_sequence
     if not query:
         return None
     mismatch_percent = 100. * edit_distance(reference, query) / len(reference)
     return 100. - mismatch_percent
Exemple #12
0
def test_edit_distance():
    """
    Check edit_distance on a few fixed cases, then check its basic
    properties and agreement with py_edit_distance on all string pairs.
    """
    fixed_cases = (
        ('', '', 0),
        ('', 'A', 1),
        ('A', 'B', 1),
        ('A', 'A', 0),
        ('A', 'AB', 1),
        ('BA', 'AB', 2),
    )
    for left, right, expected in fixed_cases:
        assert edit_distance(left, right) == expected

    for left, right in STRING_PAIRS + RANDOM_STRING_PAIRS:
        # Distance to the empty string is the string length
        assert edit_distance(left, '') == len(left)
        assert edit_distance('', left) == len(left)
        # Symmetry
        assert edit_distance(left, right) == edit_distance(right, left)
        # Agreement with the reference implementation
        assert edit_distance(left, right) == py_edit_distance(left, right)
Exemple #13
0
    def __call__(self, args):
        """
        Discover new V genes.

        args is a tuple (gene, assignments):
        gene -- name of the gene
        assignments -- a pandas DataFrame with the assignments to the gene

        Return a list of Candidate objects, one for each sibling sequence
        that passes the filters: siblings with more than self.max_n_bases
        'N' bases are skipped, and so are siblings that are not identical to
        the database sequence of the gene and have fewer than two exact
        occurrences.
        """
        gene, assignments = args
        self.set_random_seed(gene)
        siblings = SiblingMerger()
        for sibling in self._collect_siblings(gene, assignments):
            siblings.add(sibling)

        candidates = []
        for sibling_info in siblings:
            sibling = sibling_info.sequence
            n_bases = sibling.count('N')
            if n_bases > self.max_n_bases:
                logger.debug('Sibling %s has too many N bases',
                             sibling_info.name)
                continue

            # Sequence without the CDR3-covering part
            sibling_no_cdr3 = sibling[:self._guess_cdr3_start(assignments)]
            group_exact_v = assignments[assignments.V_no_CDR3 ==
                                        sibling_no_cdr3]

            group_full_exact_v = assignments[
                assignments['VDJ_nt'].str.startswith(sibling, na=False)]
            groups = (
                ('window', sibling_info.group),
                ('exact', group_exact_v),
                ('full_exact', group_full_exact_v),
            )
            del sibling_no_cdr3

            # self.cdr3_counts are CDR3 counts of all CDR3s in the entire table.
            # We restrict this here to the counts for CDR3s belonging to
            # clusters other than the current one.
            other_cdr3_counts = self.cdr3_counts - Counter(
                s for s in sibling_info.group.cdr3 if s)
            info = dict()
            for key, group in groups:
                cdr3_counts = Counter(s for s in group.cdr3 if s)
                unique_cdr3 = len(cdr3_counts)
                shared_cdr3_ratio = safe_divide(
                    len(other_cdr3_counts & cdr3_counts), unique_cdr3)
                unique_j = len(set(s for s in group.j_call if s))
                clonotypes = self.count_clonotypes(group)
                unique_d = self.count_unique_d(group)
                unique_barcodes = self.count_unique_barcodes(group)
                count = len(group.index)
                read_names = list(group.sequence_id)
                info[key] = Groupinfo(count=count,
                                      unique_D=unique_d,
                                      unique_J=unique_j,
                                      unique_CDR3=unique_cdr3,
                                      shared_CDR3_ratio=shared_cdr3_ratio,
                                      clonotypes=clonotypes,
                                      read_names=read_names,
                                      unique_barcodes=unique_barcodes)
            if gene in self.database:
                database_diff = edit_distance(sibling, self.database[gene])
                database_changes = describe_nt_change(self.database[gene],
                                                      sibling)
            else:
                database_diff = None
                database_changes = None

            # database_diff is None when the gene has no database entry. Such
            # a sibling is not a database sequence: it gets a derived name
            # and is filtered like any other non-database sequence.
            is_database_sequence = database_diff == 0

            # Build the Candidate
            sequence_id = gene if is_database_sequence else unique_name(
                gene, sibling)
            chain = self._guess_chain(sibling_info.group)
            cdr3_start = self._guess_cdr3_start(sibling_info.group)
            ratio = safe_divide(info['exact'].count, info['exact'].unique_CDR3)

            # Apply some very light filtering on non-database sequences.
            # The flag is used instead of "database_diff > 0" because
            # comparing None with an int raises TypeError on Python 3.
            if not is_database_sequence and info['exact'].count < 2:
                continue

            candidate = Candidate(
                name=sequence_id,
                source=gene,
                chain=chain,
                cluster=sibling_info.name,
                cluster_size=info['window'].count,
                Js=info['window'].unique_J,
                CDR3s=info['window'].unique_CDR3,
                exact=info['exact'].count,
                full_exact=info['full_exact'].count,
                barcodes_exact=info['exact'].unique_barcodes,
                Ds_exact=info['exact'].unique_D,
                Js_exact=info['exact'].unique_J,
                CDR3s_exact=info['exact'].unique_CDR3,
                clonotypes=info['exact'].clonotypes,
                CDR3_exact_ratio=ratio,
                CDR3_shared_ratio=info['exact'].shared_CDR3_ratio,
                N_bases=n_bases,
                database_diff=database_diff,
                database_changes=database_changes,
                has_stop=has_stop(sibling),
                CDR3_start=cdr3_start,
                consensus=sibling,
                read_names=info['exact'].read_names,
            )
            candidates.append(candidate)
        return candidates
Exemple #14
0
 def linked(s, t):
     """Return whether s and t are at most `distance` edits apart."""
     # `distance` comes from the enclosing scope and also serves as the band
     banded = edit_distance(s, t, distance)
     return banded <= distance