def load(virus):
    """Look up the escape dataset and resource paths for ``virus``.

    Supported keys are ``'h1'``, ``'h3'``, ``'bg505'``, ``'sarscov2'`` and
    ``'cov2rbd'``.  The matching loader is imported from the ``escape``
    module only after the key has been recognised, so an unknown key never
    triggers that import.

    Returns:
        A 5-tuple ``(seq, seqs_escape, train_fname, mut_fname, anchor_id)``.
        The first two items are the pair returned by the dataset loader; the
        remaining three are the string constants registered for the key.

    Raises:
        ValueError: if ``virus`` is not one of the supported keys.
    """
    # One row per supported key:
    # (key, training FASTA, mutation FASTA, anchor record id).
    # Note that 'sarscov2' and 'cov2rbd' share the same files and anchor.
    resources = (
        ('h1',
         'target/flu/clusters/all_h1.fasta',
         'target/flu/mutation/mutations_h1.fa',
         ('gb:LC333185|ncbiId:BBB04702.1|UniProtKB:-N/A-|'
          'Organism:Influenza')),
        ('h3',
         'target/flu/clusters/all_h3.fasta',
         'target/flu/mutation/mutations_h3.fa',
         'Reference_Perth2009_HA_coding_sequence'),
        ('bg505',
         'target/hiv/clusters/all_BG505.fasta',
         'target/hiv/mutation/mutations_hiv.fa',
         'A1.KE.-.BG505_W6M_ENV_C2.DQ208458'),
        ('sarscov2',
         'target/cov/clusters/all_sarscov2.fasta',
         'target/cov/mutation/mutations_sarscov2.fa',
         'YP_009724390.1'),
        ('cov2rbd',
         'target/cov/clusters/all_sarscov2.fasta',
         'target/cov/mutation/mutations_sarscov2.fa',
         'YP_009724390.1'),
    )

    for key, train_path, mut_path, anchor in resources:
        if virus == key:
            break
    else:
        raise ValueError('invalid option {}'.format(virus))

    # Import lazily so that only the loader actually needed is pulled in.
    if key == 'h1':
        from escape import load_doud2018 as fetch_escape
    elif key == 'h3':
        from escape import load_lee2019 as fetch_escape
    elif key == 'bg505':
        from escape import load_dingens2019 as fetch_escape
    elif key == 'sarscov2':
        from escape import load_baum2020 as fetch_escape
    else:
        from escape import load_greaney2020 as fetch_escape

    wild_type, escape_seqs = fetch_escape()
    return wild_type, escape_seqs, train_path, mut_path, anchor
Beispiel #2
0
def _load_escape_dataset(cache_fname, cutoff, expr_cutoff):
    """Load the escape dataset selected by substrings of ``cache_fname``.

    Returns whatever the chosen ``escape`` loader returns (unpacked by the
    caller as ``(wt_seq, seqs_escape)``).  Raises ``ValueError`` when the
    file name matches none of the known datasets.
    """
    if 'flu_h1' in cache_fname:
        from escape import load_doud2018 as loader
    elif 'flu_h3' in cache_fname:
        from escape import load_lee2019 as loader
    elif 'hiv' in cache_fname:
        from escape import load_dingens2019 as loader
    elif '_cov_' in cache_fname:
        from escape import load_baum2020
        # No cutoff is forwarded for this dataset.
        return load_baum2020()
    elif 'cov2rbd' in cache_fname:
        from escape import load_greaney2020
        # NOTE(review): expr_cutoff only takes effect when `cutoff` is also
        # given, and in that case `cutoff` itself is not forwarded.  Kept
        # as-is; confirm whether this precedence is intended.
        if cutoff is None:
            return load_greaney2020()
        if expr_cutoff is not None:
            return load_greaney2020(expr_cutoff=expr_cutoff)
        return load_greaney2020(survival_cutoff=cutoff)
    else:
        raise ValueError('invalid option {}'.format(cache_fname))

    if cutoff is None:
        return loader()
    return loader(survival_cutoff=cutoff)


def _read_escape_cache(cache_fname, wt_seq, seqs_escape):
    """Parse the tab-separated cache file into per-mutation arrays.

    The first line is skipped as a header.  Each remaining row is read as
    ``position, wild-type residue, mutant residue, prob, change``; the
    position is used as a 0-based index into ``wt_seq``.  Only single-residue
    mutants that are keys of ``seqs_escape`` are kept.

    Returns ``(prob, change, escape_idx)`` as numpy arrays of equal length,
    where ``escape_idx`` is a boolean mask of mutants with at least one
    ``'significant'`` entry.
    """
    skipped_residues = {'U', 'B', 'J', 'X', 'Z'}
    rbd_only = 'rbd' in cache_fname  # loop-invariant, so test it once

    prob, change, escape_idx = [], [], []
    with open(cache_fname) as f:
        f.readline()  # header
        for line in f:
            fields = line.rstrip().split('\t')
            pos = int(fields[0])
            if rbd_only and (pos < 330 or pos > 530):
                continue
            aa_wt, aa_mut = fields[1], fields[2]
            if aa_mut in skipped_residues:
                continue
            assert wt_seq[pos] == aa_wt, (
                'cache residue {!r} at position {} does not match the '
                'wild-type sequence'.format(aa_wt, pos))
            mut_seq = wt_seq[:pos] + aa_mut + wt_seq[pos + 1:]
            if mut_seq not in seqs_escape:
                continue
            prob.append(float(fields[3]))
            change.append(float(fields[4]))
            escape_idx.append(
                sum(m['significant'] for m in seqs_escape[mut_seq]) > 0)

    # dtype=bool keeps the mask usable for indexing even when no row is kept.
    return np.array(prob), np.array(change), np.array(escape_idx, dtype=bool)


def _save_acquisition_scatter(log_prob, log_change, colors,
                              marked_x, marked_y, out_fname):
    """Scatter all points coloured by ``colors``, overlay red crosses, save."""
    plt.figure()
    plt.scatter(log_prob, log_change, c=colors, cmap='viridis', alpha=0.3)
    plt.scatter(marked_x, marked_y, c='red', alpha=0.5, marker='x')
    plt.xlabel(r'$ \log_{10}(\hat{p}(x_i | \mathbf{x}_{[N] ∖ \{i\} })) $')
    plt.ylabel(r'$ \log_{10}(\Delta \mathbf{\hat{z}}) $')
    plt.savefig(out_fname, dpi=300)
    plt.close()


def _count_within_top_n(ranks, n_consider):
    """For each N in ``n_consider`` count how many ``ranks`` are <= N."""
    # With side='right', the insertion point into the sorted ranks equals
    # the number of elements <= N, replacing one full scan per N.
    return np.searchsorted(np.sort(ranks), n_consider, side='right')


def cached_escape(cache_fname,
                  beta,
                  cutoff=None,
                  expr_cutoff=None,
                  bind_cutoff=None,
                  plot=True,
                  namespace='semantics'):
    """Evaluate cached per-mutation scores against known escape mutants.

    Reads ``cache_fname``, ranks every retained mutation by
    ``rank(change) + beta * rank(prob)``, prints rank statistics and
    normalised AUCs for the escape mutants, and optionally writes figures.

    Args:
        cache_fname: Path of the tab-separated cache file.  Substrings of
            the path (``'flu_h1'``, ``'flu_h3'``, ``'hiv'``, ``'_cov_'``,
            ``'cov2rbd'``) select the escape dataset; ``'rbd'`` additionally
            restricts positions to 330..530.
        beta: Weight of the probability rank in the acquisition score.
        cutoff: If not ``None``, forwarded as ``survival_cutoff`` to the
            dataset loader (not forwarded for the ``'_cov_'`` dataset).
        expr_cutoff: Forwarded to ``load_greaney2020`` only when ``cutoff``
            is also given; ``cutoff`` is then not forwarded.
        bind_cutoff: Accepted for interface compatibility; not used here.
        plot: When true, write PNG figures under ``figures/``.
        namespace: Prefix of the figure file names and tag in the output.

    Returns:
        None.  Results are printed.  Returns early (after the scatter plots)
        when no escape mutant has a positive change.

    Raises:
        ValueError: if ``cache_fname`` matches no known dataset.
    """
    wt_seq, seqs_escape = _load_escape_dataset(cache_fname, cutoff,
                                               expr_cutoff)
    all_prob, all_change, escape_idx = _read_escape_cache(
        cache_fname, wt_seq, seqs_escape)

    acquisition = ss.rankdata(all_change) + (beta * ss.rankdata(all_prob))

    # Logs are only defined for strictly positive change, so the plotted and
    # summarised subsets drop everything else.
    pos_change_idx = all_change > 0
    pos_change_escape_idx = np.logical_and(pos_change_idx, escape_idx)
    escape_prob = all_prob[pos_change_escape_idx]
    escape_change = all_change[pos_change_escape_idx]
    prob = all_prob[pos_change_idx]
    change = all_change[pos_change_idx]

    log_prob, log_change = np.log10(prob), np.log10(change)
    log_escape_prob, log_escape_change = (np.log10(escape_prob),
                                          np.log10(escape_change))

    if plot:
        mkdir_p('figures')

        _save_acquisition_scatter(
            log_prob, log_change, acquisition[pos_change_idx],
            log_escape_prob, log_escape_change,
            'figures/{}_acquisition.png'.format(namespace))

        # Control figure: mark as many randomly chosen points as there are
        # escape mutants.
        rand_idx = np.random.choice(len(prob), len(escape_prob))
        _save_acquisition_scatter(
            log_prob, log_change, acquisition[pos_change_idx],
            log_prob[rand_idx], log_change[rand_idx],
            'figures/{}_acquisition_rand.png'.format(namespace))

    if len(escape_prob) == 0:
        print('No escape mutations found.')
        return

    acq_argsort = ss.rankdata(-acquisition)
    escape_rank_dist = acq_argsort[escape_idx]

    # NOTE(review): ranks above cover every retained mutation, whereas
    # `size` / `max_consider` count only positive-change ones.  The two agree
    # when all changes are > 0 -- confirm that is guaranteed upstream.
    size = len(prob)
    print('Number of escape seqs: {} / {}'.format(len(escape_rank_dist),
                                                  sum(escape_idx)))
    print('Mean rank: {} / {}'.format(np.mean(escape_rank_dist), size))
    print('Median rank: {} / {}'.format(np.median(escape_rank_dist), size))
    print('Min rank: {} / {}'.format(np.min(escape_rank_dist), size))
    print('Max rank: {} / {}'.format(np.max(escape_rank_dist), size))
    print('Rank stdev: {} / {}'.format(np.std(escape_rank_dist), size))

    max_consider = len(prob)
    n_consider = np.arange(1, max_consider + 1)

    n_escape = _count_within_top_n(escape_rank_dist, n_consider)
    norm = max(n_consider) * max(n_escape)
    norm_auc = auc(n_consider, n_escape) / norm

    escape_rank_prob = ss.rankdata(-all_prob)[escape_idx]
    n_escape_prob = _count_within_top_n(escape_rank_prob, n_consider)
    norm_auc_prob = auc(n_consider, n_escape_prob) / norm

    escape_rank_change = ss.rankdata(-all_change)[escape_idx]
    n_escape_change = _count_within_top_n(escape_rank_change, n_consider)
    norm_auc_change = auc(n_consider, n_escape_change) / norm

    if plot:
        plt.figure()
        plt.plot(n_consider, n_escape)
        plt.plot(n_consider, n_escape_change, c='C0', linestyle='-.')
        plt.plot(n_consider, n_escape_prob, c='C0', linestyle=':')
        plt.plot(n_consider,
                 n_consider * (len(escape_prob) / len(prob)),
                 c='gray',
                 linestyle='--')

        plt.legend([
            r'$ \Delta \mathbf{\hat{z}} + ' +
            r'\beta \hat{p}(x_i | \mathbf{x}_{[N] ∖ \{i\} }) $,' +
            (' AUC = {:.3f}'.format(norm_auc)),
            r'$  \Delta \mathbf{\hat{z}} $ only,' +
            (' AUC = {:.3f}'.format(norm_auc_change)),
            r'$ \hat{p}(x_i | \mathbf{x}_{[N] ∖ \{i\} }) $ only,' +
            (' AUC = {:.3f}'.format(norm_auc_prob)),
            'Random guessing, AUC = 0.500'
        ])
        plt.xlabel('Top N')
        plt.ylabel('Number of escape mutations in top N')
        plt.savefig('figures/{}_consider_escape.png'.format(namespace),
                    dpi=300)
        plt.close()

    print('Escape semantics, beta = {} [{}]'.format(beta, namespace))

    norm_auc_p = compute_p(norm_auc, sum(escape_idx), len(escape_idx))

    print('AUC (CSCS): {}, P = {}'.format(norm_auc, norm_auc_p))
    print('AUC (semantic change only): {}'.format(norm_auc_change))
    print('AUC (grammaticality only): {}'.format(norm_auc_prob))

    print('{:.4g} (mean log prob), {:.4g} (mean log prob escape), '
          '{:.4g} (p-value)'.format(
              log_prob.mean(), log_escape_prob.mean(),
              ss.mannwhitneyu(log_prob,
                              log_escape_prob,
                              alternative='two-sided')[1]))
    # Report the log10 means so the numbers match their "mean log change"
    # labels.  The Mann-Whitney test is rank-based and therefore unaffected
    # by the monotone log transform, so it stays on the raw values.
    print('{:.4g} (mean log change), {:.4g} (mean log change escape), '
          '{:.4g} (p-value)'.format(
              log_change.mean(), log_escape_change.mean(),
              ss.mannwhitneyu(change, escape_change,
                              alternative='two-sided')[1]))
Beispiel #3
0
            raise ValueError('Model must be trained or loaded '
                             'from checkpoint.')
        no_embed = {'hmm'}
        if args.model_name in no_embed:
            raise ValueError('Embeddings not available for models: {}'.format(
                ', '.join(no_embed)))
        analyze_embedding(args, model, seqs, vocabulary)

    if args.semantics:
        if args.checkpoint is None and not args.train:
            raise ValueError('Model must be trained or loaded '
                             'from checkpoint.')

        from escape import load_baum2020, load_greaney2020
        tprint('Baum et al. 2020...')
        seq_to_mutate, seqs_escape = load_baum2020()
        analyze_semantics(
            args,
            model,
            vocabulary,
            seq_to_mutate,
            seqs_escape,
            comb_batch=5000,
            prob_cutoff=0,
            beta=1.,
            plot_acquisition=True,
        )
        tprint('Greaney et al. 2020...')
        seq_to_mutate, seqs_escape = load_greaney2020()
        analyze_semantics(
            args,