def msa_2_mutations(alignment_filepath, patient_zero, out_filepath, config):
    """Extract per-sample mutations from an aligned FASTA file into a CSV.

    The alignment is loaded once, substitutions and deletions are identified
    per sample, both tables are concatenated, every row whose ``ref_aa``
    equals its ``alt_aa`` is flagged as synonymous, and the result is written
    to ``out_filepath``.

    Args:
        alignment_filepath: Path of the aligned FASTA file. It is loaded with
            ``is_aligned=True`` and ``is_gzip=False``.
        patient_zero: Kept for interface compatibility only. The value is
            replaced by ``config['patient_zero']`` before it is first used.
        out_filepath: Destination of the mutations CSV (written without the
            index column).
        config: Mapping that must provide the keys ``'patient_zero'`` and
            ``'data_source'``.

    Returns:
        Always ``0``.

    Raises:
        KeyError: If ``config`` is a dict-like mapping that lacks
            ``'patient_zero'`` or ``'data_source'``.
    """
    # NOTE: the config entry takes precedence over the positional argument.
    # Existing callers rely on this, so the override is kept as-is.
    patient_zero = config['patient_zero']
    data_src = config['data_source']

    print(f"Loading alignment file at {alignment_filepath}")
    msa_data = bs.load_fasta(alignment_filepath, is_aligned=True, is_gzip=False)

    print("Identifying substitution-based mutations...")
    subs, _ = bm.identify_replacements_per_sample(
        msa_data,
        gene2pos=bd.GENE2POS,
        data_src=data_src,
        min_seq_len=20000,
        patient_zero=patient_zero,
    )

    print("Identifying deletion-based mutations...")
    dels, _ = bm.identify_deletions_per_sample(
        msa_data,
        gene2pos=bd.GENE2POS,
        data_src=data_src,
        min_del_len=1,
        max_del_len=500,
        min_seq_len=20000,
        patient_zero=patient_zero,
    )

    print(subs.shape)
    print(dels.shape)

    # Stack both mutation tables; the original row indices are kept.
    muts = pd.concat([subs, dels])
    # A mutation is synonymous when the amino acid is unchanged.
    muts['is_synonymous'] = False
    muts.loc[muts['ref_aa'] == muts['alt_aa'], 'is_synonymous'] = True
    print(muts.shape)

    # NOTE: casting the frame with ``astype(str)`` before writing was tried
    # and abandoned because it was far too slow.
    muts.to_csv(out_filepath, index=False)

    # Drop the frames and force a collection before returning.
    del muts, subs, dels
    gc.collect()
    print(f"Mutations extracted from {alignment_filepath} and saved in {out_filepath}\n")
    return 0
Exemple #2
0
    )
# Seconds elapsed since the previous `t0` checkpoint, which is set before this
# excerpt (presumably around the minimap step -- TODO confirm).
minimap_time = time.time() - t0
print(f"Generating alignment from SAM data...")
t0 = time.time()
# Only build the alignment when no file exists at `alignment_filepath` yet.
# NOTE(review): `isfile` is not a pathlib method; presumably `Path` comes from
# the `path` package -- confirm against the file's imports.
if not Path.isfile(alignment_filepath):
    # The helper's return value replaces the path used by the steps below.
    alignment_filepath = bs.run_datafunk(sam_filepath, ref_fasta,
                                         alignment_filepath)
    print(f"Alignment generated and saved at {alignment_filepath} \n")
# Recorded whether or not the alignment step above actually ran.
datafunk_time = time.time() - t0
print(f"STEP 2: Counting variants...")
print(f"Loading alignment file at {alignment_filepath}")
t0 = time.time()
# Disabled: gzip the alignment before loading (the load below uses
# is_gzip=False, so the file is read uncompressed).
# cmd = f"gzip {alignment_filepath}"
# bs.run_command(cmd)
# alignment_filepath += '.gz'
msa_data = bs.load_fasta(alignment_filepath, is_aligned=True, is_gzip=False)
msa_load_time = time.time() - t0
print(f"Identifying substitution-based mutations...")
t0 = time.time()
# Only the first element of the returned pair is used; the second is discarded.
subs, _ = bm.identify_replacements_per_sample(
    msa_data,
    gisaid_meta,
    bd.GENE2POS,
    data_src=data_src,
    #   test=is_test
)
# Persist the substitutions table as a gzip-compressed CSV without the index.
subs.to_csv(subs_fp, index=False, compression='gzip')
# Disabled: reload the table from disk instead of recomputing it.
# subs = pd.read_csv(subs_fp, compression='gzip')
try:
    subs_agg = bm.aggregate_replacements(subs, date, data_src='gisaid_feed')
    subs_agg.to_csv(subs_agg_fp, index=False, compression='gzip')
# Build the combined FASTA only when `fa_fp` does not exist yet; the helper's
# return value then becomes the path used below.
if not Path.isfile(fa_fp):
    fa_fp = bs.concat_fasta(in_alab_seqs, out_dir / 'alab_seqs')
# Printed on every run, including when the concatenation step was skipped.
print(f"Concatenated all sequences and wrote to {fa_fp}")
# align consensus sequences
# The alignment path is everything before the first '.' in `fa_fp` plus
# '_aligned.fa'.
# NOTE(review): a '.' inside a directory component would cut the path there --
# confirm the inputs never contain one.
msa_fp = Path(fa_fp.split('.')[0] + '_aligned.fa')
# Skip the alignment when a file already exists at `msa_fp`.
if not Path.isfile(msa_fp):
    print(f"Aligning sequences with reference...")
    msa_fp = bs.align_fasta_reference(fa_fp,
                                      msa_fp,
                                      ref_fp=ref_fp,
                                      num_cpus=num_cpus)
print(
    f"Multiple sequence alignment of A-lab samples with reference saved in {msa_fp}"
)
# Disabled: reference-free alignment of the same sequences.
# msa2_fp = Path(fa_fp.split('.')[0] + '_aligned_absolute.fa')
# if not Path.isfile(msa2_fp):
#     print(f"Aligning sequences without reference...")
#     msa2_fp = bs.align_fasta(fa_fp, msa2_fp, num_cpus=num_cpus)
# print(f"Multiple sequence alignment of A-lab samples without reference saved in {msa2_fp}")
# Identify substitutions and deletions
msa_data = bs.load_fasta(msa_fp, is_aligned=True)
subs_wide = bm.identify_replacements(msa_data, in_alab_meta, data_src='alab')
subs_wide_fp = out_dir / f'alab_substitutions_wide_{date}.csv'
# Written sorted by 'num_samples' in descending order, without the index column.
subs_wide.sort_values('num_samples', ascending=False).to_csv(subs_wide_fp,
                                                             index=False)
print(f"Substitution-based mutations of A-lab samples saved in {subs_wide_fp}")
dels_wide = bm.identify_deletions(msa_data, in_alab_meta, data_src='alab')
dels_wide_fp = out_dir / f'alab_deletions_wide_{date}.csv'
# Same ordering and CSV options as the substitutions table above.
dels_wide.sort_values('num_samples', ascending=False).to_csv(dels_wide_fp,
                                                             index=False)
print(f"Deletion-based mutations of A-lab samples saved in {dels_wide_fp}")
 # Disabled: phylogenetic tree construction and plotting.
 # tree_dir = out_dir/'trees'
 # if not Path.isdir(tree_dir):
 #     Path.mkdir(tree_dir);
 # tree_fp = msa_fp + '.treefile'
 # if not Path.isfile(Path(tree_fp)):
 #     tree_fp = compute_tree(msa_fp, num_cpus=num_cpus)
 # tree = load_tree(tree_fp, patient_zero)
 # # Plot and save basic tree
 # fig1 = visualize_tree(tree)
 # fig1.savefig(tree_dir/'basic_tree.pdf')
 # PLOT AND SAVE INDEL TREES
 # colors = list(mcolors.TABLEAU_COLORS.keys())
 # path to new github metadata
 meta_fp = out_dir / 'metadata.csv'
 # load multiple sequence alignment
 msa_data = bs.load_fasta(msa_fp, is_aligned=True)
 # identify insertions
 # The metadata is handed over as a file path (`meta_fp`), not as a loaded
 # table; min_ins_len=1 is the minimum insertion length passed to the helper.
 insertions = bm.identify_insertions(msa_data,
                                     meta_fp=meta_fp,
                                     patient_zero=patient_zero,
                                     min_ins_len=1,
                                     data_src='alab')
 # save insertion results to file
 insertions.to_csv(out_dir / 'insertions.csv', index=False)
 # identify substitution mutations
 # Uses the same alignment, metadata path and patient_zero as the insertion
 # call above.
 substitutions = bm.identify_replacements(msa_data,
                                          meta_fp=meta_fp,
                                          data_src='alab',
                                          patient_zero=patient_zero)
 # save substitution results to file
 substitutions.to_csv(out_dir / 'replacements.csv', index=False)