def msa_2_mutations(alignment_filepath, patient_zero, out_filepath, config):
    """Extract substitutions and deletions from an alignment and save them as CSV.

    The aligned, uncompressed FASTA at ``alignment_filepath`` is loaded with
    ``bs.load_fasta``; per-sample substitutions and deletions are identified
    with ``bm.identify_replacements_per_sample`` and
    ``bm.identify_deletions_per_sample``; both tables are stacked, flagged
    with a boolean ``is_synonymous`` column (True where ``ref_aa`` equals
    ``alt_aa``) and written to ``out_filepath`` without the index.

    Args:
        alignment_filepath: Path of the aligned FASTA file to read.
        patient_zero: Accepted for backward compatibility only.
            NOTE: this value is always overridden by ``config['patient_zero']``.
        out_filepath: Destination of the mutations CSV.
        config: Mapping that must provide ``'patient_zero'`` and
            ``'data_source'``.

    Returns:
        Always ``0``.

    Raises:
        KeyError: If ``config`` lacks ``'patient_zero'`` or ``'data_source'``,
            or if the identified mutations lack ``ref_aa`` / ``alt_aa``.
    """
    # The configuration is the source of truth for the reference sample,
    # so the positional argument is deliberately replaced here.
    patient_zero = config['patient_zero']
    data_src = config['data_source']

    print(f"Loading alignment file at {alignment_filepath}")
    msa_data = bs.load_fasta(alignment_filepath, is_aligned=True, is_gzip=False)

    print("Identifying substitution-based mutations...")
    # Only the first element of the returned pair is used.
    subs, _ = bm.identify_replacements_per_sample(
        msa_data,
        gene2pos=bd.GENE2POS,
        data_src=data_src,
        min_seq_len=20000,
        patient_zero=patient_zero,
    )

    print("Identifying deletion-based mutations...")
    dels, _ = bm.identify_deletions_per_sample(
        msa_data,
        gene2pos=bd.GENE2POS,
        data_src=data_src,
        min_del_len=1,
        max_del_len=500,
        min_seq_len=20000,
        patient_zero=patient_zero,
    )
    # NOTE: an extra QC filter dropping records with >500 nt deletions
    # existed here as commented-out code and is currently not applied.

    print(subs.shape)
    print(dels.shape)

    muts = pd.concat([subs, dels])
    # Default every row to non-synonymous, then flag the rows whose
    # reference and alternate amino acids are identical.
    muts['is_synonymous'] = False
    muts.loc[muts['ref_aa'] == muts['alt_aa'], 'is_synonymous'] = True
    print(muts.shape)

    # Casting the whole frame to str before writing was tried and found
    # to be prohibitively slow, so the frame is written as-is.
    muts.to_csv(out_filepath, index=False)

    # Drop the large tables and force a collection before returning.
    del muts, subs, dels
    gc.collect()

    print(f"Mutations extracted from {alignment_filepath} and saved in {out_filepath}\n")
    return 0
) minimap_time = time.time() - t0 print(f"Generating alignment from SAM data...") t0 = time.time() if not Path.isfile(alignment_filepath): alignment_filepath = bs.run_datafunk(sam_filepath, ref_fasta, alignment_filepath) print(f"Alignment generated and saved at {alignment_filepath} \n") datafunk_time = time.time() - t0 print(f"STEP 2: Counting variants...") print(f"Loading alignment file at {alignment_filepath}") t0 = time.time() # cmd = f"gzip {alignment_filepath}" # bs.run_command(cmd) # alignment_filepath += '.gz' msa_data = bs.load_fasta(alignment_filepath, is_aligned=True, is_gzip=False) msa_load_time = time.time() - t0 print(f"Identifying substitution-based mutations...") t0 = time.time() subs, _ = bm.identify_replacements_per_sample( msa_data, gisaid_meta, bd.GENE2POS, data_src=data_src, # test=is_test ) subs.to_csv(subs_fp, index=False, compression='gzip') # subs = pd.read_csv(subs_fp, compression='gzip') try: subs_agg = bm.aggregate_replacements(subs, date, data_src='gisaid_feed') subs_agg.to_csv(subs_agg_fp, index=False, compression='gzip')
import os  # local import: used for the extension-safe path split below

# Concatenate the A-lab consensus sequences unless the combined FASTA
# already exists on disk.
if not Path.isfile(fa_fp):
    fa_fp = bs.concat_fasta(in_alab_seqs, out_dir / 'alab_seqs')
print(f"Concatenated all sequences and wrote to {fa_fp}")

# align consensus sequences
# Strip only the final file extension. Splitting on the first '.' of the
# whole path would truncate it at any dotted directory (e.g. './out/...').
msa_fp = Path(os.path.splitext(fa_fp)[0] + '_aligned.fa')
if not Path.isfile(msa_fp):
    print("Aligning sequences with reference...")
    msa_fp = bs.align_fasta_reference(fa_fp, msa_fp, ref_fp=ref_fp, num_cpus=num_cpus)
print(f"Multiple sequence alignment of A-lab samples with reference saved in {msa_fp}")

# Disabled: reference-free alignment of the same sequences.
# msa2_fp = Path(fa_fp.split('.')[0] + '_aligned_absolute.fa')
# if not Path.isfile(msa2_fp):
#     print(f"Aligning sequences without reference...")
#     msa2_fp = bs.align_fasta(fa_fp, msa2_fp, num_cpus=num_cpus)
# print(f"Multiple sequence alignment of A-lab samples without reference saved in {msa2_fp}")

# Identify substitutions and deletions
msa_data = bs.load_fasta(msa_fp, is_aligned=True)

# Substitutions: written with rows ordered by descending 'num_samples'.
subs_wide = bm.identify_replacements(msa_data, in_alab_meta, data_src='alab')
subs_wide_fp = out_dir / f'alab_substitutions_wide_{date}.csv'
subs_wide.sort_values('num_samples', ascending=False).to_csv(subs_wide_fp, index=False)
print(f"Substitution-based mutations of A-lab samples saved in {subs_wide_fp}")

# Deletions: same ordering and output convention as the substitutions.
dels_wide = bm.identify_deletions(msa_data, in_alab_meta, data_src='alab')
dels_wide_fp = out_dir / f'alab_deletions_wide_{date}.csv'
dels_wide.sort_values('num_samples', ascending=False).to_csv(dels_wide_fp, index=False)
print(f"Deletion-based mutations of A-lab samples saved in {dels_wide_fp}")
# --- Disabled: phylogenetic tree construction and plotting -----------------
# tree_dir = out_dir/'trees'
# if not Path.isdir(tree_dir):
#     Path.mkdir(tree_dir);
# tree_fp = msa_fp + '.treefile'
# if not Path.isfile(Path(tree_fp)):
#     tree_fp = compute_tree(msa_fp, num_cpus=num_cpus)
# tree = load_tree(tree_fp, patient_zero)
# # Plot and save basic tree
# fig1 = visualize_tree(tree)
# fig1.savefig(tree_dir/'basic_tree.pdf')
# PLOT AND SAVE INDEL TREES
# colors = list(mcolors.TABLEAU_COLORS.keys())

# Metadata table (the "new github metadata") passed to both calls below.
meta_fp = out_dir / 'metadata.csv'

# Load the multiple sequence alignment once; it is reused for both
# the insertion and the substitution calls.
msa_data = bs.load_fasta(msa_fp, is_aligned=True)

# Insertions -> insertions.csv
insertions = bm.identify_insertions(
    msa_data,
    meta_fp=meta_fp,
    data_src='alab',
    patient_zero=patient_zero,
    min_ins_len=1,
)
insertions_csv = out_dir / 'insertions.csv'
insertions.to_csv(insertions_csv, index=False)

# Substitutions -> replacements.csv
substitutions = bm.identify_replacements(
    msa_data,
    meta_fp=meta_fp,
    patient_zero=patient_zero,
    data_src='alab',
)
replacements_csv = out_dir / 'replacements.csv'
substitutions.to_csv(replacements_csv, index=False)