def write_region_alignment(headers, seqs, fn, start, end, master_ind):
    """Write the start..end window of an alignment to a gzipped FASTA file.

    The window is located on the master sequence ``seqs[master_ind]`` by
    passing ``start`` and ``end`` through ``index_ignoring_gaps`` (presumably
    translating ungapped coordinates into alignment columns -- confirm
    against that helper).  The same column window, inclusive of the end
    column, is then cut out of every sequence.

    Args:
        headers: FASTA headers, passed through unchanged to the writer.
        seqs: sequences to slice; all are cut with the same window.
        fn: output filename handed to ``write_fasta.write_fasta``.
        start: region start, interpreted by ``index_ignoring_gaps``.
        end: region end, interpreted by ``index_ignoring_gaps``.
        master_ind: index into ``seqs`` of the sequence used to locate
            the window.
    """
    # A start index below zero is clamped to the first column.
    first_col = max(0, index_ignoring_gaps(seqs[master_ind], start, 0))
    last_col = index_ignoring_gaps(seqs[master_ind], end, 0)

    # The end column is inclusive, hence the + 1 on the slice stop.
    window = slice(first_col, last_col + 1)
    trimmed = [sequence[window] for sequence in seqs]

    write_fasta.write_fasta(headers, trimmed, fn, gz=True)
def mask(fn, masked_fn, intervals_fn):
    """Mask intervals of the first sequence in a FASTA file and write it out.

    Only the first sequence read from ``fn`` is kept.  Every position in
    each interval returned by ``read_intervals(intervals_fn)`` is
    overwritten with ``gp.unsequenced_symbol``; both interval endpoints are
    inclusive.  The result is written to ``masked_fn`` together with the
    headers read from the input.

    Args:
        fn: input FASTA filename.
        masked_fn: output FASTA filename.
        intervals_fn: file describing the (start, end) intervals to mask.

    Raises:
        IndexError: if the input has no sequences, or an interval reaches
            past the end of the first sequence.
    """
    headers, seqs = read_fasta.read_fasta(fn)

    # Strings are immutable, so edit a list of characters instead.
    residues = list(seqs[0])

    for first, last in read_intervals(intervals_fn):
        position = first
        while position <= last:
            residues[position] = gp.unsequenced_symbol
            position += 1

    masked = ''.join(residues)
    write_fasta.write_fasta(headers, [masked], masked_fn)
r.append(','.join(pidents[k])) r = tuple(r) all_rankings[best_key].append(r) # write reference genes and paralogs and all introgressed # genes to file and then align fn = gp.analysis_out_dir_absolute + tag + '/paralogs/' + \ gene + gp.fasta_suffix headers = ['S288c ' + gene, 'CBS432 ' + gene, 'S288c ' + paralog, 'CBS432 ' + paralog] seqs = [cer_seq.lower(), par_seq.lower(), cer_paralog_seq.lower(), par_paralog_seq.lower()] for strain in strain_intd_seqs: headers.append(strain + ' ' + gene) seqs.append(strain_intd_seqs[strain]) write_fasta.write_fasta(headers, seqs, fn) aligned_fn = fn.replace(gp.fasta_suffix, gp.alignment_suffix) cmd_string = gp.mafft_install_path + '/mafft ' + \ ' --quiet --reorder --preservecase ' + \ fn + ' > ' + aligned_fn os.system(cmd_string) f = open('check_paralogs_out.tsv', 'w') f.write('category\tnum_total_genes\tnum_unique_genes\n') for key in keys: f.write(key + '\t') f.write(str(len(all_rankings[key])) + '\t') num_unique_genes = len(set([x[0] for x in all_rankings[key]])) f.write(str(num_unique_genes) + '\n')
print('writing all gene sequences to file') keys = sorted(strain_gene_seqs.keys()) headers = [ key + ' ' + strain_gene_seqs[key][0] + ' ' + strain_gene_seqs[key][-1] for key in keys ] seqs = [strain_gene_seqs[key][1] for key in keys] strains = [ref] + keys headers = [ref + ' ' + gene + ' ' + ref_strand] + headers seqs = [ref_gene_seq] + seqs gene_seqs_fn = gp.analysis_out_dir_absolute + tag + '/genes/' + gene + '/' + \ gene + gp.fasta_suffix if not os.path.isdir(gp.analysis_out_dir_absolute + tag + '/genes/' + gene): os.makedirs(gp.analysis_out_dir_absolute + tag + '/genes/' + gene) write_fasta.write_fasta(headers, seqs, gene_seqs_fn) suffixes = ['', '_filtered'] for suffix in suffixes: print(' '.join(['finding', suffix, 'regions that overlap gene'])) # read in filtered regions fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' regions, _ = read_table.read_table_rows(fn_regions, '\t') # figure out which strains are introgressed/which regions overlap gene fn_genes_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'genes_for_each_region_chr' + chrm + '_' + tag + '.txt' region_to_genes = \ gene_predictions.read_genes_for_each_region_summary(fn_genes_regions) # strains = [x[0] for x in s]
print('writing all range sequences to file') keys = sorted(strain_range_seqs.keys()) headers = [ key + ' ' + str(strain_range_seqs[key][1]) + ':' + str(strain_range_seqs[key][2]) for key in keys ] seqs = [strain_range_seqs[key][0] for key in keys] strains = [ref] + keys headers = [ref + ' ' + str(start) + ':' + str(end)] + headers seqs = [ref_range_seq] + seqs label = 'chr' + chrm + '_' + str(start) + '-' + str(end) range_seqs_fn = gp.analysis_out_dir_absolute + tag + '/ranges/' + \ label + '/' + label + gp.fasta_suffix if not os.path.isdir(gp.analysis_out_dir_absolute + tag + '/ranges/' + label): os.makedirs(gp.analysis_out_dir_absolute + tag + '/ranges/' + label) write_fasta.write_fasta(headers, seqs, range_seqs_fn) suffixes = ['', '_filtered'] for suffix in suffixes: print(' '.join(['finding', suffix, 'regions that overlap range'])) # read in filtered regions fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \ 'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt' regions, _ = read_table.read_table_rows(fn_regions, '\t') regions_overlapping = {} # TODO does this actually ensure that regions are sorted appropriately # in fasta headers below? region_keys_ordered = sorted(regions.keys(), key=lambda x: int(x[1:])) for region in region_keys_ordered: if regions[region]['chromosome'] == chrm and \