Exemple #1
0
def write_region_alignment(headers, seqs, fn, start, end, master_ind):
    """Write one column window of an alignment to a gzipped FASTA file.

    The window bounds are obtained by passing ``start`` and ``end`` (with the
    master sequence ``seqs[master_ind]``) to ``index_ignoring_gaps``; the
    lower bound is clamped so it is never negative. The same inclusive
    window is then cut out of every sequence in ``seqs`` and written,
    together with the unchanged ``headers``, to ``fn``.

    NOTE(review): presumably ``index_ignoring_gaps`` maps an ungapped
    coordinate on the master sequence to an alignment column -- confirm
    against its definition.
    """
    master = seqs[master_ind]

    first_col = index_ignoring_gaps(master, start, 0)
    if first_col < 0:
        # A negative index would wrap around when slicing; start at 0 instead.
        first_col = 0
    last_col = index_ignoring_gaps(master, end, 0)

    # ``last_col`` is inclusive, hence the +1 on the slice stop.
    window = slice(first_col, last_col + 1)
    region_seqs = []
    for aligned in seqs:
        region_seqs.append(aligned[window])

    write_fasta.write_fasta(headers, region_seqs, fn, gz=True)
def mask(fn, masked_fn, intervals_fn):
    """Mask interval positions of the first sequence in a FASTA file.

    Reads ``fn``, replaces every position covered by an interval from
    ``intervals_fn`` (both endpoints inclusive) in the first sequence with
    ``gp.unsequenced_symbol``, and writes the result to ``masked_fn``.

    NOTE(review): all headers read from ``fn`` are passed on, but only the
    first sequence is written -- presumably the input holds a single record;
    confirm against callers.
    """
    headers, records = read_fasta.read_fasta(fn)
    # Strings are immutable, so edit a list of characters and re-join later.
    bases = list(records[0])
    for first, last in read_intervals(intervals_fn):
        for position in range(first, last + 1):
            bases[position] = gp.unsequenced_symbol
    write_fasta.write_fasta(headers, [''.join(bases)], masked_fn)
                r.append(','.join(pidents[k]))
        r = tuple(r)
        all_rankings[best_key].append(r)

    # write reference genes and paralogs and all introgressed
    # genes to file and then align
    # Unaligned FASTA goes to <analysis_out_dir_absolute><tag>/paralogs/,
    # named after the gene with the configured FASTA suffix.
    fn = gp.analysis_out_dir_absolute + tag + '/paralogs/' + \
        gene + gp.fasta_suffix
    # headers and seqs are parallel lists: the gene in both references
    # first, then the paralog in both references.
    headers = ['S288c ' + gene, 'CBS432 ' + gene,
               'S288c ' + paralog, 'CBS432 ' + paralog]
    # The four reference sequences are lowercased; the per-strain sequences
    # appended below are written unchanged.
    seqs = [cer_seq.lower(), par_seq.lower(),
            cer_paralog_seq.lower(), par_paralog_seq.lower()]
    # One additional record per strain in strain_intd_seqs, labelled
    # "<strain> <gene>".
    for strain in strain_intd_seqs:
        headers.append(strain + ' ' + gene)
        seqs.append(strain_intd_seqs[strain])
    write_fasta.write_fasta(headers, seqs, fn)

    # Alignment output path: the FASTA path with the FASTA suffix swapped for
    # the alignment suffix. str.replace substitutes every occurrence of the
    # suffix, not only the trailing one.
    aligned_fn = fn.replace(gp.fasta_suffix, gp.alignment_suffix)
    # mafft is run through the shell and its stdout is redirected into
    # aligned_fn.
    # NOTE(review): the paths are interpolated unquoted into the shell
    # command, so whitespace or shell metacharacters in them would break it.
    cmd_string = gp.mafft_install_path + '/mafft ' + \
        ' --quiet --reorder --preservecase ' + \
        fn + ' > ' + aligned_fn
    # NOTE(review): the exit status returned by os.system is discarded, so a
    # failed alignment goes unnoticed here.
    os.system(cmd_string)

# Summarise the rankings per category in a TSV: the total number of ranking
# entries and the number of distinct values in the first field of each entry.
# The file is opened in a ``with`` block so it is flushed and closed even if a
# lookup below raises; previously the handle was never closed here.
with open('check_paralogs_out.tsv', 'w') as f:
    f.write('category\tnum_total_genes\tnum_unique_genes\n')
    for key in keys:
        # A set comprehension de-duplicates on the first field of each entry.
        num_unique_genes = len({x[0] for x in all_rankings[key]})
        # Build the whole row before writing so that a failure cannot leave
        # a partially written line in the output.
        f.write('\t'.join([key,
                           str(len(all_rankings[key])),
                           str(num_unique_genes)]) + '\n')
# Collect the reference gene plus every strain's copy into one FASTA file.
print('writing all gene sequences to file')
keys = sorted(strain_gene_seqs.keys())
# Strain headers read "<strain> <first field> <last field>" of the strain's
# record; the sequence itself is the record's second field.
headers = [
    ' '.join([key, strain_gene_seqs[key][0], strain_gene_seqs[key][-1]])
    for key in keys
]
seqs = [strain_gene_seqs[key][1] for key in keys]
strains = [ref]
strains.extend(keys)
# The reference entry goes first in both parallel lists.
headers.insert(0, ' '.join([ref, gene, ref_strand]))
seqs.insert(0, ref_gene_seq)

# Output lives in a per-gene directory, created on first use.
gene_out_dir = gp.analysis_out_dir_absolute + tag + '/genes/' + gene
gene_seqs_fn = gene_out_dir + '/' + gene + gp.fasta_suffix
if not os.path.isdir(gene_out_dir):
    os.makedirs(gene_out_dir)
write_fasta.write_fasta(headers, seqs, gene_seqs_fn)

# Handle the '' and '_filtered' variants of the region summary in turn
# (presumably the unfiltered and filtered region calls -- confirm).
suffixes = ['', '_filtered']
for suffix in suffixes:
    print('finding ' + suffix + ' regions that overlap gene')

    # Region summary table for this variant.
    fn_regions = ''.join([
        gp.analysis_out_dir_absolute, tag, '/',
        'introgressed_blocks', suffix, '_par_', tag, '_summary_plus.txt',
    ])
    regions, _ = read_table.read_table_rows(fn_regions, '\t')

    # Per-chromosome summary of genes for each region; its path does not
    # depend on ``suffix``.
    fn_genes_regions = ''.join([
        gp.analysis_out_dir_absolute, tag, '/',
        'genes_for_each_region_chr', chrm, '_', tag, '.txt',
    ])
    region_to_genes = gene_predictions.read_genes_for_each_region_summary(
        fn_genes_regions)
    # strains = [x[0] for x in s]
# Collect the reference range plus every strain's matching range into one
# FASTA file.
print('writing all range sequences to file')
keys = sorted(strain_range_seqs.keys())
# Strain headers read "<strain> <field 1>:<field 2>" of the strain's record;
# the sequence itself is field 0.
headers = [
    key + ' ' + ':'.join([str(strain_range_seqs[key][1]),
                          str(strain_range_seqs[key][2])])
    for key in keys
]
seqs = [strain_range_seqs[key][0] for key in keys]
strains = [ref]
strains.extend(keys)
# The reference entry goes first in both parallel lists.
headers.insert(0, ref + ' ' + ':'.join([str(start), str(end)]))
seqs.insert(0, ref_range_seq)

# Output lives in a directory named after the chromosome and range, created
# on first use.
label = 'chr' + chrm + '_' + '-'.join([str(start), str(end)])
range_out_dir = gp.analysis_out_dir_absolute + tag + '/ranges/' + label
range_seqs_fn = range_out_dir + '/' + label + gp.fasta_suffix
if not os.path.isdir(range_out_dir):
    os.makedirs(range_out_dir)
write_fasta.write_fasta(headers, seqs, range_seqs_fn)

suffixes = ['', '_filtered']
for suffix in suffixes:
    print(' '.join(['finding', suffix, 'regions that overlap range']))
    # read in filtered regions
    fn_regions = gp.analysis_out_dir_absolute + tag + '/' + \
        'introgressed_blocks' + suffix + '_par_' + tag + '_summary_plus.txt'
    regions, _ = read_table.read_table_rows(fn_regions, '\t')

    regions_overlapping = {}
    # TODO does this actually ensure that regions are sorted appropriately
    # in fasta headers below?
    region_keys_ordered = sorted(regions.keys(), key=lambda x: int(x[1:]))
    for region in region_keys_ordered:
        if regions[region]['chromosome'] == chrm and \