def test_flatten():
    """Check helper.flatten against a table of nested-list inputs.

    Each entry pairs a list of lists with the flat list that
    helper.flatten is expected to return for it.
    """
    cases = [
        ([], []),
        ([[]], []),
        ([[1]], [1]),
        ([[2], [1]], [2, 1]),
        ([[1, 2], [1]], [1, 2, 1]),
    ]
    for nested, expected in cases:
        assert helper.flatten(nested) == expected
d = diffs_per_site(seqs[keys[i]].lower(), seqs[keys[j]].lower()) if d != 'NA': num += d den += 1 if den == 0: return 'NA' return float(num) / den # read in shared regions shared_regions, _ = \ read_table.read_table_rows('shared_introgression_nonsingleton_list.txt', '\t') # read in strain dirs information s = align_helpers.get_strains(align_helpers.flatten(gp.non_ref_dirs.values())) strain_dirs = dict(s) # for each shared region: # - calculate fraction of sites that are polymorphic among introgressed strains # - for each introgressed strain, calculate: # - number of unique variants among introgressed strains (or all strains?) f = open('shared_introgression_nonsingleton_polymorphism.txt', 'w') f.write('region_number\tchromosome\tstart\tend\tpi\t' 'frac_poly\tnum_poly\tnum_total\tnum_strains\tstrain_list\n') for chrm in gp.chrms: chrom_seqs = {} for region_number in shared_regions.keys(): if shared_regions[region_number]['chromosome'] != chrm: continue
import os from align.align_helpers import get_strains, flatten import global_params as gp # get all non-reference strains of cerevisiae and paradoxus s = get_strains(flatten(gp.non_ref_dirs.values())) gp_dir = '../' a = [] if gp.resume_alignment: a = os.listdir(gp_dir + gp.alignments_dir) # need to add this on the start of each command because os.system() # creates a new shell instance every time cmd_string_start = 'export MUGSY_INSTALL=' + gp.mugsy_install_path + '; ' cmd_string_start += 'export PATH=$PATH:$MUGSY_INSTALL:$MUGSY_INSTALL/mapping; ' cmd_string_start += 'export PERL5LIB=$MUGSY_INSTALL/perllibs; ' ref_prefix = '_'.join(gp.alignment_ref_order) + '_' ref_dirs = [gp.ref_dir[ref] for ref in gp.alignment_ref_order] for strain, d in s: print(strain) cmd_string = cmd_string_start for chrm in [gp.chrms[-1]]: align_fn = ref_prefix + strain + '_chr' + chrm + gp.alignment_suffix # if we don't already have an alignment for this strain/chromosome, # then make one if align_fn not in a:
# Alternative region on chromosome II, kept for reference:
# region_start = 787000
# region_end = 794000
# chrm = 'II'

# Region of interest on chromosome IV, widened by 100 positions on each side.
chrm = 'IV'
region_start, region_end = 917571 - 100, 921647 + 100
# Both endpoints are part of the region, hence the + 1.
region_length = region_end - region_start + 1

# ======
# get strains
# ======

# NOTE(review): presumably one entry per non-reference strain -- confirm
# against align_helpers.get_strains.
strain_dirs = align_helpers.get_strains(
    align_helpers.flatten(gp.non_ref_dirs.values())
)
num_strains = len(strain_dirs)

# ======
# loop through all strains, getting appropriate sequence
# ======

# master reference and other reference seqs
# The master reference is the first entry of the alignment reference order.
master_ref = gp.alignment_ref_order[0]
# FASTA path for the master reference on the chosen chromosome.
master_fn = ''.join([
    gp.ref_dir[master_ref],
    gp.ref_fn_prefix[master_ref],
    '_chr',
    chrm,
    gp.fasta_suffix,
])
# Lower-cased slice covering region_start..region_end inclusive
# (region_start + region_length == region_end + 1).
# NOTE(review): [1][0] is presumably the first sequence returned by
# read_fasta -- confirm against read_fasta.read_fasta.
master_seq = read_fasta.read_fasta(master_fn)[1][0][
    region_start:region_start + region_length
].lower()