def alignCounter(ifile):
    """Count per-column statistics for the FASTA alignment in ``ifile``.

    The file is read with ``core.fastaGetDict``; its result is used here as a
    mapping of sequence title -> sequence string.  The alignment length is
    taken from the first sequence in that mapping.  Columns past that length
    are not scanned (they still count toward ``tot_pos``), and a later
    sequence that is shorter raises IndexError.

    When the module-level ``disp_file`` equals 1, one tab-separated summary
    line is printed for the file.

    Returns a tuple of:
        num_seqs  -- number of sequences read
        tot_pos   -- summed length of all sequences
        seq_len   -- length of the first sequence (0 when there are none)
        inv_sites -- columns whose non-gap characters are all identical
        var_sites -- columns with more than one distinct non-gap character
        gaps      -- total number of "-" characters in the scanned columns
        gap_sites -- number of columns containing at least one "-"
        gap_dist  -- dict {k: columns with exactly k gaps} for k = 1..num_seqs
    """
    inseqs = core.fastaGetDict(ifile)
    seqs = [inseqs[title] for title in inseqs]

    num_seqs = len(seqs)
    tot_pos = sum(len(s) for s in seqs)
    seq_len = len(seqs[0]) if seqs else 0

    inv_sites = 0
    var_sites = 0
    gaps = 0
    gap_sites = 0
    gap_dist = {k: 0 for k in range(1, num_seqs + 1)}

    for col in range(seq_len):
        site = [s[col] for s in seqs]
        num_gaps = site.count("-")
        gaps += num_gaps

        if num_gaps:
            gap_sites += 1
            gap_dist[num_gaps] += 1

        if num_gaps == len(site):
            # An all-gap column is neither invariant nor variable.
            continue

        # The column is invariant exactly when the first non-gap character
        # accounts for every non-gap position.
        first_base = next(base for base in site if base != "-")
        if site.count(first_base) == len(site) - num_gaps:
            inv_sites += 1
        else:
            var_sites += 1

    if disp_file == 1:
        # Single-argument print() emits the same text on Python 2 and 3.
        print(ifile + "\t" + "\t".join(str(field) for field in (
            num_seqs, tot_pos, seq_len, inv_sites, var_sites, gaps,
            gap_sites, gap_dist)))

    return num_seqs, tot_pos, seq_len, inv_sites, var_sites, gaps, gap_sites, gap_dist
disp_file = sys.argv[2] if disp_file not in ["0", "1"]: print "Not printing file counts." disp_file = 0 disp_file = int(disp_file) print "=======================================================================" print "\t\t\t" + core.getDateTime() print "Counting the total number of positions (AAs or NTs) in:\t" + ins if os.path.isfile(ins): if disp_file == 1: print "----------" print "Sequence\tLength" inseqs = core.fastaGetDict(ins) tot_pos = 0 for seq in inseqs: if disp_file == 1: print seq + "\t" + str(len(inseqs[seq])) tot_pos = tot_pos + len(inseqs[seq]) print "----------" print "Total sequences:\t" + str(len(inseqs)) print "Total positions:\t" + str(tot_pos) print "=======================================================================" else: if not ins.endswith("/"): ins = ins + "/" filelist = os.listdir(ins)
continue if fileflag == 1: infilename = each if each.find("/") != -1: gb_outfile = each[each.rfind("/") + 1:each.index(".fa")] + "-gb.fa" else: gb_outfile = each[:each.index(".fa")] + "-gb.fa" else: infilename = indir + each gb_outfile = each[:each.index(".fa")] + "-gb.fa" gb_cmd = "gblocks " + infilename + " -t=" + seqtype if m == 1: inseqs = core.fastaGetDict(infilename) seqlen = len(inseqs[inseqs.keys()[0]]) b1 = int(round(0.5 * len(inseqs))) + 1 gb_cmd = gb_cmd + " -b1=" + str(b1) + " -b2=" + str( b1) + " -b3=" + str(seqlen) + " -b4=2 -b5=a" if v == 0: gb_cmd = gb_cmd + " >> " + gb_logfile if v == 1 or fileflag == 1: core.logCheck(l, logfilename, core.getTime() + " | GBlocks Call:\t" + gb_cmd) else: lfile = open(logfilename, "a") lfile.write(core.getTime() + " | GBlocks Call:\t" + gb_cmd + "\n")
if len(line) != 4: sys.exit(line) # Skip lines that are too short or too long rpid_to_rtid[line[2]] = line[1] core.PWS("# IDs read: " + str(len(rpid_to_rtid))) core.PWS("# ----------------") ## Get the rat IDs add_rat = True rat_ts_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.transcript.all.fa" # Sequences downloaded from Ensembl Biomart core.PWS("# " + core.getDateTime() + " Reading rat exon sequences: " + rat_ts_file) rat_ts = core.fastaGetDict(rat_ts_file) # Read the sequences rat_ts = parseHeaderIds(rat_ts) # Parse the header IDs so they only contain the exon ID. core.PWS("# Total sequences read: " + str(len(rat_ts))) core.PWS("# ----------------") add_mouse = True mouse_ts_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.transcript.all.fa" # Sequences downloaded from Ensembl Biomart core.PWS("# " + core.getDateTime() + " Reading mouse exon sequences: " + mouse_ts_file) mouse_ts = core.fastaGetDict(mouse_ts_file) # Read the sequences mouse_ts = parseHeaderIds(mouse_ts) # Parse the header IDs so they only contain the exon ID.
continue; if fileflag == 1: infilename = each; if each.find("/") != -1: gb_outfile = each[each.rfind("/")+1:each.index(".fa")] + "-gb.fa"; else: gb_outfile = each[:each.index(".fa")] + "-gb.fa"; else: infilename = indir + each; gb_outfile = each[:each.index(".fa")] + "-gb.fa"; gb_cmd = "gblocks " + infilename + " -t=" + seqtype; if m == 1: inseqs = core.fastaGetDict(infilename); seqlen = len(inseqs[inseqs.keys()[0]]); b1 = int(round(0.5 * len(inseqs))) + 1; gb_cmd = gb_cmd + " -b1=" + str(b1) + " -b2=" + str(b1) + " -b3=" + str(seqlen) + " -b4=2 -b5=a"; if v == 0: gb_cmd = gb_cmd + " >> " + gb_logfile; if v == 1 or fileflag == 1: core.logCheck(l, logfilename, core.getTime() + " | GBlocks Call:\t" + gb_cmd); else: lfile = open(logfilename, "a"); lfile.write(core.getTime() + " | GBlocks Call:\t" + gb_cmd + "\n"); lfile.close(); os.system(gb_cmd);
if counter % 10 == 0: print(counter, "/", num_files) #print(counter) counter += 1 pid = f.split("-")[0].replace(".fa", "") #if pid != "ENSMUSP00000021056": # continue; # Get the protein id by splitting the file name by - and removing the extension. aa_file = os.path.join(args.aa_dir, f) cds_file = os.path.join(args.cds_dir, pid + ".fa") outfilename = os.path.join(args.outdir, pid + "-mafft-cds.fa") # Assign the file names for the input AA, NT, and output NT sequences. aa_seqs = core.fastaGetDict(aa_file) cds_seqs = core.fastaGetDict(cds_file) # Read the sequences for the input AA and NT files. ##### # if prequal_dir: # filter_sites = {}; # prequal_file = os.path.join(prequal_dir, "logs", pid + ".fa.detail"); # for line in open(prequal_file): # #print(line); # if line[0] == "#": # continue; # if line[0] == ">": # cur_sample = line.strip(); # filter_sites[cur_sample] = []; # continue;
disp_file = sys.argv[2]; if disp_file not in ["0","1"]: print "Not printing file counts."; disp_file = 0; disp_file = int(disp_file); print "======================================================================="; print "\t\t\t" + core.getDateTime(); print "Counting the total number of positions (AAs or NTs) in:\t" + ins; if os.path.isfile(ins): if disp_file == 1: print "----------"; print "Sequence\tLength"; inseqs = core.fastaGetDict(ins); tot_pos = 0; for seq in inseqs: if disp_file == 1: print seq + "\t" + str(len(inseqs[seq])); tot_pos = tot_pos + len(inseqs[seq]); print "----------"; print "Total sequences:\t" + str(len(inseqs)); print "Total positions:\t" + str(tot_pos); print "======================================================================="; else: if not ins.endswith("/"): ins = ins + "/"; filelist = os.listdir(ins);
########################################################### import sys, os, core, math, argparse, subprocess, multiprocessing as mp ########################################################### asmdir = "../01-Assembly-data/10-Varcall/" outfilename = "logs/count-ns.csv" with open(outfilename, "w") as outfile: headers = ["sample", "contig", "length", "Ns", "hets", "softmasked"] outfile.write(",".join(headers) + "\n") for sample in os.listdir(asmdir): print("# Reading sample:", sample) asmfile = os.path.join(asmdir, sample, sample + "-iupac-consensus.fa") contigs = core.fastaGetDict(asmfile) print("# ", len(contigs), "contigs read.") for contig in contigs: #print(contig); sample_dict = { contig: { 'Ns': 0, 'hets': 0, 'softmasked': 0 } } seq = contigs[contig] ns = seq.count("N") + seq.count("n")
print "# Note: The script will skip any lines that do not have all species." print "# -------------------------------------" print "# " + core.getTime() + " Preparing species dictionary..." specdict = {} for each in speclist: current = each.split(":") specdict[current[0]] = current[1] #print specdict; print "# -------------------------------------" print "# " + core.getTime( ) + " Reading peptide source files and extracting protein IDs..." tmp_seq_dict = {} for spec in specdict: tmp_seq_dict[spec] = core.fastaGetDict(os.path.join( seqdir, specdict[spec])) main_seq_dict = {} for spec in tmp_seq_dict: main_seq_dict[spec] = {} for title in tmp_seq_dict[spec]: new_title = title[1:title.index(" ")] main_seq_dict[spec][new_title] = tmp_seq_dict[spec][title] del tmp_seq_dict print "# -------------------------------------" count = core.getFileLen(infilename) print "# " + core.getTime() + " Combining", count, "orthologs..." i = 0
def convCheck(cur_c, c, number_specs, d, ins, outs):
    """Run replicate convergence/divergence scans on random species sets.

    For each replicate, ``number_specs`` species are drawn without
    replacement from the values of the module-level ``all_specs`` dict.
    Every ordered pair of drawn species is used as the target pair, with the
    remaining drawn species as backgrounds, and every ``.fa`` alignment in
    ``ins`` is scanned column by column.

    A site is reported only when every target and every background has a
    state that survives ``remGapMiss`` and:
      * d == 0 (convergent): all target states are identical and that state
        does not occur among the background states;
      * d == 1 (divergent): the target states are not all identical, the
        background states are all identical, and no target state occurs
        among the background states.
    Any other ``d`` writes the headers and footers but reports no sites.

    Parameters:
        cur_c        -- zero-based index of the first replicate to run
        c            -- replicates run while cur_c < c
        number_specs -- number of species drawn per replicate
        d            -- 0 for convergent sites, 1 for divergent sites
        ins          -- directory containing the alignments
        outs         -- output prefix; replicate n is written to
                        outs + "_" + str(n) + ".txt"

    Reads the module-level names ``all_specs``, ``indir``, ``ropt``,
    ``core`` and ``remGapMiss``.  Returns None.

    Raises ValueError (from random.sample) when ``number_specs`` is negative
    or exceeds the number of available species.
    """
    init_c = cur_c + 1

    while cur_c < c:
        # list() keeps this working where dict.values() is a view.
        rep_specs = random.sample(list(all_specs.values()), number_specs)
        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        numsites = 0
        totgenes = 0

        # The context manager closes the report even if a scan raises.
        with open(outfilename, "w") as outfile:
            outfile.write("# ==============================================================================================\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Randomly choosing " + str(number_specs) + " species and performing " + str(c) + " replicate tests for convergence.\n")
            outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            # The trailing newline keeps the separator below on its own line.
            outfile.write("# Using species:\t" + ",".join(rep_specs) + "\n")
            outfile.write("# ---------------------------------------------\n")
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write("# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n")

            for align in os.listdir(ins):
                if ".fa" not in align:
                    continue

                gid = "_".join(align.split("_")[:2])
                chr_pos = align.find("chr")
                chrome = align[chr_pos:chr_pos + 4]
                inseqs = core.fastaGetDict(os.path.join(ins, align))

                # NOTE(review): ordered pairs mean each unordered target pair
                # is scanned twice, as (t1, t2) and (t2, t1) -- confirm this
                # is intended before switching to combinations.
                for t1 in rep_specs:
                    for t2 in rep_specs:
                        if t1 == t2:
                            continue

                        targets = [t1, t2]
                        backgrounds = [spec for spec in rep_specs if spec not in targets]

                        num_targets_present = sum(
                            1 for title in inseqs if any(t in title for t in targets))
                        num_bg_present = sum(
                            1 for title in inseqs if any(b in title for b in backgrounds))
                        if num_targets_present != len(targets) or num_bg_present != len(backgrounds):
                            continue

                        totgenes += 1
                        # Alignment length is taken from the first sequence.
                        seqlen = len(inseqs[next(iter(inseqs))])
                        t_alleles = {}
                        b_alleles = {}

                        for x in range(seqlen):
                            for title in inseqs:
                                # Presumably drops a leading ">" from the
                                # title -- verify against core.fastaGetDict.
                                cur_spec = title[1:].replace("\n", "")
                                if cur_spec in targets:
                                    t_alleles[cur_spec] = inseqs[title][x]
                                if cur_spec in backgrounds:
                                    b_alleles[cur_spec] = inseqs[title][x]

                            t_final = remGapMiss(list(t_alleles.values()))
                            b_final = remGapMiss(list(b_alleles.values()))
                            if t_final == [] or b_final == []:
                                continue
                            # Both tests need a usable state for every species.
                            if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
                                continue

                            targets_agree = t_final.count(t_final[0]) == len(t_final)
                            if d == 0:
                                is_hit = targets_agree and t_final[0] not in b_final
                            elif d == 1:
                                is_hit = (not targets_agree
                                          and b_final.count(b_final[0]) == len(b_final)
                                          and not any(t in b_final for t in t_final))
                            else:
                                is_hit = False
                            if not is_hit:
                                continue

                            numsites += 1
                            if d == 0:
                                # Single-argument print() emits the same text
                                # on Python 2 and 3.
                                print(core.getTime() + " Convergent site found!")
                                print("Filename:\t\t" + align)
                                print("Chromosome:\t\t" + chrome)
                                print("Gene ID:\t\t" + gid)
                                print("Alignment length\t" + str(seqlen))
                                print("Target alleles:\t\t" + "".join(t_final))
                                print("Background alleles:\t" + "".join(b_final))
                                print("---------------")

                            outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                            outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write("# ==============================================================================================")

        cur_c += 1

    # NOTE(review): ropt is not a parameter of this variant; it is read from
    # module scope and raises NameError if the script does not define it.
    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) + " complete.")
#!/usr/bin/python ############################################################ # Compares a couple of exome assemblies ############################################################ import sys, os, core, coreseq, argparse ############################################################ a1 = sys.argv[1] a2 = sys.argv[2] print("Reading assembly: " + a1) a1seq = core.fastaGetDict(a1) print("Read " + str(len(a1seq)) + " contigs.") print("Reading assembly: " + a2) a2seq = core.fastaGetDict(a2) print("Read " + str(len(a2seq)) + " contigs.") print("------------") print("Counting identical contigs...") ident = [] for t1 in a1seq: s1 = a1seq[t1] for t2 in a2seq: s2 = a2seq[t2] if s1 == s2: ident.append(t1 + "-" + t2)
#print(f); cur_out = {h: "NA" for h in aln_headers} cur_out["align"] = f # Initialize the current output dictionary. cur_infile = os.path.join(args.input, f) if not args.count_only: cur_nt_outfile = os.path.join(nt_outdir, f.replace(".fa", ".filter.fa")) cur_aa_outfile = os.path.join(aa_outdir, f.replace(".fa", ".filter.fa")) # Get the current in and output files seqs_orig = core.fastaGetDict(cur_infile) seqs = {t: seqs_orig[t].upper() for t in seqs_orig} samples = list(seqs.keys()) pre_samples += len(samples) # Read the sequences for sample in seqs: if sample not in sample_stats: sample_stats[sample] = { col: 0 for col in sample_headers if col != "sample" } sample_stats[sample]['num alns'] += 1 # Count the samples in the alignment in the main dict and initialize if it is the first time this sample is seen
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan alignments for convergent or divergent sites in target species.

    When ``ropt`` is non-zero, each replicate draws ``ropt`` background
    species without replacement from the values of the module-level
    ``all_specs`` dict (excluding the targets) and writes to
    outs + "_" + str(replicate) + ".txt".  When ``ropt`` is 0 the supplied
    ``backgrounds`` are used and output goes to outs + ".txt".

    Every ``.fa`` alignment in ``ins`` in which each target and each
    background matches a sequence title is scanned column by column.  A site
    is reported only when every target and every background has a state that
    survives ``remGapMiss`` and:
      * d == 0 (convergent): all target states are identical and that state
        does not occur among the background states;
      * d == 1 (divergent): the target states are not all identical, the
        background states are all identical, and no target state occurs
        among the background states.
    Any other ``d`` writes the headers and footers but reports no sites.

    Parameters:
        cur_c       -- zero-based index of the first replicate to run
        c           -- replicates run while cur_c < c
        ropt        -- number of random background species (0 = use the
                       ``backgrounds`` argument as given)
        targets     -- target species labels, matched as substrings of the
                       sequence titles
        backgrounds -- background species labels (ignored when ropt != 0)
        d           -- 0 for convergent sites, 1 for divergent sites
        ins         -- directory containing the alignments
        outs        -- output file prefix

    Reads the module-level names ``all_specs``, ``indir``, ``core`` and
    ``remGapMiss``.  Returns None.

    Raises ValueError when ``ropt`` is negative or exceeds the number of
    non-target species available; the original rejection loop never
    terminated in the latter case.
    """
    init_c = cur_c + 1

    while cur_c < c:
        if ropt != 0:
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"
            # Distinct non-target species, in all_specs order.
            eligible = []
            for spec in all_specs.values():
                if spec not in targets and spec not in eligible:
                    eligible.append(spec)
            backgrounds = random.sample(eligible, ropt)
        else:
            outfilename = outs + ".txt"

        numsites = 0
        totgenes = 0

        # The context manager closes the report even if a scan raises.
        with open(outfilename, "w") as outfile:
            outfile.write("# ==============================================================================================\n")
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n")
            if ropt != 0:
                # Report ropt, the value that actually sized the draw above,
                # rather than a module-level ``r``.
                outfile.write("# Randomly choosing " + str(ropt) + " background species and performing " + str(c) + " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Background species:\t\t" + ", ".join(backgrounds) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write("# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n")

            for align in os.listdir(ins):
                if ".fa" not in align:
                    continue

                gid = "_".join(align.split("_")[:2])
                chr_pos = align.find("chr")
                chrome = align[chr_pos:chr_pos + 4]
                # os.path.join works whether or not ins ends with a slash.
                inseqs = core.fastaGetDict(os.path.join(ins, align))

                num_targets_present = sum(
                    1 for title in inseqs if any(t in title for t in targets))
                num_bg_present = sum(
                    1 for title in inseqs if any(b in title for b in backgrounds))
                if num_targets_present != len(targets) or num_bg_present != len(backgrounds):
                    continue

                totgenes += 1
                # Alignment length is taken from the first sequence.
                seqlen = len(inseqs[next(iter(inseqs))])
                t_alleles = {}
                b_alleles = {}

                for x in range(seqlen):
                    for title in inseqs:
                        for t in targets:
                            if t in title:
                                t_alleles[t] = inseqs[title][x]
                        for b in backgrounds:
                            if b in title:
                                b_alleles[b] = inseqs[title][x]

                    t_final = remGapMiss(list(t_alleles.values()))
                    b_final = remGapMiss(list(b_alleles.values()))
                    if t_final == [] or b_final == []:
                        continue
                    # Both tests need a usable state for every species.
                    if len(t_final) != len(targets) or len(b_final) != len(backgrounds):
                        continue

                    targets_agree = t_final.count(t_final[0]) == len(t_final)
                    if d == 0:
                        is_hit = targets_agree and t_final[0] not in b_final
                    elif d == 1:
                        is_hit = (not targets_agree
                                  and b_final.count(b_final[0]) == len(b_final)
                                  and not any(t in b_final for t in t_final))
                    else:
                        is_hit = False
                    if not is_hit:
                        continue

                    numsites += 1
                    outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n"
                    outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write("# ==============================================================================================")

        cur_c += 1

    if ropt != 0:
        # Single-argument print() emits the same text on Python 2 and 3.
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) + " complete.")
def convCheck(cur_c, c, number_specs, d, ins, outs):
    """Run replicate convergence/divergence scans on random species sets.

    For each replicate, ``number_specs`` species are drawn without
    replacement from the values of the module-level ``all_specs`` dict.
    Every ordered pair of drawn species is used as the target pair, with the
    remaining drawn species as backgrounds, and every ``.fa`` alignment in
    ``ins`` is scanned column by column.

    A site is reported only when every target and every background has a
    state that survives ``remGapMiss`` and:
      * d == 0 (convergent): all target states are identical and that state
        does not occur among the background states;
      * d == 1 (divergent): the target states are not all identical, the
        background states are all identical, and no target state occurs
        among the background states.
    Any other ``d`` writes the headers and footers but reports no sites.

    Parameters:
        cur_c        -- zero-based index of the first replicate to run
        c            -- replicates run while cur_c < c
        number_specs -- number of species drawn per replicate
        d            -- 0 for convergent sites, 1 for divergent sites
        ins          -- directory containing the alignments
        outs         -- output prefix; replicate n is written to
                        outs + "_" + str(n) + ".txt"

    Reads the module-level names ``all_specs``, ``indir``, ``ropt``,
    ``core`` and ``remGapMiss``.  Returns None.

    Raises ValueError (from random.sample) when ``number_specs`` is negative
    or exceeds the number of available species.
    """
    init_c = cur_c + 1

    while cur_c < c:
        # list() keeps this working where dict.values() is a view.
        rep_specs = random.sample(list(all_specs.values()), number_specs)
        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        numsites = 0
        totgenes = 0

        # The context manager closes the report even if a scan raises.
        with open(outfilename, "w") as outfile:
            outfile.write(
                "# ==============================================================================================\n"
            )
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Randomly choosing " + str(number_specs) +
                          " species and performing " + str(c) +
                          " replicate tests for convergence.\n")
            outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            # The trailing newline keeps the separator below on its own line.
            outfile.write("# Using species:\t" + ",".join(rep_specs) + "\n")
            outfile.write("# ---------------------------------------------\n")
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write(
                "# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
            )

            for align in os.listdir(ins):
                if ".fa" not in align:
                    continue

                gid = "_".join(align.split("_")[:2])
                chr_pos = align.find("chr")
                chrome = align[chr_pos:chr_pos + 4]
                inseqs = core.fastaGetDict(os.path.join(ins, align))

                # NOTE(review): ordered pairs mean each unordered target pair
                # is scanned twice, as (t1, t2) and (t2, t1) -- confirm this
                # is intended before switching to combinations.
                for t1 in rep_specs:
                    for t2 in rep_specs:
                        if t1 == t2:
                            continue

                        targets = [t1, t2]
                        backgrounds = [
                            spec for spec in rep_specs if spec not in targets
                        ]

                        num_targets_present = sum(
                            1 for title in inseqs
                            if any(t in title for t in targets))
                        num_bg_present = sum(
                            1 for title in inseqs
                            if any(b in title for b in backgrounds))
                        if (num_targets_present != len(targets)
                                or num_bg_present != len(backgrounds)):
                            continue

                        totgenes += 1
                        # Alignment length is taken from the first sequence.
                        seqlen = len(inseqs[next(iter(inseqs))])
                        t_alleles = {}
                        b_alleles = {}

                        for x in range(seqlen):
                            for title in inseqs:
                                # Presumably drops a leading ">" from the
                                # title -- verify against core.fastaGetDict.
                                cur_spec = title[1:].replace("\n", "")
                                if cur_spec in targets:
                                    t_alleles[cur_spec] = inseqs[title][x]
                                if cur_spec in backgrounds:
                                    b_alleles[cur_spec] = inseqs[title][x]

                            t_final = remGapMiss(list(t_alleles.values()))
                            b_final = remGapMiss(list(b_alleles.values()))
                            if t_final == [] or b_final == []:
                                continue
                            # Both tests need a usable state for every species.
                            if (len(t_final) != len(targets)
                                    or len(b_final) != len(backgrounds)):
                                continue

                            targets_agree = (
                                t_final.count(t_final[0]) == len(t_final))
                            if d == 0:
                                is_hit = (targets_agree
                                          and t_final[0] not in b_final)
                            elif d == 1:
                                is_hit = (
                                    not targets_agree
                                    and b_final.count(b_final[0]) == len(b_final)
                                    and not any(t in b_final for t in t_final))
                            else:
                                is_hit = False
                            if not is_hit:
                                continue

                            numsites += 1
                            if d == 0:
                                # Single-argument print() emits the same text
                                # on Python 2 and 3.
                                print(core.getTime() + " Convergent site found!")
                                print("Filename:\t\t" + align)
                                print("Chromosome:\t\t" + chrome)
                                print("Gene ID:\t\t" + gid)
                                print("Alignment length\t" + str(seqlen))
                                print("Target alleles:\t\t" + "".join(t_final))
                                print("Background alleles:\t" + "".join(b_final))
                                print("---------------")

                            outline = (str(numsites) + "\t" + ",".join(targets) +
                                       "\t" + chrome + "\t" + gid + "\t" +
                                       str(seqlen) + "\t" + str(x + 1) + "\t" +
                                       "".join(t_final) + "\t" +
                                       "".join(b_final) + "\n")
                            outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(
                "# =============================================================================================="
            )

        cur_c += 1

    # NOTE(review): ropt is not a parameter of this variant; it is read from
    # module scope and raises NameError if the script does not define it.
    if ropt != 0:
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) +
              " complete.")
num_fixed = 0 i = 1 for s in specs: s_mod = s.replace(" ", "-") print(i, " ", s_mod) i += 1 ref = os.path.join(ref_dir, s_mod, s_mod + "-referee-corrected.fa") assert os.path.isfile(ref), "\nAssembly file not found: " + ref # Get reference new_ref = os.path.join(ref_dir, s_mod, s_mod + "-referee-corrected-RMSCAFF.fa") print("reading ref: " + ref) seqs = core.fastaGetDict(ref) print("scaffolds read ", len(seqs)) print("counting scaffolds") title_counts = defaultdict(int) exclude = [] for title in seqs: t = title.split(" ")[0] title_counts[t] += 1 if title_counts[t] > 1: exclude.append(title) if exclude != []: print("duplicate scaffold found. writing new output: " + new_ref) with open(new_ref, "w") as outfile: for title in seqs:
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan alignments for convergent or divergent sites in target species.

    When ``ropt`` is non-zero, each replicate draws ``ropt`` background
    species without replacement from the values of the module-level
    ``all_specs`` dict (excluding the targets) and writes to
    outs + "_" + str(replicate) + ".txt".  When ``ropt`` is 0 the supplied
    ``backgrounds`` are used and output goes to outs + ".txt".

    Every ``.fa`` alignment in ``ins`` in which each target and each
    background matches a sequence title is scanned column by column.  A site
    is reported only when every target and every background has a state that
    survives ``remGapMiss`` and:
      * d == 0 (convergent): all target states are identical and that state
        does not occur among the background states;
      * d == 1 (divergent): the target states are not all identical, the
        background states are all identical, and no target state occurs
        among the background states.
    Any other ``d`` writes the headers and footers but reports no sites.

    Parameters:
        cur_c       -- zero-based index of the first replicate to run
        c           -- replicates run while cur_c < c
        ropt        -- number of random background species (0 = use the
                       ``backgrounds`` argument as given)
        targets     -- target species labels, matched as substrings of the
                       sequence titles
        backgrounds -- background species labels (ignored when ropt != 0)
        d           -- 0 for convergent sites, 1 for divergent sites
        ins         -- directory containing the alignments
        outs        -- output file prefix

    Reads the module-level names ``all_specs``, ``indir``, ``core`` and
    ``remGapMiss``.  Returns None.

    Raises ValueError when ``ropt`` is negative or exceeds the number of
    non-target species available; the original rejection loop never
    terminated in the latter case.
    """
    init_c = cur_c + 1

    while cur_c < c:
        if ropt != 0:
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"
            # Distinct non-target species, in all_specs order.
            eligible = []
            for spec in all_specs.values():
                if spec not in targets and spec not in eligible:
                    eligible.append(spec)
            backgrounds = random.sample(eligible, ropt)
        else:
            outfilename = outs + ".txt"

        numsites = 0
        totgenes = 0

        # The context manager closes the report even if a scan raises.
        with open(outfilename, "w") as outfile:
            outfile.write(
                "# ==============================================================================================\n"
            )
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + indir + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n")
            if ropt != 0:
                # Report ropt, the value that actually sized the draw above,
                # rather than a module-level ``r``.
                outfile.write("# Randomly choosing " + str(ropt) +
                              " background species and performing " + str(c) +
                              " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " + str(cur_c + 1) +
                              "\n")
            outfile.write("# Background species:\t\t" + ", ".join(backgrounds) +
                          "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write(
                "# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
            )

            for align in os.listdir(ins):
                if ".fa" not in align:
                    continue

                gid = "_".join(align.split("_")[:2])
                chr_pos = align.find("chr")
                chrome = align[chr_pos:chr_pos + 4]
                # os.path.join works whether or not ins ends with a slash.
                inseqs = core.fastaGetDict(os.path.join(ins, align))

                num_targets_present = sum(
                    1 for title in inseqs if any(t in title for t in targets))
                num_bg_present = sum(
                    1 for title in inseqs
                    if any(b in title for b in backgrounds))
                if (num_targets_present != len(targets)
                        or num_bg_present != len(backgrounds)):
                    continue

                totgenes += 1
                # Alignment length is taken from the first sequence.
                seqlen = len(inseqs[next(iter(inseqs))])
                t_alleles = {}
                b_alleles = {}

                for x in range(seqlen):
                    for title in inseqs:
                        for t in targets:
                            if t in title:
                                t_alleles[t] = inseqs[title][x]
                        for b in backgrounds:
                            if b in title:
                                b_alleles[b] = inseqs[title][x]

                    t_final = remGapMiss(list(t_alleles.values()))
                    b_final = remGapMiss(list(b_alleles.values()))
                    if t_final == [] or b_final == []:
                        continue
                    # Both tests need a usable state for every species.
                    if (len(t_final) != len(targets)
                            or len(b_final) != len(backgrounds)):
                        continue

                    targets_agree = t_final.count(t_final[0]) == len(t_final)
                    if d == 0:
                        is_hit = targets_agree and t_final[0] not in b_final
                    elif d == 1:
                        is_hit = (not targets_agree
                                  and b_final.count(b_final[0]) == len(b_final)
                                  and not any(t in b_final for t in t_final))
                    else:
                        is_hit = False
                    if not is_hit:
                        continue

                    numsites += 1
                    outline = (str(numsites) + "\t" + chrome + "\t" + gid +
                               "\t" + str(seqlen) + "\t" + str(x + 1) + "\t" +
                               "".join(t_final) + "\t" + "".join(b_final) +
                               "\n")
                    outfile.write(outline)

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(
                "# =============================================================================================="
            )

        cur_c += 1

    if ropt != 0:
        # Single-argument print() emits the same text on Python 2 and 3.
        print(core.getTime() + " Replicates " + str(init_c) + " to " + str(c) +
              " complete.")