Beispiel #1
0
def alignCounter(ifile):
	"""Summarise the columns of the FASTA alignment stored in ifile.

	Sequences are loaded with core.fastaGetDict. The alignment width is taken
	from the first sequence returned; a later sequence that is shorter raises
	IndexError, and columns beyond that width in longer sequences are not
	scanned (their characters still count towards tot_pos).

	Column classification ("-" is the gap character):
	  * a column made only of gaps is neither invariant nor variable;
	  * otherwise it is invariant when every non-gap character equals the
	    first non-gap character, and variable if not.

	When the module-level flag disp_file equals 1, one tab-separated summary
	line is printed.

	Returns the tuple (num_seqs, tot_pos, seq_len, inv_sites, var_sites, gaps,
	gap_sites, gap_dist), where gap_dist maps k (1..num_seqs) to the number of
	columns that contain exactly k gaps.
	"""
	inseqs = core.fastaGetDict(ifile)
	seqs = [inseqs[title] for title in inseqs]
	num_seqs = len(seqs)

	# Total characters over all sequences, gaps included.
	tot_pos = sum(len(s) for s in seqs)
	# Width of the alignment as defined by the first sequence (0 if empty).
	seq_len = len(seqs[0]) if seqs else 0

	inv_sites = 0
	var_sites = 0
	gaps = 0
	gap_sites = 0
	gap_dist = {k: 0 for k in range(1, num_seqs + 1)}

	for col in range(seq_len):
		site = [s[col] for s in seqs]
		num_gaps = site.count("-")
		gaps += num_gaps

		if num_gaps != len(site):
			# Only the first non-gap character needs checking: the column is
			# invariant exactly when it accounts for every non-gap entry.
			first_base = next(base for base in site if base != "-")
			if site.count(first_base) == len(site) - num_gaps:
				inv_sites += 1
			else:
				var_sites += 1

		if num_gaps:
			gap_sites += 1
			gap_dist[num_gaps] += 1

	if disp_file == 1:
		stats = (num_seqs, tot_pos, seq_len, inv_sites, var_sites, gaps, gap_sites, gap_dist)
		# Single parenthesised argument: prints the same text on Python 2 and 3.
		print(ifile + "\t" + "\t".join(str(value) for value in stats))
	return num_seqs, tot_pos, seq_len, inv_sites, var_sites, gaps, gap_sites, gap_dist
Beispiel #2
0
    disp_file = sys.argv[2]
# Anything other than the strings "0" or "1" disables the per-sequence listing.
if disp_file not in ["0", "1"]:
    print "Not printing file counts."
    disp_file = 0

# From here on disp_file is an int flag (1 = also print per-sequence lengths).
disp_file = int(disp_file)

print "======================================================================="
print "\t\t\t" + core.getDateTime()
print "Counting the total number of positions (AAs or NTs) in:\t" + ins

# Single-file mode: report the length of every sequence (optional) and totals.
if os.path.isfile(ins):
    if disp_file == 1:
        print "----------"
        print "Sequence\tLength"
    inseqs = core.fastaGetDict(ins)
    # Running sum of the lengths of all sequences in the file.
    tot_pos = 0
    for seq in inseqs:
        if disp_file == 1:
            print seq + "\t" + str(len(inseqs[seq]))
        tot_pos = tot_pos + len(inseqs[seq])
    print "----------"
    print "Total sequences:\t" + str(len(inseqs))
    print "Total positions:\t" + str(tot_pos)
    print "======================================================================="

# Otherwise ins is treated as a directory: normalise the trailing slash and
# list its entries.
else:
    if not ins.endswith("/"):
        ins = ins + "/"
    filelist = os.listdir(ins)
Beispiel #3
0
        continue

    if fileflag == 1:
        infilename = each
        if each.find("/") != -1:
            gb_outfile = each[each.rfind("/") + 1:each.index(".fa")] + "-gb.fa"
        else:
            gb_outfile = each[:each.index(".fa")] + "-gb.fa"
    else:
        infilename = indir + each
        gb_outfile = each[:each.index(".fa")] + "-gb.fa"

    gb_cmd = "gblocks " + infilename + " -t=" + seqtype

    if m == 1:
        inseqs = core.fastaGetDict(infilename)
        seqlen = len(inseqs[inseqs.keys()[0]])
        b1 = int(round(0.5 * len(inseqs))) + 1

        gb_cmd = gb_cmd + " -b1=" + str(b1) + " -b2=" + str(
            b1) + " -b3=" + str(seqlen) + " -b4=2 -b5=a"

    if v == 0:
        gb_cmd = gb_cmd + " >> " + gb_logfile

    if v == 1 or fileflag == 1:
        core.logCheck(l, logfilename,
                      core.getTime() + " | GBlocks Call:\t" + gb_cmd)
    else:
        lfile = open(logfilename, "a")
        lfile.write(core.getTime() + " | GBlocks Call:\t" + gb_cmd + "\n")
    if len(line) != 4:
        sys.exit(line)
    # Skip lines that are too short or too long

    rpid_to_rtid[line[2]] = line[1]
core.PWS("# IDs read: " + str(len(rpid_to_rtid)))
core.PWS("# ----------------")
## Get the rat IDs

# NOTE(review): the log messages and comments below say "exon", but both input
# file names say "transcript" -- confirm which is intended.
add_rat = True
rat_ts_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.transcript.all.fa"
# Sequences downloaded from Ensembl Biomart
core.PWS("# " + core.getDateTime() + " Reading rat exon sequences: " +
         rat_ts_file)
rat_ts = core.fastaGetDict(rat_ts_file)
# Read the sequences
rat_ts = parseHeaderIds(rat_ts)
# Parse the header IDs so they only contain the exon ID.
core.PWS("# Total sequences read: " + str(len(rat_ts)))
core.PWS("# ----------------")

# Same steps as for rat, applied to the mouse file.
add_mouse = True
mouse_ts_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.transcript.all.fa"
# Sequences downloaded from Ensembl Biomart
core.PWS("# " + core.getDateTime() + " Reading mouse exon sequences: " +
         mouse_ts_file)
mouse_ts = core.fastaGetDict(mouse_ts_file)
# Read the sequences
mouse_ts = parseHeaderIds(mouse_ts)
# Parse the header IDs so they only contain the exon ID.
Beispiel #5
0
		continue;

	if fileflag == 1:
		infilename = each;
		if each.find("/") != -1:
			gb_outfile = each[each.rfind("/")+1:each.index(".fa")] + "-gb.fa";
		else:
			gb_outfile = each[:each.index(".fa")] + "-gb.fa";
	else:
		infilename = indir + each;
		gb_outfile = each[:each.index(".fa")] + "-gb.fa";

	gb_cmd = "gblocks " + infilename + " -t=" + seqtype;

	if m == 1:
		inseqs = core.fastaGetDict(infilename);
		seqlen = len(inseqs[inseqs.keys()[0]]);
		b1 = int(round(0.5 * len(inseqs))) + 1;

		gb_cmd = gb_cmd + " -b1=" + str(b1) + " -b2=" + str(b1) + " -b3=" + str(seqlen) + " -b4=2 -b5=a";		

	if v == 0:
		gb_cmd = gb_cmd + " >> " + gb_logfile;

	if v == 1 or fileflag == 1:
		core.logCheck(l, logfilename, core.getTime() + " | GBlocks Call:\t" + gb_cmd);
	else:
		lfile = open(logfilename, "a");
		lfile.write(core.getTime() + " | GBlocks Call:\t" + gb_cmd + "\n");
		lfile.close();
	os.system(gb_cmd);
Beispiel #6
0
    if counter % 10 == 0:
        print(counter, "/", num_files)
    #print(counter)
    counter += 1

    pid = f.split("-")[0].replace(".fa", "")
    #if pid != "ENSMUSP00000021056":
    #    continue;
    # Get the protein id by splitting the file name by - and removing the extension.

    aa_file = os.path.join(args.aa_dir, f)
    cds_file = os.path.join(args.cds_dir, pid + ".fa")
    outfilename = os.path.join(args.outdir, pid + "-mafft-cds.fa")
    # Assign the file names for the input AA, NT, and output NT sequences.

    aa_seqs = core.fastaGetDict(aa_file)
    cds_seqs = core.fastaGetDict(cds_file)
    # Read the sequences for the input AA and NT files.

    #####
    # if prequal_dir:
    #     filter_sites = {};
    #     prequal_file = os.path.join(prequal_dir, "logs", pid + ".fa.detail");
    #     for line in open(prequal_file):
    #         #print(line);
    #         if line[0] == "#":
    #             continue;
    #         if line[0] == ">":
    #             cur_sample = line.strip();
    #             filter_sites[cur_sample] = [];
    #             continue;
Beispiel #7
0
	disp_file = sys.argv[2];
# Anything other than the strings "0" or "1" disables the per-sequence listing.
if disp_file not in ["0","1"]:
	print "Not printing file counts.";
	disp_file = 0;

# From here on disp_file is an int flag (1 = also print per-sequence lengths).
disp_file = int(disp_file);

print "=======================================================================";
print "\t\t\t" + core.getDateTime();
print "Counting the total number of positions (AAs or NTs) in:\t" + ins;

# Single-file mode: report the length of every sequence (optional) and totals.
if os.path.isfile(ins):
	if disp_file == 1:
		print "----------";
		print "Sequence\tLength";
	inseqs = core.fastaGetDict(ins);
	# Running sum of the lengths of all sequences in the file.
	tot_pos = 0;
	for seq in inseqs:
		if disp_file == 1:
			print seq + "\t" + str(len(inseqs[seq]));
		tot_pos = tot_pos + len(inseqs[seq]);
	print "----------";
	print "Total sequences:\t" + str(len(inseqs));
	print "Total positions:\t" + str(tot_pos);
	print "=======================================================================";

# Otherwise ins is treated as a directory: normalise the trailing slash and
# list its entries.
else:
	if not ins.endswith("/"):
		ins = ins + "/";
	filelist = os.listdir(ins);
###########################################################

import sys, os, core, math, argparse, subprocess, multiprocessing as mp

###########################################################

# Directory with one sub-directory per sample, and the CSV report to write.
asmdir = "../01-Assembly-data/10-Varcall/"
outfilename = "logs/count-ns.csv"

with open(outfilename, "w") as outfile:
    # Column names of the CSV report, written as the first line.
    headers = ["sample", "contig", "length", "Ns", "hets", "softmasked"]
    outfile.write(",".join(headers) + "\n")
    # Every entry of asmdir is taken to be a sample directory holding
    # <sample>-iupac-consensus.fa.
    for sample in os.listdir(asmdir):
        print("# Reading sample:", sample)
        asmfile = os.path.join(asmdir, sample, sample + "-iupac-consensus.fa")
        contigs = core.fastaGetDict(asmfile)
        print("#   ", len(contigs), "contigs read.")

        for contig in contigs:
            #print(contig);

            # Per-contig counters, all starting at zero.
            sample_dict = {
                contig: {
                    'Ns': 0,
                    'hets': 0,
                    'softmasked': 0
                }
            }
            seq = contigs[contig]

            # Count N characters in either case.
            ns = seq.count("N") + seq.count("n")
Beispiel #9
0
print "# Note: The script will skip any lines that do not have all species."
print "# -------------------------------------"
print "# " + core.getTime() + " Preparing species dictionary..."

# Each speclist entry is split on ":"; the first field becomes the key and the
# second the value (used below as a file name inside seqdir).
specdict = {}
for each in speclist:
    current = each.split(":")
    specdict[current[0]] = current[1]

#print specdict;
print "# -------------------------------------"
print "# " + core.getTime(
) + " Reading peptide source files and extracting protein IDs..."
# Load every species' sequence file, keyed by the species key from specdict.
tmp_seq_dict = {}
for spec in specdict:
    tmp_seq_dict[spec] = core.fastaGetDict(os.path.join(
        seqdir, specdict[spec]))

# Re-key each species' sequences by a shortened title: the text between the
# first character and the first space. A title without a space raises
# ValueError from str.index.
main_seq_dict = {}
for spec in tmp_seq_dict:
    main_seq_dict[spec] = {}
    for title in tmp_seq_dict[spec]:
        new_title = title[1:title.index(" ")]
        main_seq_dict[spec][new_title] = tmp_seq_dict[spec][title]

# The full-title dictionary is no longer needed.
del tmp_seq_dict

print "# -------------------------------------"
count = core.getFileLen(infilename)
print "# " + core.getTime() + " Combining", count, "orthologs..."

i = 0
def convCheck(cur_c, c, number_specs, d, ins, outs):
#	cur_c = 0;
	init_c = cur_c+1;
	while cur_c < c:
		#if c > 1:

		spec_list = all_specs.values();
		rep_specs = [];
		while len(rep_specs) < number_specs:
			r = random.choice(spec_list);
			rep_specs.append(r);
			spec_list.remove(r);

		outfilename = outs + "_" + str(cur_c+1) + ".txt";
		outfile = open(outfilename, "w");
		outfile.write("# ==============================================================================================\n");
		outfile.write("# \t\t\tConvergence testing\n");
		outfile.write("# \t\t\t" + core.getDateTime() + "\n");
		outfile.write("# Using alignments in:\t\t" + indir + "\n");
		outfile.write("# Randomly choosing " + str(number_specs) + " species and performing " + str(c) + " replicate tests for convergence.\n");
		outfile.write("# This is replicate number " + str(cur_c+1) + "\n");
		outfile.write("# Writing output to:\t\t\t" + outfilename + "\n");
		if d == 0:
			outfile.write("# Checking for convergent sites.\n");
		elif d == 1:
			outfile.write("# Checking for divergent sites.\n");
		outfile.write("# Using species:\t" + ",".join(rep_specs));
		outfile.write("# ---------------------------------------------\n");

		#sys.exit();
		#cur_c = cur_c + 1;
		#continue;
		aligns = os.listdir(ins);
		numbars = 0;
		donepercent = [];
		count = len(aligns);
		i = 0;
		numsites = 0;
		totgenes = 0;
		outfile.write("# " + core.getTime() + " Starting Scan...\n");
		outfile.write("# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n");
		for align in aligns:
			#numbars, donepercent = core.loadingBar(i, count, donepercent, numbars);
			i = i + 1;
			if align.find(".fa") == -1:
				continue;

			infilename = os.path.join(ins, align);
			gid = "_".join(align.split("_")[:2]);
			chrome = align[align.find("chr"):align.find("chr")+4]

			inseqs = core.fastaGetDict(infilename);

			for t1 in rep_specs:
				for t2 in rep_specs:
					if t1 == t2:
						continue;

					targets = [t1, t2];
					backgrounds = [spec for spec in rep_specs if spec not in targets];

					num_targets_present = 0;
					num_bg_present = 0;
					for title in inseqs:
						if any(t in title for t in targets):
							num_targets_present = num_targets_present + 1;
						if any(b in title for b in backgrounds):
							num_bg_present = num_bg_present + 1;

					if num_targets_present == len(targets) and num_bg_present == len(backgrounds):
						# print "The following gene has all target and background species and will be checked:\t\t" + gid;
						totgenes = totgenes + 1;

						seqlen = len(inseqs[inseqs.keys()[0]]);
						# print "Alignment length\t\t", seqlen;

						t_alleles = {};
						b_alleles = {};

						for x in xrange(len(inseqs[inseqs.keys()[0]])):
							for title in inseqs:
								cur_spec = title[1:].replace("\n","");
								if cur_spec in targets:
									t_alleles[cur_spec] = inseqs[title][x];
								if cur_spec in backgrounds:
									b_alleles[cur_spec] = inseqs[title][x];

							t_states = t_alleles.values();
							#t_gap = t_states.count("-");
							#t_missing = t_states.count("X");
							#t_stop = t_states.count("*");

							b_states = b_alleles.values();
							#b_gap = b_states.count("-");
							#b_missing = b_states.count("X");
							#b_stop = b_states.count("*");

							t_final = remGapMiss(t_states);
							b_final = remGapMiss(b_states);

							if t_final == [] or b_final == []:
								continue;

							if d == 0:
								if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) == len(t_final) and t_final[0] not in b_final:
									numsites = numsites + 1;
									print core.getTime() + " Convergent site found!";
									print "Filename:\t\t" + align;
									print "Chromosome:\t\t" + chrome;
									print "Gene ID:\t\t" + gid;
									print "Alignment length\t", seqlen;
									print "Target alleles:\t\t" + "".join(t_final);
									print "Background alleles:\t" + "".join(b_final);
									print "---------------";
									outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
									outfile.write(outline);

							elif d == 1:
								if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) != len(t_final) and b_final.count(b_final[0]) == len(b_final):
									if not any(t in b_final for t in t_final):
										numsites = numsites + 1;
										# print "\nDivergent site found!";
										# print "Filename:\t\t" + align;
										# print "Chromosome:\t\t" + chrome;
										# print "Gene ID:\t\t" + gid;
										# print "Alignment length\t", seqlen;
										# print t_final;
										# print b_final;
										outline = str(numsites) + "\t" + ",".join(targets) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
										outfile.write(outline);

		#pstring = "100.0% complete.";
		#sys.stderr.write('\b' * len(pstring) + pstring);
		outfile.write("\n# " + core.getTime() + " Done!\n");
		outfile.write("# Total sites found: " + str(numsites) + "\n");
		outfile.write("# Total genes checked: " + str(totgenes) + "\n");
		outfile.write("# ==============================================================================================");
		cur_c = cur_c + 1;
	if ropt != 0:
		print core.getTime() + " Replicates", init_c, "to", c, "complete.";
Beispiel #11
0
#!/usr/bin/python
############################################################
# Compares a couple of exome assemblies
############################################################

import sys, os, core, coreseq, argparse

############################################################

# Paths of the two assemblies to compare, taken from the command line.
a1 = sys.argv[1]
a2 = sys.argv[2]

print("Reading assembly: " + a1)
a1seq = core.fastaGetDict(a1)
print("Read " + str(len(a1seq)) + " contigs.")

print("Reading assembly: " + a2)
a2seq = core.fastaGetDict(a2)
print("Read " + str(len(a2seq)) + " contigs.")
print("------------")

print("Counting identical contigs...")
# All-against-all comparison: every contig of the first assembly is compared
# with every contig of the second, and pairs with identical sequence are
# recorded as "<title1>-<title2>".
ident = []
for t1 in a1seq:
    s1 = a1seq[t1]
    for t2 in a2seq:
        s2 = a2seq[t2]

        if s1 == s2:
            ident.append(t1 + "-" + t2)
Beispiel #12
0
        #print(f);

        cur_out = {h: "NA"
                   for h in aln_headers}
        cur_out["align"] = f
        # Initialize the current output dictionary.

        cur_infile = os.path.join(args.input, f)
        if not args.count_only:
            cur_nt_outfile = os.path.join(nt_outdir,
                                          f.replace(".fa", ".filter.fa"))
            cur_aa_outfile = os.path.join(aa_outdir,
                                          f.replace(".fa", ".filter.fa"))
        # Get the current in and output files

        seqs_orig = core.fastaGetDict(cur_infile)
        seqs = {t: seqs_orig[t].upper()
                for t in seqs_orig}
        samples = list(seqs.keys())
        pre_samples += len(samples)
        # Read the sequences

        for sample in seqs:
            if sample not in sample_stats:
                sample_stats[sample] = {
                    col: 0
                    for col in sample_headers if col != "sample"
                }
            sample_stats[sample]['num alns'] += 1
        # Count the samples in the alignment in the main dict and initialize if it is the first time this sample is seen
Beispiel #13
0
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
#	cur_c = 0;
	init_c = cur_c+1;
	while cur_c < c:
		#if c > 1:
		if ropt != 0:
			outfilename = outs + "_" + str(cur_c+1) + ".txt";
		else:
			outfilename = outs + ".txt";

		if ropt != 0:
			#backgrounds = ["hg19", "rheMac3", "calJac3", "mm10", "rn5", "vicPac2", "bosTau7", "canFam3", "loxAfr3", "papHam1", "monDom5"];
			backgrounds = [];
			cur_r = len(backgrounds);
			while cur_r < ropt:
				chosenspec = random.choice(all_specs.values());

				if chosenspec not in targets and chosenspec not in backgrounds:
					backgrounds.append(chosenspec);
					cur_r = cur_r + 1;

		outfile = open(outfilename, "w");

		outfile.write("# ==============================================================================================\n");
		outfile.write("# \t\t\tConvergence testing\n");
		outfile.write("# \t\t\t" + core.getDateTime() + "\n");
		outfile.write("# Using alignments in:\t\t" + indir + "\n");
		outfile.write("# Target species:\t\t\t" + ", ".join(targets) + "\n");
		if ropt != 0:
			outfile.write("# Randomly choosing " + str(r) + " background species and performing " + str(c) + " replicate tests for convergence.\n");
			outfile.write("# This is replicate number " + str(cur_c+1) + "\n");
		outfile.write("# Background species:\t\t" + ", ".join(backgrounds) + "\n");
		outfile.write("# Writing output to:\t\t\t" + outfilename + "\n");
		if d == 0:
			outfile.write("# Checking for convergent sites.\n");
		elif d == 1:
			outfile.write("# Checking for divergent sites.\n");
		outfile.write("# ---------------------------------------------\n");
		#sys.exit();
		#cur_c = cur_c + 1;
		#continue;
		aligns = os.listdir(ins);

		numbars = 0;
		donepercent = [];
		count = len(aligns);
		i = 0;
		numsites = 0;
		totgenes = 0;
		outfile.write("# " + core.getTime() + " Starting Scan...\n");
		outfile.write("# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n");
		for align in aligns:
			#numbars, donepercent = core.loadingBar(i, count, donepercent, numbars);
			i = i + 1;

			if align.find(".fa") == -1:
				continue;

			#if i > 25:
			#	break;

			infilename = ins + align;
			#print align;
			gid = "_".join(align.split("_")[:2]);
			chrome = align[align.find("chr"):align.find("chr")+4]

			inseqs = core.fastaGetDict(infilename);

			num_targets_present = 0;
			num_bg_present = 0;
			for title in inseqs:
				if any(t in title for t in targets):
					num_targets_present = num_targets_present + 1;
				if any(b in title for b in backgrounds):
					num_bg_present = num_bg_present + 1;

			if num_targets_present == len(targets) and num_bg_present == len(backgrounds):
				#print "The following gene has all target and background species and will be checked:\t\t" + gid;
				totgenes = totgenes + 1;

				seqlen = len(inseqs[inseqs.keys()[0]]);
				#print "Alignment length\t\t", seqlen;

				t_alleles = {};
				b_alleles = {};

				for x in xrange(len(inseqs[inseqs.keys()[0]])):
					for title in inseqs:
						for t in targets:
							if t in title:
								t_alleles[t] = inseqs[title][x];
						for b in backgrounds:
							if b in title:
								b_alleles[b] = inseqs[title][x];

					t_states = t_alleles.values();

					t_gap = t_states.count("-");
					t_missing = t_states.count("X");
					t_stop = t_states.count("*");

					b_states = b_alleles.values();

					b_gap = b_states.count("-");
					b_missing = b_states.count("X");
					b_stop = b_states.count("*");

					t_final = remGapMiss(t_states);
					b_final = remGapMiss(b_states);

					#print t_alleles;
					#print t_states;
					#print t_gap;
					#print t_missing;
					#print t_stop;
					#print t_final;

					#print b_alleles;
					#print b_states;
					#print b_gap;
					#print b_missing;
					#print b_stop;
					#print b_final;

					if t_final == [] or b_final == []:
						continue;

					if d == 0:
						if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) == len(t_final) and t_final[0] not in b_final:
							numsites = numsites + 1;
							#print core.getTime() + " Convergent site found!";
							#print "Filename:\t\t" + align;
							#print "Chromosome:\t\t" + chrome;
							#print "Gene ID:\t\t" + gid;
							#print "Alignment length\t", seqlen;
							#print "Target alleles:\t\t" + "".join(t_final);
							#print "Background alleles:\t" + "".join(b_final);
							#print "---------------";
							outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
							outfile.write(outline);

							#sys.exit();

					elif d == 1:
						if len(t_final) == len(targets) and len(b_final) == len(backgrounds) and t_final.count(t_final[0]) != len(t_final) and b_final.count(b_final[0]) == len(b_final):
							if not any(t in b_final for t in t_final):
								numsites = numsites + 1;
								#print "\nDivergent site found!";
								#print "Filename:\t\t" + align;
								#print "Chromosome:\t\t" + chrome;
								#print "Gene ID:\t\t" + gid;
								#print "Alignment length\t", seqlen;
								#print t_final;
								#print b_final;
								outline = str(numsites) + "\t" + chrome + "\t" + gid + "\t" + str(seqlen) + "\t" + str(x+1) + "\t" + "".join(t_final) + "\t" + "".join(b_final) + "\n";
								outfile.write(outline);

		#pstring = "100.0% complete.";
		#sys.stderr.write('\b' * len(pstring) + pstring);
		outfile.write("\n# " + core.getTime() + " Done!\n");
		outfile.write("# Total sites found: " + str(numsites) + "\n");
		outfile.write("# Total genes checked: " + str(totgenes) + "\n");
		outfile.write("# ==============================================================================================");
		cur_c = cur_c + 1;
	if ropt != 0:
		print core.getTime() + " Replicates", init_c, "to", c, "complete.";
Beispiel #14
0
def convCheck(cur_c, c, number_specs, d, ins, outs):
    #	cur_c = 0;
    init_c = cur_c + 1
    while cur_c < c:
        #if c > 1:

        spec_list = all_specs.values()
        rep_specs = []
        while len(rep_specs) < number_specs:
            r = random.choice(spec_list)
            rep_specs.append(r)
            spec_list.remove(r)

        outfilename = outs + "_" + str(cur_c + 1) + ".txt"
        outfile = open(outfilename, "w")
        outfile.write(
            "# ==============================================================================================\n"
        )
        outfile.write("# \t\t\tConvergence testing\n")
        outfile.write("# \t\t\t" + core.getDateTime() + "\n")
        outfile.write("# Using alignments in:\t\t" + indir + "\n")
        outfile.write("# Randomly choosing " + str(number_specs) +
                      " species and performing " + str(c) +
                      " replicate tests for convergence.\n")
        outfile.write("# This is replicate number " + str(cur_c + 1) + "\n")
        outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
        if d == 0:
            outfile.write("# Checking for convergent sites.\n")
        elif d == 1:
            outfile.write("# Checking for divergent sites.\n")
        outfile.write("# Using species:\t" + ",".join(rep_specs))
        outfile.write("# ---------------------------------------------\n")

        #sys.exit();
        #cur_c = cur_c + 1;
        #continue;
        aligns = os.listdir(ins)
        numbars = 0
        donepercent = []
        count = len(aligns)
        i = 0
        numsites = 0
        totgenes = 0
        outfile.write("# " + core.getTime() + " Starting Scan...\n")
        outfile.write(
            "# Site#\tTargetSpecs\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
        )
        for align in aligns:
            #numbars, donepercent = core.loadingBar(i, count, donepercent, numbars);
            i = i + 1
            if align.find(".fa") == -1:
                continue

            infilename = os.path.join(ins, align)
            gid = "_".join(align.split("_")[:2])
            chrome = align[align.find("chr"):align.find("chr") + 4]

            inseqs = core.fastaGetDict(infilename)

            for t1 in rep_specs:
                for t2 in rep_specs:
                    if t1 == t2:
                        continue

                    targets = [t1, t2]
                    backgrounds = [
                        spec for spec in rep_specs if spec not in targets
                    ]

                    num_targets_present = 0
                    num_bg_present = 0
                    for title in inseqs:
                        if any(t in title for t in targets):
                            num_targets_present = num_targets_present + 1
                        if any(b in title for b in backgrounds):
                            num_bg_present = num_bg_present + 1

                    if num_targets_present == len(
                            targets) and num_bg_present == len(backgrounds):
                        # print "The following gene has all target and background species and will be checked:\t\t" + gid;
                        totgenes = totgenes + 1

                        seqlen = len(inseqs[inseqs.keys()[0]])
                        # print "Alignment length\t\t", seqlen;

                        t_alleles = {}
                        b_alleles = {}

                        for x in xrange(len(inseqs[inseqs.keys()[0]])):
                            for title in inseqs:
                                cur_spec = title[1:].replace("\n", "")
                                if cur_spec in targets:
                                    t_alleles[cur_spec] = inseqs[title][x]
                                if cur_spec in backgrounds:
                                    b_alleles[cur_spec] = inseqs[title][x]

                            t_states = t_alleles.values()
                            #t_gap = t_states.count("-");
                            #t_missing = t_states.count("X");
                            #t_stop = t_states.count("*");

                            b_states = b_alleles.values()
                            #b_gap = b_states.count("-");
                            #b_missing = b_states.count("X");
                            #b_stop = b_states.count("*");

                            t_final = remGapMiss(t_states)
                            b_final = remGapMiss(b_states)

                            if t_final == [] or b_final == []:
                                continue

                            if d == 0:
                                if len(t_final) == len(targets) and len(
                                        b_final
                                ) == len(backgrounds) and t_final.count(
                                        t_final[0]) == len(
                                            t_final
                                        ) and t_final[0] not in b_final:
                                    numsites = numsites + 1
                                    print core.getTime(
                                    ) + " Convergent site found!"
                                    print "Filename:\t\t" + align
                                    print "Chromosome:\t\t" + chrome
                                    print "Gene ID:\t\t" + gid
                                    print "Alignment length\t", seqlen
                                    print "Target alleles:\t\t" + "".join(
                                        t_final)
                                    print "Background alleles:\t" + "".join(
                                        b_final)
                                    print "---------------"
                                    outline = str(numsites) + "\t" + ",".join(
                                        targets
                                    ) + "\t" + chrome + "\t" + gid + "\t" + str(
                                        seqlen) + "\t" + str(
                                            x + 1) + "\t" + "".join(
                                                t_final) + "\t" + "".join(
                                                    b_final) + "\n"
                                    outfile.write(outline)

                            elif d == 1:
                                if len(t_final) == len(targets) and len(
                                        b_final
                                ) == len(backgrounds) and t_final.count(
                                        t_final[0]) != len(
                                            t_final) and b_final.count(
                                                b_final[0]) == len(b_final):
                                    if not any(t in b_final for t in t_final):
                                        numsites = numsites + 1
                                        # print "\nDivergent site found!";
                                        # print "Filename:\t\t" + align;
                                        # print "Chromosome:\t\t" + chrome;
                                        # print "Gene ID:\t\t" + gid;
                                        # print "Alignment length\t", seqlen;
                                        # print t_final;
                                        # print b_final;
                                        outline = str(
                                            numsites
                                        ) + "\t" + ",".join(
                                            targets
                                        ) + "\t" + chrome + "\t" + gid + "\t" + str(
                                            seqlen) + "\t" + str(
                                                x + 1) + "\t" + "".join(
                                                    t_final) + "\t" + "".join(
                                                        b_final) + "\n"
                                        outfile.write(outline)

        #pstring = "100.0% complete.";
        #sys.stderr.write('\b' * len(pstring) + pstring);
        outfile.write("\n# " + core.getTime() + " Done!\n")
        outfile.write("# Total sites found: " + str(numsites) + "\n")
        outfile.write("# Total genes checked: " + str(totgenes) + "\n")
        outfile.write(
            "# =============================================================================================="
        )
        cur_c = cur_c + 1
    if ropt != 0:
        print core.getTime() + " Replicates", init_c, "to", c, "complete."
Beispiel #15
0
num_fixed = 0
# 1-based counter used only for the progress line printed per species.
i = 1
for s in specs:
    # Directory and file names use the species name with spaces as dashes.
    s_mod = s.replace(" ", "-")
    print(i, " ", s_mod)
    i += 1
    ref = os.path.join(ref_dir, s_mod, s_mod + "-referee-corrected.fa")
    assert os.path.isfile(ref), "\nAssembly file not found: " + ref
    # Get reference

    # Path of the rewritten assembly, used only when duplicates are found.
    new_ref = os.path.join(ref_dir, s_mod,
                           s_mod + "-referee-corrected-RMSCAFF.fa")

    print("reading ref: " + ref)
    seqs = core.fastaGetDict(ref)
    print("scaffolds read ", len(seqs))

    print("counting scaffolds")
    # Count titles by their first space-separated token; every full title
    # whose token has been seen before goes on the exclude list.
    title_counts = defaultdict(int)
    exclude = []
    for title in seqs:
        t = title.split(" ")[0]
        title_counts[t] += 1
        if title_counts[t] > 1:
            exclude.append(title)

    if exclude != []:
        print("duplicate scaffold found. writing new output: " + new_ref)
        with open(new_ref, "w") as outfile:
            for title in seqs:
Beispiel #16
0
def convCheck(cur_c, c, ropt, targets, backgrounds, d, ins, outs):
    """Scan a directory of alignments for convergent or divergent sites.

    Replicates ``cur_c + 1`` through ``c`` are run in turn. Each replicate
    writes a commented header, one tab-delimited line per site found, and a
    short summary to its own output file.

    A site is reported only when every target and every background species
    has a usable state at that column (as filtered by ``remGapMiss``) and:

    * ``d == 0`` (convergent): all targets share one state and that state
      does not occur in any background species.
    * ``d == 1`` (divergent): the targets do not all share one state, the
      backgrounds all share one state, and no target state occurs in the
      backgrounds.

    Any other value of ``d`` scans the alignments but reports no sites.

    Args:
        cur_c: Number of replicates already completed; the loop runs while
            ``cur_c < c``.
        c: Total number of replicates.
        ropt: Number of background species to draw at random for each
            replicate. ``0`` uses ``backgrounds`` as given and writes to
            ``outs + ".txt"``; otherwise replicate ``n`` writes to
            ``outs + "_" + str(n) + ".txt"``.
        targets: Species identifiers, matched as substrings of the sequence
            titles in each alignment.
        backgrounds: Background species identifiers (ignored and redrawn
            when ``ropt != 0``).
        d: ``0`` to look for convergent sites, ``1`` for divergent sites.
        ins: Directory holding the alignment files; only entries whose name
            contains ``".fa"`` are read.
        outs: Output path prefix.

    Raises:
        ValueError: If ``ropt`` exceeds the number of species available as
            random backgrounds.

    Relies on the module-level ``core`` helpers, ``remGapMiss`` and, when
    ``ropt != 0``, ``all_specs``.
    """
    init_c = cur_c + 1
    while cur_c < c:
        if ropt != 0:
            outfilename = outs + "_" + str(cur_c + 1) + ".txt"
            backgrounds = _chooseBackgrounds(ropt, targets)
        else:
            outfilename = outs + ".txt"

        # The context manager guarantees the report is flushed and closed
        # even if an alignment fails to parse part-way through the scan.
        with open(outfilename, "w") as outfile:
            outfile.write(
                "# ==============================================================================================\n"
            )
            outfile.write("# \t\t\tConvergence testing\n")
            outfile.write("# \t\t\t" + core.getDateTime() + "\n")
            outfile.write("# Using alignments in:\t\t" + ins + "\n")
            outfile.write("# Target species:\t\t\t" + ", ".join(targets) +
                          "\n")
            if ropt != 0:
                outfile.write("# Randomly choosing " + str(ropt) +
                              " background species and performing " + str(c) +
                              " replicate tests for convergence.\n")
                outfile.write("# This is replicate number " + str(cur_c + 1) +
                              "\n")
            outfile.write("# Background species:\t\t" +
                          ", ".join(backgrounds) + "\n")
            outfile.write("# Writing output to:\t\t\t" + outfilename + "\n")
            if d == 0:
                outfile.write("# Checking for convergent sites.\n")
            elif d == 1:
                outfile.write("# Checking for divergent sites.\n")
            outfile.write("# ---------------------------------------------\n")

            aligns = os.listdir(ins)

            numsites = 0
            totgenes = 0
            outfile.write("# " + core.getTime() + " Starting Scan...\n")
            outfile.write(
                "# Site#\tChromosome\tGeneID\tAlignLen\tPosition\tTargetAlleles\tBackgroundAlleles\n"
            )
            for align in aligns:
                if ".fa" not in align:
                    continue

                # Gene ID is the first two underscore-delimited fields of the
                # file name; the chromosome label is the four characters
                # starting at the first "chr".
                gid = "_".join(align.split("_")[:2])
                chr_pos = align.find("chr")
                chrome = align[chr_pos:chr_pos + 4]

                # os.path.join works whether or not `ins` ends in a separator.
                inseqs = core.fastaGetDict(os.path.join(ins, align))
                if not inseqs:
                    continue

                # Resolve which title carries each species once per
                # alignment; the substring matching does not depend on the
                # column. When several titles match a species the last one
                # seen is used.
                t_titles = {}
                b_titles = {}
                num_targets_present = 0
                num_bg_present = 0
                for title in inseqs:
                    t_hits = [t for t in targets if t in title]
                    b_hits = [b for b in backgrounds if b in title]
                    if t_hits:
                        num_targets_present = num_targets_present + 1
                    if b_hits:
                        num_bg_present = num_bg_present + 1
                    for t in t_hits:
                        t_titles[t] = title
                    for b in b_hits:
                        b_titles[b] = title

                if num_targets_present != len(targets) or \
                        num_bg_present != len(backgrounds):
                    continue

                totgenes = totgenes + 1
                seqlen = len(inseqs[next(iter(inseqs))])

                t_seqs = [inseqs[title] for title in t_titles.values()]
                b_seqs = [inseqs[title] for title in b_titles.values()]

                for x in range(seqlen):
                    t_final = remGapMiss([seq[x] for seq in t_seqs])
                    b_final = remGapMiss([seq[x] for seq in b_seqs])

                    if not t_final or not b_final:
                        continue
                    # Every species must contribute a usable state.
                    if len(t_final) != len(targets) or \
                            len(b_final) != len(backgrounds):
                        continue
                    if not _siteMatches(d, t_final, b_final):
                        continue

                    numsites = numsites + 1
                    outfile.write("\t".join([
                        str(numsites), chrome, gid,
                        str(seqlen),
                        str(x + 1), "".join(t_final), "".join(b_final)
                    ]) + "\n")

            outfile.write("\n# " + core.getTime() + " Done!\n")
            outfile.write("# Total sites found: " + str(numsites) + "\n")
            outfile.write("# Total genes checked: " + str(totgenes) + "\n")
            outfile.write(
                "# =============================================================================================="
            )
        cur_c = cur_c + 1
    if ropt != 0:
        # Single-string form prints identically under Python 2 and 3.
        print(core.getTime() + " Replicates " + str(init_c) + " to " +
              str(c) + " complete.")


def _chooseBackgrounds(ropt, targets):
    # Draw `ropt` distinct species from all_specs that are not targets.
    pool = list(all_specs.values())

    # Check up front that enough species exist; otherwise the rejection loop
    # below could never finish.
    eligible = []
    for spec in pool:
        if spec not in targets and spec not in eligible:
            eligible.append(spec)
    if len(eligible) < ropt:
        raise ValueError("Cannot choose " + str(ropt) +
                         " background species: only " + str(len(eligible)) +
                         " non-target species are available.")

    chosen = []
    while len(chosen) < ropt:
        spec = random.choice(pool)
        if spec not in targets and spec not in chosen:
            chosen.append(spec)
    return chosen


def _siteMatches(d, t_final, b_final):
    # Apply the convergent (d == 0) or divergent (d == 1) test to one column.
    targets_uniform = t_final.count(t_final[0]) == len(t_final)
    if d == 0:
        return targets_uniform and t_final[0] not in b_final
    if d == 1:
        backgrounds_uniform = b_final.count(b_final[0]) == len(b_final)
        return (not targets_uniform and backgrounds_uniform
                and not any(t in b_final for t in t_final))
    return False