Example #1
def blast_against_each_genome(dir_path, processors, filter, peptides, blast, penalty, reward):
    """BLAST all peptides against each genome"""
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            subprocess.check_call("formatdb -i %s -p F > /dev/null 2>&1" % f, shell=True)
        if ".fasta.new" in f:
            cmd = ["blastall",
                   "-p", blast,
                   "-i", peptides,
                   "-d", f,
                   "-a", str(processors),
                   "-e", "0.1",
                   "-m", "8",
                   "-F", str(filter),
                   "-q", str(penalty),
                   "-r", str(reward),
                   "-o", "%s_blast.out" % f]
            subprocess.check_call(cmd)
            
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
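Every example in this listing funnels its per-file work through p_func.pmap, a project-internal parallel map rather than a standard-library function. A minimal sketch of what such a helper could look like, assuming a thread-backed pool (consistent with how later examples mutate closed-over lists from inside the workers):

from multiprocessing.dummy import Pool  # thread pool: workers share memory with the caller

def pmap(fn, iterable, num_workers=1):
    """Hypothetical stand-in for p_func.pmap: apply fn to each item in parallel."""
    pool = Pool(num_workers)
    try:
        return pool.map(fn, iterable)
    finally:
        pool.close()
        pool.join()
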
Example #2
def blast_against_each_genome(dir_path, processors, filter, peptides, blast, penalty, reward):
    """BLAST all peptides against each genome"""
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("formatdb -i %s -p F > /dev/null 2>&1" % f, shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = ["blastall",
                       "-p", blast,
                       "-i", peptides,
                       "-d", f,
                       "-a", str(processors),
                       "-e", "0.1",
                       "-m", "8",
                       "-F", str(filter),
                       "-q", str(penalty),
                       "-r", str(reward),
                       "-C", "F",
                       "-o", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "genomes %s cannot be used" % f
            
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
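Note the split error handling in this version: formatdb still runs under check_call, which raises CalledProcessError on a non-zero exit, while blastall runs under subprocess.call, whose return code is discarded, so the bare except around it only fires if the command cannot be launched at all. A sketch of catching the failure explicitly instead, assuming the same cmd list and devnull handle as above:

            try:
                subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
            except subprocess.CalledProcessError as e:
                print "blastall exited with code %s on %s" % (e.returncode, f)
            except OSError:
                print "blastall could not be launched; is it on your PATH?"
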
Example #3
def blast_against_each_genome(dir_path, processors, filter, peptides, blast,
                              penalty, reward):
    """BLAST all peptides against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]

    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            subprocess.check_call("formatdb -i %s -p F > /dev/null 2>&1" % f,
                                  shell=True)
        if ".fasta.new" in f:
            cmd = [
                "blastall", "-p", blast, "-i", peptides, "-d", f, "-a",
                str(processors), "-e", "0.1", "-m", "8", "-F",
                str(filter), "-q",
                str(penalty), "-r",
                str(reward), "-o",
                "%s_blast.out" % f
            ]
            subprocess.check_call(cmd)

    results = set(
        p_func.pmap(_perform_workflow,
                    files_and_temp_names,
                    num_workers=processors))
Example #4
def blast_against_each_genome_tblastn(dir_path, processors, peptides):
    """BLAST all peptides against each genome"""
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    devnull = open("/dev/null", "w")
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % f, shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = ["tblastn",
                       "-query", peptides,
                       "-db", f,
                       "-num_threads", str(processors),
                       "-evalue", "0.1",
                       "-outfmt", "6",
                       "-out", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "genomes %s cannot be used" % f
            
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
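Examples #2 and #4 are the same workflow written against the two generations of NCBI BLAST: legacy blastall/formatdb versus BLAST+ (tblastn/makeblastdb). For reference, the flag correspondence these examples rely on, following NCBI's documented legacy-to-BLAST+ mapping:

# legacy blastall flag    BLAST+ equivalent
# -p <program>            run the program binary directly (blastn, tblastn, ...)
# -i <query>              -query
# -d <database>           -db
# -a <threads>            -num_threads
# -e <evalue>             -evalue
# -m 8                    -outfmt 6 (tabular)
# -F <filter>             -seg (protein) / -dust (nucleotide)
# -q <penalty>            -penalty
# -r <reward>             -reward
# -o <outfile>            -out
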
Example #5
def make_table(processors, test, clusters):
    """make the BSR matrix table"""
    curr_dir=os.getcwd()
    names = [ ]
    outdata = [ ]
    files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    lock = threading.Lock()
    def _perform_workflow(data):
        lock.acquire()
        tn, f = data
        """get the name of each of the files to be iterated"""
        name=[ ]
        out=get_seq_name(f)
        name.append(out)
        reduced=[ ]
        """remove the junk at the end of the file"""
        for x in name:reduced.append(x.replace('.fasta.new_blast.out.filtered.filtered.unique',''))
        names.append(reduced)
        my_dict={}
        file=open(f, "rU")
        #tmpfile=open("tmp.txt", "w")
        """make a dictionary of all clusters and values"""
        try:
            for line in file:
                fields=line.split()
                my_dict.update({fields[0]:fields[1]})
        except:
            raise TypeError("abnormal number of fields")
        """add in values, including any potentially missing ones"""
        for x in clusters:
            if x not in my_dict.keys():my_dict.update({x:0})
        """need to write a blank space"""
        for x in reduced: open("%s.tmp.matrix" % x, 'a').write('%s\n' % x)
        """sort keys to get the same order between samples"""
        od = collections.OrderedDict(sorted(my_dict.items()))
        newout = open("%s.tmp.matrix" % "".join(reduced), "a")
        for k,v in od.iteritems():
            print >> newout,v
            if "T" in test:
                outdata.append(v)
        lock.release()
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
    names_out = open("names.txt", "w")
    for x in names: print >> names_out, "".join(x)
    """this makes sure that the ref.list file is
    in the same order as the tmp matrix"""
    nr_sorted=sorted(clusters)
    open("ref.list", "a").write("\n")
    for x in nr_sorted:
        open("ref.list", "a").write("%s\n" % x)
    names_out.close()
    if "T" in test:
        return sorted(outdata)
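Note that _perform_workflow acquires the lock as its first statement and releases it as its last, so the pmap above effectively processes one file at a time, and the lock is never released if the body raises. A sketch that holds the lock only around the shared-state updates; process_one_file is a hypothetical stand-in for the per-file parsing done above:

    def _perform_workflow(data):
        tn, f = data
        reduced, values = process_one_file(f)  # hypothetical: the parsing above, done without the lock
        with lock:  # serialize only the writes to the shared lists
            names.append(reduced)
            outdata.extend(values)
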
Example #6
def predict_genes(fastadir, processors):
    """simple gene prediction using Prodigal in order
    to find coding regions from a genome sequence"""
    os.chdir("%s" % fastadir)
    files = os.listdir(fastadir)
    files_and_temp_names = [(str(idx), os.path.join(fastadir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        subprocess.check_call("prodigal -i %s -d %s_genes.seqs -a %s_genes.pep > /dev/null 2>&1" % (f, f, f), shell=True)
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
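Since the Prodigal invocation is fixed apart from the file name, the same call can be made without a shell; a sketch using an argument list and explicit redirection (Prodigal's -i, -d, and -a flags take the input sequence, the nucleotide gene output, and the protein translation output, respectively):

    def _perform_workflow(data):
        tn, f = data
        with open(os.devnull, "w") as devnull:
            subprocess.check_call(["prodigal",
                                   "-i", f,
                                   "-d", "%s_genes.seqs" % f,
                                   "-a", "%s_genes.pep" % f],
                                  stdout=devnull, stderr=devnull)
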
Example #8
def new_loop(to_iterate, processors, clusters, debug):
    names = []
    table_list = []
    def _perform_workflow(data):
        tn, f = data
        name,values=make_table_dev(f, "F", clusters)
        names.append(name)
        table_list.append(values)
        if debug == "T":
            logging.logPrint("sample %s processed" % f)
        else:
            pass
    set(p_func.pmap(_perform_workflow,
                    to_iterate,
                    num_workers=processors))
    return names,table_list
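new_loop works by letting the workers append to the closed-over names and table_list, which is only safe if p_func.pmap is thread-based (list.append is atomic under the GIL) and the caller does not depend on result order. A sketch that returns values through the map instead of mutating shared state, assuming pmap returns results in input order:

def new_loop(to_iterate, processors, clusters, debug):
    def _perform_workflow(data):
        tn, f = data
        name, values = make_table_dev(f, "F", clusters)
        if debug == "T":
            logging.logPrint("sample %s processed" % f)
        return name, values
    results = p_func.pmap(_perform_workflow,
                          to_iterate,
                          num_workers=processors)
    names = [r[0] for r in results]
    table_list = [r[1] for r in results]
    return names, table_list
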
Example #10
def blat_against_each_genome(dir_path,database,processors):
    """BLAT all genes against each genome"""
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("blat -out=blast8 -minIdentity=75 %s %s %s_blast.out > /dev/null 2>&1" % (f,database,f), shell=True)
            except:
                print "genomes %s cannot be used" % f
            
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
Example #11
def blat_against_each_genome(dir_path,database,processors):
    """BLAT all genes against each genome"""
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("blat -out=blast8 -minIdentity=75 %s %s %s_blast.out > /dev/null 2>&1" % (f,database,f), shell=True)
            except:
                print("genomes %s cannot be used" % f)

    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
Example #12
def main(directory, reference, fastas, trunc, new, processors):
    curr_dir = os.getcwd()
    ordered = get_gene_order(reference)
    file_dir = glob.glob(os.path.join(fastas, "*fasta"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(file_dir)]

    def _perform_workflow(data):
        tn, f = data
        run_blast(directory, f)

    results = set(
        p_func.pmap(_perform_workflow,
                    files_and_temp_names,
                    num_workers=processors))
    print "blast finished!"
    genome_names = []
    for infile in glob.glob(os.path.join(fastas, '*.fasta')):
        name = get_seq_name(infile)
        genome_names.append(name)
    process_results(genome_names, ordered, trunc, new, reference)
    subprocess.check_call("rm *.out", shell=True)
Example #13
def blast_against_each_genome_blastn(dir_path, processors, filter, peptides, penalty, reward):
    """BLAST all peptides against each genome"""
    if "F" in filter:
        my_seg = "yes"
    else:
        my_seg = "no"
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % f, shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            devnull = open('/dev/null', 'w')
            try:
                cmd = ["blastn",
                       "-query", peptides,
                       "-db", f,
                       "-dust", str(my_seg),
                       "-num_threads", str(processors),
                       "-evalue", "0.1",
                       "-outfmt", "6",
                       "-penalty", str(penalty),
                       "-reward", str(reward),
                       "-out", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "The genome file %s was not processed" % f
            
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
Example #14
def blast_against_each_genome_tblastn(dir_path, processors, peptides, filter):
    """BLAST all peptides against each genome"""
    curr_dir=os.getcwd()
    files = os.listdir(curr_dir)
    devnull = open("/dev/null", "w")
    if "T" in filter:
        my_seg = "yes"
    else:
        my_seg = "no"
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % f, shell=True)
            except:
                print("problem found in formatting genome %s" % f)
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = ["tblastn",
                       "-query", peptides,
                       "-db", f,
                       "-seg", my_seg,
                       "-comp_based_stats", "F",
                       "-num_threads", "1",
                       "-evalue", "0.1",
                       "-outfmt", "6",
                       "-out", "%s_blast.out" % f]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print("genomes %s cannot be used" % f)

    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
Example #15
def blast_against_each_genome(dir_path, processors, filter, peptides, blast,
                              penalty, reward):
    """BLAST all peptides against each genome"""
    curr_dir = os.getcwd()
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]

    def _perform_workflow(data):
        tn, f = data
        if ".fasta.new" in f:
            try:
                subprocess.check_call("formatdb -i %s -p F > /dev/null 2>&1" %
                                      f,
                                      shell=True)
            except:
                print "problem found in formatting genome %s" % f
        if ".fasta.new" in f:
            try:
                devnull = open('/dev/null', 'w')
                cmd = [
                    "blastall", "-p", blast, "-i", peptides, "-d", f, "-a",
                    str(processors), "-e", "0.1", "-m", "8", "-F",
                    str(filter), "-q",
                    str(penalty), "-r",
                    str(reward), "-C", "F", "-o",
                    "%s_blast.out" % f
                ]
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except:
                print "genomes %s cannot be used" % f

    results = set(
        p_func.pmap(_perform_workflow,
                    files_and_temp_names,
                    num_workers=processors))
Example #16
def make_table(processors):
    """make the BSR matrix table"""
    clusters=[ ]
    curr_dir=os.getcwd()
    """I only use this loop to grab names...combine with next loop?
       I need the nr values before the next loop"""
    for infile in glob.glob(os.path.join(curr_dir, "*.filtered.unique")):
        file=open(infile, "rU")
        for line in file:
            fields = line.split()
            if fields[0] not in clusters:
                clusters.append(fields[0])
    """de-replicate the clusters"""
    nr=[x for i, x in enumerate(clusters) if x not in clusters[i+1:]]
    names = [ ]
    outdata = [ ]
    files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    lock = threading.Lock()
    def _perform_workflow(data):
        lock.acquire()
        tn, f = data
        """get the name of each of the files to be iterated"""
        name=[ ]
        out=get_seq_name(f)
        name.append(out)
        reduced=[ ]
        """remove the junk at the end of the file"""
        for x in name:reduced.append(x.replace('.fasta.new_blast.out.filtered.filtered.unique',''))
        names.append(reduced)
        dict={}
        file=open(f, "rU")
        tmpfile=open("tmp.txt", "w")
        """make a dictionary of all clusters and values"""
        try:
            for line in file:
                fields=line.split()
                dict.update({fields[0]:fields[1]})
        except:
            raise TypeError("abnormal number of fields")
        cluster_names={}
        """add in values, including any potentially missing ones"""
        for k,v in dict.iteritems():
            if k in nr: cluster_names.update({k:v})
        for x in nr:
            if x not in dict.keys():cluster_names.update({x:0})
        """need to write a blank space"""
        for x in reduced: open("%s.tmp.matrix" % x, 'a').write('%s\n' % x)
        """sort keys to get the same order between samples"""
        for key in sorted(cluster_names.iterkeys()):
            for x in reduced:
                open("%s.tmp.matrix" % x, 'a').write("%s\n" % cluster_names[key])
                outdata.append(cluster_names[key])
        lock.release()
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
    names_out = open("names.txt", "w")
    for x in names: print >> names_out, "".join(x)
    nr_sorted=sorted(nr)
    open("ref.list", "a").write("\n")
    for x in nr_sorted:
        open("ref.list", "a").write("%s\n" % x)
    return outdata, nr_sorted
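The de-replication idiom used here, nr=[x for i, x in enumerate(clusters) if x not in clusters[i+1:]], rescans the rest of the list for every element and is therefore quadratic in the number of clusters. An order-preserving, linear-time sketch using a seen-set:

def dedup_preserving_order(items):
    """Return items with duplicates dropped, keeping first occurrences in order."""
    seen = set()
    out = []
    for x in items:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out
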
Example #17
def find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors):
    curr_dir=os.getcwd()
    my_dict_o = {}
    dup_dict = {}
    paralogs = [ ]
    duplicate_file = open("duplicate_ids.txt", "w")
    paralog_file = open("paralog_ids.txt", "w")
    ref_file = open("dup_refs.txt", "w")
    genome_specific_list_of_lists = []
    target_list = []
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if "_blast.out" in f:
            genome_specific_dict = {}
            name = get_seq_name(f)
            reduced_name = name.replace(".fasta.new_blast.out","")
            genome_specific_dict.update({"ID":reduced_name})
            outfile = open("%s.counts.txt" % reduced_name, "w")
            try:
                for line in open(f, "U"):
                    fields = line.split()
                    if fields[0] not in ref_scores: pass
                    elif float(fields[2])>=int(min_hlog) and (float(fields[11])/float(ref_scores.get(fields[0])))>=float(length):
                        try:
                            my_dict_o[fields[0]].append(fields[11])
                            genome_specific_dict[fields[0]].append(fields[11])
                        except KeyError:
                            my_dict_o[fields[0]] = [fields[11]]
                            genome_specific_dict[fields[0]] = [fields[11]]
                    else:
                        continue
            except:
                raise TypeError("problem parsing %s" % f)
            new_dict = {}
            for k,v in genome_specific_dict.iteritems():
                for cluster in clusters:
                    if k == "ID":
                        pass
                    elif k == cluster:
                        try:
                            new_dict.update({k:len(v)})
                        except:
                            new_dict.update({k:"0"})
            for cluster in clusters:
                if cluster not in genome_specific_dict:
                    new_dict.update({cluster:"0"})
            od = collections.OrderedDict(sorted(new_dict.items()))
            ids = OrderedDict({"ID":reduced_name})
            both =OrderedDict(list(ids.items())+list(new_dict.items()))
            for k,v in both.iteritems():
                outfile.write(str(v)+"\n")
                if k in target_list:
                    pass
                else:
                    target_list.append(k)
            outfile.close()
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
    ref_file.write("\n".join(target_list)+"\n")
    ref_file.close()
    """known issue - if gene id is Capital and before "I", there can be a shuffling of IDs
    I need to sort the dictionary and keep the first item constant as ID"""
    try:
        generate_dup_matrix()
        os.system("paste dup_refs.txt dup_values > dup_matrix.txt")
    except:
        print("problem generating duplicate matrix, but we'll continue")
    for k,v in my_dict_o.iteritems():
        if int(len(v))>=2:
            dup_dict.update({k:v})
    for k,v in dup_dict.iteritems():
        max_value = max(v)
        for x in v:
            if float(x)/float(max_value)<=max_plog:
                paralogs.append(k)
            else:
                continue
    for k, v in dup_dict.iteritems():
        duplicate_file.write(k+"\n")
    nr=[x for i, x in enumerate(paralogs) if x not in paralogs[i+1:]]
    paralog_file.write("\n".join(nr)+"\n")
    duplicate_file.close()
    paralog_file.close()
    return nr, dup_dict
Example #18
def main(directory, genes, blast, processors, remove_gap, keep):
    dependencies = ['blastall','formatdb','muscle']
    for dependency in dependencies:
        ra = subprocess.call(['which', '%s' % dependency])
        if ra == 0:
            pass
        else:
            print "%s is not in your path, but needs to be!" % dependency
            sys.exit()
    start_dir = os.getcwd()
    ap=os.path.abspath("%s" % start_dir)
    dir_path=os.path.abspath("%s" % directory)
    try:
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    except:
        os.system("rm -rf %s/to_extract_xxx" % ap)
        os.system("rm -rf %s/work_xxx" % ap)
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    gene_path=os.path.abspath("%s" % genes)
    os.system("cp %s %s/to_extract_xxx/genes.fasta" % (gene_path,ap))
    os.chdir("%s/to_extract_xxx" % ap)
    split_multifasta("genes.fasta")
    os.system("rm genes.fasta")
    os.chdir("%s/work_xxx" % ap)
    """create combined file"""
    num_genomes, names = combined_seqs(dir_path)
    os.system("formatdb -i combined.seqs -p F")
    table_files = glob.glob(os.path.join("%s/to_extract_xxx" % ap, "*.fasta"))
    files_and_temp_names = [(str(idx), os.path.join("%s/to_extract_xxx" % ap, f))
                            for idx, f in enumerate(table_files)]
    def _perform_workflow(data):
        tn, f = data
        name = run_blast(f, blast)
        parse_blast_xml_report("%s.blast.out" % name)
        parsed_blast_to_seqs("%s.blast.unique" % name)
        check_and_align_seqs("%s.extracted.seqs" % name, num_genomes)
        os.system("rm %s.blast.out %s.blast.unique %s.extracted.seqs" % (name,name,name))
    set(p_func.pmap(_perform_workflow,
                    files_and_temp_names,
                    num_workers=processors))
    os.system("rm *.blast.out *.blast.unique *.extracted.seqs")
    pull_seqs(names)
    concatenate()
    os.system("cat *.concat > all.concat")
    os.system('sed "s/ //g" all.concat > tmp.concat')
    os.system("awk 'FNR>1' tmp.concat > all.concat")
    if remove_gap == "T":
        remove_gaps("all.concat")
        os.system("cp final_alignment.fasta %s" % ap)
    elif remove_gap == "F":
        os.system("cp all.concat %s/final_alignment.fasta" % ap)
    else:
        print "You have chosen an incorrect option for gap removal, choose from T or F"
        sys.exit()
    """finish up"""
    os.chdir("%s" % ap)
    if keep == "T":
        pass
    elif keep == "F":
        os.system("rm -rf %s/to_extract_xxx %s/work_xxx" % (ap,ap))
    else:
        print "Illegal keep value selected, not doing anything"
        pass
Example #19
    except OSError, e:
        if e.errno != errno.EEXIST:raise
    if "NULL" != reduce:
        reduce_path=os.path.abspath("%s" % reduce)
    effective_jobs = int(int(memory)/8000)
    if effective_jobs <=1:
        effective_jobs = 1
    effective_processors = int(int(processors)/effective_jobs)
    os.chdir("%s/work_directory" % start_dir) 
    def _perform_workflow(data):
        #tn, f = data
        f = data
        print data
        run_single_loop(f[1],f[2],f[0],f[3],f[7],f[5],start_path,f[6],f[8],UGAP_PATH,TRIM_PATH,PICARD_PATH,PILON_PATH,f[10],f[11])
    results = set(p_func.pmap(_perform_workflow,
                              datasets,
                              num_workers=effective_jobs))

if __name__ == "__main__":
    usage="usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-c", "--config", dest="config_file",
                      help="config file that populates the UGAP single assembly",
                      action="callback", callback=test_file, type="string")
    parser.add_option("-m", "--memory", dest="memory",
                      help="amount of memory on the server, defaults to 48G, enter 48000",
                      action="store", type="string", default="48000")
    options, args = parser.parse_args()
    mandatories = ["config_file"]
    for m in mandatories:
        if not options.__dict__[m]:
Example #20
File: util.py Project: fw1121/UGAP
def run_loop(fileSets,error_corrector,processors,keep,coverage,proportion,start_path,reduce):
    #Is this still relevant?
    files_and_temp_names = [(str(idx), list(f))
                            for idx, f in fileSets.iteritems()]
    lock = threading.Lock()
    def _perform_workflow(data):
        idx, f = data
        if "NULL" not in reduce:
            try:
                subprocess.check_call("bwa index %s > /dev/null 2>&1" % reduce, shell=True)
            except:
                print "problems with indexing input file"
                sys.exit()
            try:
                run_bwa("%s" % f[0], "%s" % f[1], processors, idx,"%s" % reduce)
                os.system("samtools view -bS %s.sam > %s.bam 2> /dev/null" % (idx,idx))
                os.system("bam2fastq -o %s#.fastq --no-aligned %s.bam > /dev/null 2>&1" % (idx,idx))
                os.system("gzip %s_1.fastq %s_2.fastq" % (idx,idx))
                os.system("cp %s_1.fastq.gz %s" % (idx,f[0]))
                os.system("cp %s_2.fastq.gz %s" % (idx,f[1]))
            except:
                print "problems depleting reads"
                sys.exit()
        else:
            pass
        if int(get_sequence_length(f[0], idx))<=200:
            args=['java','-jar','%s' % TRIM_PATH,'PE', '-threads', '%s' % processors,
                  '%s' % f[0], '%s' % f[1], '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
                  '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz', 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % UGAP_PATH,
                  'MINLEN:%s' % (int(get_sequence_length(f[0],idx)/2))]
            try:
                vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
            except:
                log_isg.logPrint('could not open trimmomatic file')
            try:
                log_fh = open('%s.trimmomatic.log' % idx, 'w')
            except:
                log_isg.logPrint('could not open log file')
            try:
                trim = Popen(args, stderr=vcf_fh, stdout=log_fh)
                trim.wait()
            except:
                log_isg.logPrint("problem encountered with trimmomatic")
            """assemble sequences with spades"""
            if error_corrector=="hammer":
                subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77 --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz  > /dev/null 2>&1" % (idx,processors,idx,idx), shell=True)
            elif error_corrector=="musket":
                ab = subprocess.call(['which', 'musket'])
                if ab == 0:
                    pass
                else:
                    print "musket isn't in your path, but needs to be!"
                    sys.exit()
                subprocess.check_call("musket -k 17 8000000 -p %s -omulti %s -inorder %s.F.paired.fastq.gz %s.R.paired.fastq.gz > /dev/null 2>&1" % (processors,idx,idx,idx), shell=True)
                subprocess.check_call("mv %s.0 %s.0.musket.fastq.gz" % (idx,idx), shell=True)
                subprocess.check_call("mv %s.1 %s.1.musket.fastq.gz" % (idx,idx), shell=True)
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77 --only-assembler --careful -1  %s.0.musket.fastq.gz -2 %s.1.musket.fastq.gz > /dev/null 2>&1" % (idx,processors,idx,idx), shell=True)
                except:
                    pass
            else:
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77 --only-assembler --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz > /dev/null 2>&1" % (idx,processors,idx,idx), shell=True)
                except:
                    pass
        elif int(get_sequence_length(f[0], idx))>200:
            args=['java','-jar','%s' % TRIM_PATH,'PE',
                  '%s' % f[0], '%s' % f[1], '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
                  '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz', 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % UGAP_PATH,
                  'MINLEN:150']
            try:
                vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
            except:
                log_isg.logPrint('could not open trimmomatic file')
            try:
                log_fh = open('%s.trimmomatic.log' % idx, 'w')
            except:
                log_isg.logPrint('could not open log file')
            try:
                trim = Popen(args, stderr=vcf_fh, stdout=log_fh)
                trim.wait()
            except:
                log_isg.logPrint("problem encountered with trimmomatic")
            """assemble sequences with spades"""
            if error_corrector=="hammer":
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77,127 --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz  > /dev/null 2>&1" % (idx,processors,idx,idx), shell=True)
                except:
                    pass
            elif error_corrector=="musket":
                ab = subprocess.call(['which', 'musket'])
                if ab == 0:
                    pass
                else:
                    print "musket isn't in your path, but needs to be!"
                    sys.exit()
                subprocess.check_call("musket -k 17 8000000 -p %s -omulti %s -inorder %s.F.paired.fastq.gz %s.R.paired.fastq.gz > /dev/null 2>&1" % (processors,idx,idx,idx), shell=True)
                subprocess.check_call("mv %s.0 %s.0.musket.fastq.gz" % (idx,idx), shell=True)
                subprocess.check_call("mv %s.1 %s.1.musket.fastq.gz" % (idx,idx), shell=True)
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77,127 --only-assembler --careful -1  %s.0.musket.fastq.gz -2 %s.1.musket.fastq.gz > /dev/null 2>&1" % (idx,processors,idx,idx), shell=True)
                except:
                    pass
            else:
                try:
                    subprocess.check_call("spades.py -o %s.spades -t %s -k 21,33,55,77,127 --only-assembler --careful -1 %s.F.paired.fastq.gz -2 %s.R.paired.fastq.gz > /dev/null 2>&1" % (idx,processors,idx,idx), shell=True)
                except:
                    pass
        else:
            pass
        try:
            os.system("gzip -dc %s.F.paired.fastq.gz > %s_1.fastq" % (idx,idx))
            os.system("gzip -dc %s.R.paired.fastq.gz > %s_2.fastq" % (idx,idx))
            os.system("cp %s.spades/contigs.fasta %s.spades.assembly.fasta" % (idx,idx))
            filter_seqs("%s.spades.assembly.fasta" % idx, keep, idx)
            """remove redundancies - will likely change in the near future"""
            os.system("%s/bin/psi-cd-hit.pl -i %s.%s.spades.assembly.fasta -o %s.%s.nr.spades.assembly.fasta -c 0.99999999 -G 1 -g 1 -prog blastn -exec local -l 500" % (UGAP_PATH,idx,keep,idx,keep))
            clean_fasta("%s.%s.nr.spades.assembly.fasta" % (idx,keep),"%s_pagit.fasta" % idx)
            rename_multifasta("%s_pagit.fasta" % idx, idx, "%s_renamed.fasta" % idx)
            subprocess.check_call("bwa index %s_renamed.fasta > /dev/null 2>&1" % idx, shell=True)
            os.system("samtools faidx %s_renamed.fasta" % idx)
            run_bwa("%s_1.fastq" % idx, "%s_2.fastq" % idx, processors, idx,"%s_renamed.fasta" % idx)
            make_bam("%s.sam" % idx, idx)
            os.system("java -jar %s/CreateSequenceDictionary.jar R=%s_renamed.fasta O=%s_renamed.dict > /dev/null 2>&1" % (PICARD_PATH, idx, idx))
            run_gatk("%s_renamed.fasta" % idx, processors, idx, "%s" % GATK_PATH)
            """run_bam_coverage stuff here"""
            os.system("java -jar %s/AddOrReplaceReadGroups.jar INPUT=%s_renamed.bam OUTPUT=%s_renamed_header.bam SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=illumina RGSM=%s RGPU=name CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT > /dev/null 2>&1" % (PICARD_PATH,idx,idx,idx,idx,idx))
            os.system("echo %s_renamed_header.bam > %s.bam.list" % (idx,idx))
            os.system("java -jar %s -R %s_renamed.fasta -T DepthOfCoverage -o %s_coverage -I %s.bam.list -rf BadCigar > /dev/null 2>&1" % (GATK_PATH,idx,idx,idx))
            process_coverage(idx)
        except:
            pass
        lock.acquire()
        try:
            to_fix=parse_vcf("%s.gatk.out" % idx, coverage, proportion)
            log_isg.logPrint("number of SNPs to fix in %s = %s" % (idx,len(to_fix)))
            if int(len(to_fix))>=1:
                try:
                    fasta_to_tab("%s_renamed.fasta" % idx, idx)
                    fix_assembly("%s.out.tab" % idx, to_fix, idx)
                    os.system("cp %s_corrected_assembly.fasta %s_renamed.fasta" % (idx,idx))
                except:
                    print "error correction failed for some reason"
            else:
                pass

        except:
            pass
        lock.release()
        try:
            os.system("java -jar %s --genome %s_renamed.fasta --frags %s_renamed.bam --output %s_pilon > /dev/null 2>&1" % (PILON_PATH,idx,idx,idx))
            rename_multifasta("%s_pilon.fasta" % idx, idx, "%s_final_assembly.fasta" % idx)
            os.system("prokka --prefix %s --locustag %s --compliant --mincontiglen %s --strain %s %s_final_assembly.fasta > /dev/null 2>&1" % (idx,idx,keep,idx,idx))
            filter_seqs("%s_final_assembly.fasta" % idx, keep, idx)
            os.system("sed -i 's/\\x0//g' %s.%s.spades.assembly.fasta" % (idx,keep))
            os.system("%s/cleanFasta.pl %s.%s.spades.assembly.fasta -o %s/UGAP_assembly_results/%s_final_assembly.fasta > /dev/null 2>&1" % (PICARD_PATH,idx,keep,start_path,idx))
            os.system("cp coverage_out.txt %s/UGAP_assembly_results" % start_path)
            try:
                os.system("cp %s/*.* %s/UGAP_assembly_results" % (idx,start_path))
            except:
                print "prokka not installed and annotation files were not copied over!"
                pass
        except:
            pass
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
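Nearly every stage of this assembly loop is wrapped in a bare except: pass, so a failed SPAdes, GATK, or Pilon run leaves no trace and the worker silently moves on to the next sample. A sketch of logging the failure before continuing, assuming the project's log_isg logger; shown for the GATK step as an illustration:

        import traceback
        try:
            run_gatk("%s_renamed.fasta" % idx, processors, idx, "%s" % GATK_PATH)
        except Exception:
            log_isg.logPrint("GATK failed for sample %s:\n%s" % (idx, traceback.format_exc()))
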
Example #21
def tree_loop(fasta_dict, combined, tree, parallel_workers, run_r, num_refs):
    def _temp_name(t, f):
        return t + '_' + f

    def _perform_workflow(data):
        tn, f = data
        outfile = open("%s.fasta" % tn, "w")
        outfile.write(">%s\n%s" % (tn,f))
        outfile.close()
        logging.debugPrint(lambda : "Processing sequence: %s" % tn)
        blast_against_reference("%s.fasta" % tn, combined, _temp_name(tn, "blast_parsed.txt"))
        subprocess.check_call("sort -u -k 2,2 %s > %s" % (_temp_name(tn, "blast_parsed.txt"),
                                                          _temp_name(tn, "blast_unique.parsed.txt")),
                              shell=True)
        parsed_blast_to_seqs(_temp_name(tn, "blast_unique.parsed.txt"), _temp_name(tn, "seqs_in.fas"))
        check_and_align_seqs(_temp_name(tn, "seqs_in.fas"), num_refs, _temp_name(tn, "seqs_aligned.fas"))
        if os.path.isfile(_temp_name(tn, "seqs_aligned.fas")):
            """What if there are NO SNPs in a given region"""
            #try:
            subprocess.call(['mothur',
                             '#filter.seqs(fasta=%s, soft=100, vertical=F)' % _temp_name(tn, "seqs_aligned.fas")],
                            stdout=subprocess.PIPE)
            subprocess.check_call('sed "s/[^1]/0/g" %s | sed "s/0/2/g" | sed "s/1/0/g" | sed "s/2/1/g" > %s' % (_temp_name(tn, "seqs_aligned.filter"),
                                                                                                                _temp_name(tn, "mask.txt")), shell=True)
            split_read(_temp_name(tn, "mask.txt"),_temp_name(tn, "padded.txt"))
            sum_qual_reads(_temp_name(tn, "padded.txt"), _temp_name(tn,"polys.txt"))
            #except:
            #    """This function was never created"""
            #    write_poly_zeros(_temp_name(tn, "padded.txt"), _temp_name(tn,"polys.txt"))
            if "T" == run_r:
                name = get_seq_name("%s.fasta" % tn)
                subprocess.check_call("cat snps.r | R --slave --args %s %s.table %s.pdf 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"), name, name),
                                      shell=True)
                os.system("mv %s.table ./R_output/%s.table.txt" % (name, name))
                os.system("mv %s.pdf ./R_output/%s.plots.pdf" % (name, name))
            else:
                pass
            subprocess.check_call("FastTree -nt -noboot %s > %s 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"),
                                                                                 _temp_name(tn, "tmp.tree")),
                                  shell=True)
            run_dendropy("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.RF")))
            run_dendropy_euclidian("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.EU")))
            get_contig_length("%s.fasta" % tn, _temp_name(tn, "length.txt"))
            thread_id = id(threading.current_thread())
            thread_distance_file = str(thread_id) + '_distance.txt'
            parse_rf_file(_temp_name(tn, "tmp.RF"), thread_distance_file)
            thread_euclidian_file = str(thread_id) + "_euc_dist.txt"
            parse_rf_file(_temp_name(tn, "tmp.EU"), thread_euclidian_file)
            thread_name_file = str(thread_id) + '_name.txt'
            write_strip_name("%s.fasta" % tn, thread_name_file)
            polys_name_file = str(thread_id) + '_polys.txt'
            parse_poly_file(_temp_name(tn, "polys.txt"), polys_name_file)
            length_name_file = str(thread_id) + '_length.txt'
            parse_poly_file(_temp_name(tn, "length.txt"), length_name_file)
            try:
                subprocess.check_call("rm mothur*", shell=True, stderr=open(os.devnull, 'w'))
            except:
                pass
            subprocess.check_call(["rm",
                                   _temp_name(tn, "blast_parsed.txt"),
                                   "%s.fasta" % tn,
                                   _temp_name(tn, "blast_unique.parsed.txt"),
                                   _temp_name(tn, "seqs_in.fas"),
                                   _temp_name(tn, "seqs_aligned.fas"),
                                   _temp_name(tn, "tmp.tree"),
                                   _temp_name(tn, "tmp.RF"),
                                   _temp_name(tn, "tmp.EU"),
                                   _temp_name(tn, "mask.txt"),
                                   _temp_name(tn, "padded.txt"),
                                   _temp_name(tn, "polys.txt"),
                                   _temp_name(tn, "seqs_aligned.filter"),
                                   _temp_name(tn, "length.txt"),
                                   _temp_name(tn, "seqs_aligned.filter.fasta")])
            return (thread_distance_file, thread_name_file, polys_name_file, length_name_file,
                    thread_euclidian_file)
        else:
            subprocess.check_call(["rm",
                                   _temp_name(tn, "blast_parsed.txt"),
                                   "%s.fasta" % tn,
                                   _temp_name(tn, "blast_unique.parsed.txt"),
                                   _temp_name(tn, "seqs_in.fas")])

    files_and_temp_names = [(str(idx), f)
                             for idx, f in fasta_dict.iteritems()]
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=parallel_workers))

    #I do this to make sure and remove any old files that are setting around
    subprocess.call("rm distance.txt name.txt polys.txt length.txt", shell=True, stderr=open(os.devnull, 'w'))

    for files in func.chunk(5, results):
        distances = []
        names = []
        polys = []
        lengths = []
        euc_dist = []
        for value in files:
            if value:
                distances.append(value[0])
                names.append(value[1])
                polys.append(value[2])
                lengths.append(value[3])
                euc_dist.append(value[4])
        if distances:
            subprocess.check_call("cat %s >> distance.txt" % " ".join(distances), shell=True)
            subprocess.check_call("cat %s >> name.txt" % " ".join(names), shell=True)
            subprocess.check_call("cat %s >> polys.txt" % " ".join(polys), shell=True)
            subprocess.check_call("cat %s >> length.txt" % " ".join(lengths), shell=True)
            subprocess.check_call("cat %s >> euc_dist.txt" % " ".join(euc_dist), shell=True)
            subprocess.check_call("rm %s" % " ".join(distances), shell=True)
            subprocess.check_call("rm %s" % " ".join(names), shell=True)
            subprocess.check_call("rm %s" % " ".join(polys), shell=True)
            subprocess.check_call("rm %s" % " ".join(lengths), shell=True)
            subprocess.check_call("rm %s" % " ".join(euc_dist), shell=True)
    paste_files("name.txt","distance.txt","euc_dist.txt","polys.txt","length.txt","all_distances.txt")
Example #22
def run_loop(fileSets, dir_path, reference, processors, gatk, ref_coords, coverage, proportion, matrix,ap,doc,tmp_dir,picard,trim_path,wgfast_path,trim):
    files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.iteritems()]
    lock = threading.Lock()
    def _perform_workflow(data):
        """idx is the sample name, f is the file dictionary"""
        idx, f = data
        if os.path.isfile("%s.tmp.xyx.matrix" % idx):
            pass
        else:
            if len(f)>1:
                if "T" in trim:
                    """paired end sequences - Hardcoded the number of processors per job to 2"""
                    args=['java','-jar','%s' % trim_path,'PE', '-threads', '2',
                          '%s' % f[0], '%s' % f[1], '%s.F.paired.fastq.gz' % idx, 'F.unpaired.fastq.gz',
                          '%s.R.paired.fastq.gz' % idx, 'R.unpaired.fastq.gz', 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path,
                          'MINLEN:%s' % int(get_sequence_length(f[0])/2)]
                    try:
                        vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open trimmomatic file')
                    try:
                        log_fh = open('%s.trimmomatic.log' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open log file')
                    if os.path.isfile("%s.F.paired.fastq.gz" % idx):
                        pass
                    else:
                        try:
                            trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                            trim_cmd.wait()
                        except:
                            log_isg.logPrint('problem encountered trying to run trimmomatic')
                else:
                    os.link(f[0], "%s.F.paired.fastq.gz" % idx)
                    os.link(f[1], "%s.R.paired.fastq.gz" % idx)
                if os.path.isfile("%s_renamed_header.bam" % idx):
                    pass
                else:
                     run_bwa(reference, '%s.F.paired.fastq.gz' % idx, '%s.R.paired.fastq.gz' % idx, processors, idx)
            else:
                if "T" in trim:
                    """single end support"""
                    args=['java','-jar','%s' % trim_path,'SE', '-threads', '2',
                          '%s' % f[0], '%s.single.fastq.gz' % idx, 'ILLUMINACLIP:%s/bin/illumina_adapters_all.fasta:2:30:10' % wgfast_path,
                          'MINLEN:%s' % int(get_sequence_length(f[0])/2)]
                    try:
                        vcf_fh = open('%s.trimmomatic.out' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open trimmomatic file')
                    try:
                        log_fh = open('%s.trimmomatic.log' % idx, 'w')
                    except:
                        log_isg.logPrint('could not open log file')
                    if os.path.isfile("%s.single.fastq.gz" % idx):
                        pass
                    else:
                        try:
                            trim_cmd = Popen(args, stderr=vcf_fh, stdout=log_fh)
                            trim_cmd.wait()
                        except:
                            log_isg.logPrint("problem encountered with trimmomatic")
                else:
                    os.link(f[0], "%s.single.fastq.gz" % idx)
                if os.path.isfile("%s_renamed_header.bam" % idx):
                    pass
                else:
                    run_bwa(reference, '%s.single.fastq.gz' % idx, "NULL", processors, idx)
            if os.path.isfile("%s_renamed_header.bam" % idx):
                pass
            else:
                process_sam("%s.sam" % idx, idx)
                """inserts read group information, required by new versions of GATK"""
                os.system("java -jar %s INPUT=%s.bam OUTPUT=%s_renamed_header.bam SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=illumina RGSM=%s RGPU=name CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT > /dev/null 2>&1" % (picard,idx,idx,idx,idx,idx))
                os.system("samtools index %s_renamed_header.bam > /dev/null 2>&1" % idx)
            run_gatk(reference, processors, idx, gatk, tmp_dir)
            if "T" == doc:
                lock.acquire()
                os.system("echo %s_renamed_header.bam > %s.bam.list" % (idx,idx))
                os.system("java -Djava.io.tmpdir=%s -jar %s -R %s/scratch/reference.fasta -T DepthOfCoverage -o %s_coverage -I %s.bam.list -rf BadCigar > /dev/null 2>&1" % (tmp_dir,gatk,ap,idx,idx))
                lock.release()
                process_coverage(idx)
            else:
                pass
            process_vcf("%s.vcf.out" % idx, ref_coords, coverage, proportion, idx)
            make_temp_matrix("%s.filtered.vcf" % idx, matrix, idx)
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
Example #23
    effective_jobs = int(int(memory) / 8000)
    if effective_jobs <= 1:
        effective_jobs = 1
    effective_processors = int(int(processors) / effective_jobs)
    os.chdir("%s/ugap_work_directory" % start_dir)
    keep_stuff = []

    def _perform_workflow(data):
        f = data
        run_single_loop(f[1], f[2], f[0], f[3], f[7], f[4], start_path, f[6],
                        f[8], UGAP_PATH, TRIM_PATH, PICARD_PATH, PILON_PATH,
                        f[10], f[11])
        keep_stuff.append(f[5])

    results = set(
        p_func.pmap(_perform_workflow, datasets, num_workers=effective_jobs))
    if "F" in keep_stuff:
        pass
    else:
        os.system("rm -rf ugap_work_directory")


if __name__ == "__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-c",
        "--config",
        dest="config_file",
        help="config file that populates the UGAP single assembly",
        action="callback",
Example #24
def make_table(processors):
    """make the BSR matrix table"""
    clusters = []
    curr_dir = os.getcwd()
    """I only use this loop to grab names...combine with next loop?
       I need the nr values before the next loop"""
    for infile in glob.glob(os.path.join(curr_dir, "*.filtered.unique")):
        file = open(infile, "rU")
        for line in file:
            fields = line.split()
            if fields[0] not in clusters:
                clusters.append(fields[0])
    """de-replicate the clusters"""
    nr = [x for i, x in enumerate(clusters) if x not in clusters[i + 1:]]
    names = []
    outdata = []
    files = glob.glob(os.path.join(curr_dir, "*.filtered.unique"))
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    lock = threading.Lock()

    def _perform_workflow(data):
        lock.acquire()
        tn, f = data
        """get the name of each of the files to be iterated"""
        name = []
        out = get_seq_name(f)
        name.append(out)
        reduced = []
        """remove the junk at the end of the file"""
        for x in name:
            reduced.append(
                x.replace('.fasta.new_blast.out.filtered.filtered.unique', ''))
        names.append(reduced)
        dict = {}
        file = open(f, "rU")
        tmpfile = open("tmp.txt", "w")
        """make a dictionary of all clusters and values"""
        try:
            for line in file:
                fields = line.split()
                dict.update({fields[0]: fields[1]})
        except:
            raise TypeError("abnormal number of fields")
        cluster_names = {}
        """add in values, including any potentially missing ones"""
        for k, v in dict.iteritems():
            if k in nr: cluster_names.update({k: v})
        for x in nr:
            if x not in dict.keys(): cluster_names.update({x: 0})
        """need to write a blank space"""
        for x in reduced:
            open("%s.tmp.matrix" % x, 'a').write('%s\n' % x)
        """sort keys to get the same order between samples"""
        for key in sorted(cluster_names.iterkeys()):
            for x in reduced:
                open("%s.tmp.matrix" % x,
                     'a').write("%s\n" % cluster_names[key])
                outdata.append(cluster_names[key])
        lock.release()

    results = set(
        p_func.pmap(_perform_workflow,
                    files_and_temp_names,
                    num_workers=processors))
    names_out = open("names.txt", "w")
    for x in names:
        print >> names_out, "".join(x)
    nr_sorted = sorted(nr)
    open("ref.list", "a").write("\n")
    for x in nr_sorted:
        open("ref.list", "a").write("%s\n" % x)
    return outdata, nr_sorted
Example #25
def find_dups_dev(ref_scores, length, max_plog, min_hlog, clusters, processors):
    curr_dir=os.getcwd()
    my_dict_o = {}
    dup_dict = {}
    paralogs = [ ]
    duplicate_file = open("duplicate_ids.txt", "w")
    paralog_file = open("paralog_ids.txt", "w")
    ref_file = open("dup_refs.txt", "w")
    genome_specific_list_of_lists = []
    target_list = []
    ordered_target_list = []
    files = os.listdir(curr_dir)
    files_and_temp_names = [(str(idx), os.path.join(curr_dir, f))
                            for idx, f in enumerate(files)]
    def _perform_workflow(data):
        tn, f = data
        if "_blast.out" in f:
            genome_specific_dict = {}
            name = get_seq_name(f)
            reduced_name = name.replace(".fasta.new_blast.out","")
            genome_specific_dict.update({"ID":reduced_name})
            outfile = open("%s.counts.txt" % reduced_name, "w")
            try:
                for line in open(f, "U"):
                    newline = line.strip()
                    fields = newline.split()
                    """Each blast query should be in the reference blast file"""
                    if fields[0] not in ref_scores:
                        print("potential problem found with BLAST File..")
                        sys.exit()
                    elif float(fields[2])>=int(min_hlog) and (float(fields[11])/float(ref_scores.get(fields[0])))>=float(length):
                        try:
                            my_dict_o[fields[0]].append(fields[11])
                            genome_specific_dict[fields[0]].append(fields[11])
                        except KeyError:
                            my_dict_o[fields[0]] = [fields[11]]
                            genome_specific_dict[fields[0]] = [fields[11]]
                    else:
                        continue
            except:
                raise TypeError("problem parsing %s" % f)
            new_dict = {}
            for k,v in genome_specific_dict.iteritems():
                for cluster in clusters:
                    if k == "ID":
                        pass
                    elif k == cluster:
                        try:
                            new_dict.update({k:len(v)})
                        except:
                            new_dict.update({k:"0"})
            for cluster in clusters:
                if cluster not in genome_specific_dict:
                    new_dict.update({cluster:"0"})
            """this is our ordered dictionary"""
            od = collections.OrderedDict(sorted(new_dict.items()))
            ids = OrderedDict({"ID":reduced_name})
            both =OrderedDict(list(ids.items())+list(new_dict.items()))
            for k,v in both.iteritems():
                if k == "ID":
                    outfile.write(str(v)+"\n")
            for cluster in clusters:
                for k,v in both.iteritems():
                    if k == cluster:
                        outfile.write(str(v)+"\n")
            outfile.close()
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=processors))
    """Here's where I write to the reference file, which is the first column of dup_matrix.txt"""
    ref_file.write("ID"+"\n")
    ref_file.write("\n".join(clusters)+"\n")
    ref_file.close()
    try:
        generate_dup_matrix()
        os.system("paste dup_refs.txt dup_values > dup_matrix.txt")
    except:
        print("problem generating duplicate matrix, but we'll continue")
    for k,v in my_dict_o.iteritems():
        if int(len(v))>=2:
            dup_dict.update({k:v})
    for k,v in dup_dict.iteritems():
        max_value = max(v)
        for x in v:
            if float(x)/float(max_value)<=max_plog:
                paralogs.append(k)
            else:
                continue
    for k, v in dup_dict.iteritems():
        duplicate_file.write(k+"\n")
    nr=[x for i, x in enumerate(paralogs) if x not in paralogs[i+1:]]
    paralog_file.write("\n".join(nr)+"\n")
    duplicate_file.close()
    paralog_file.close()
    return nr, dup_dict
Example #26
def main(directory, genes, blast, processors, remove_gap, keep):
    if blast == 'blastn':
        dependencies = ['blastn','makeblastdb','muscle']
    else:
        dependencies = ['tblastn','makeblastdb','muscle']
    for dependency in dependencies:
        ra = subprocess.call(['which', '%s' % dependency])
        if ra == 0:
            pass
        else:
            print "%s is not in your path, but needs to be!" % dependency
            sys.exit()
    start_dir = os.getcwd()
    ap=os.path.abspath("%s" % start_dir)
    dir_path=os.path.abspath("%s" % directory)
    try:
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    except:
        os.system("rm -rf %s/to_extract_xxx" % ap)
        os.system("rm -rf %s/work_xxx" % ap)
        os.makedirs('%s/to_extract_xxx' % ap)
        os.makedirs('%s/work_xxx' % ap)
    gene_path=os.path.abspath("%s" % genes)
    os.system("cp %s %s/to_extract_xxx/genes.fasta" % (gene_path,ap))
    os.chdir("%s/to_extract_xxx" % ap)
    split_multifasta("genes.fasta")
    os.system("rm genes.fasta")
    os.chdir("%s/work_xxx" % ap)
    """create combined file"""
    num_genomes, names = combined_seqs(dir_path)
    os.system("makeblastdb -in combined.seqs -dbtype nucl > /dev/null 2>&1")
    table_files = glob.glob(os.path.join("%s/to_extract_xxx" % ap, "*.fasta"))
    files_and_temp_names = [(str(idx), os.path.join("%s/to_extract_xxx" % ap, f))
                            for idx, f in enumerate(table_files)]
    def _perform_workflow(data):
        tn, f = data
        name = run_blast(f, blast)
        """This makes sure that there is only one sequence per genome"""
        os.system("sort -u -k 2,2 '%s.blast.out' > '%s.blast.unique'" % (name,name))
        parsed_blast_to_seqs("%s.blast.unique" % name)
        check_and_align_seqs("%s.extracted.seqs" % name, num_genomes)
        os.system("rm '%s.blast.out' '%s.extracted.seqs'" % (name,name))
    set(p_func.pmap(_perform_workflow,
                    files_and_temp_names,
                    num_workers=processors))
    pull_seqs(names)
    concatenate()
    os.system("cat *.concat > all.concat")
    os.system('sed "s/ //g" all.concat > tmp.concat')
    os.system("awk 'FNR>1' tmp.concat > all.concat")
    if remove_gap == "T":
        remove_gaps("all.concat")
        os.system("cp final_alignment.fasta %s" % ap)
    elif remove_gap == "F":
        os.system("cp all.concat %s/final_alignment.fasta" % ap)
    else:
        print "You have chosen an incorrect option for gap removal, choose from T or F"
        sys.exit()
    """finish up"""
    os.chdir("%s" % ap)
    if keep == "T":
        pass
    elif keep == "F":
        os.system("rm -rf %s/to_extract_xxx %s/work_xxx" % (ap,ap))
    else:
        print "Illegal keep value selected, not doing anything"
        pass