Code example #1
def fasta_to_tree(DIR,
                  fasta,
                  num_cores,
                  seqtype,
                  num_seq_cutoff=NUM_SEQ_CUTOFF):
    """
	given a fasta file
	align, trim alignment and build a tree
	choose appropriate tools depending on size of the fasta file
	"""
    if DIR[-1] != "/": DIR += "/"
    seqcount, maxlen = get_fasta_size(DIR + fasta)
    assert seqcount >= 4, "Less than four sequences in " + DIR + fasta
    print fasta, seqcount, "sequences"
    if seqcount >= num_seq_cutoff:  # large cluster
        print "running pasta"
        alignment = pasta(DIR, fasta, num_cores, seqtype)
        cleaned = pxclsq(DIR, alignment, 0.01, seqtype)
        if len(read_fasta_file(DIR + cleaned)) >= 4:
            tree = fasttree(DIR, cleaned, seqtype)
        else:
            print "Less than 4 taxa in", cleaned
    else:  # small cluster
        alignment = mafft(DIR, fasta, num_cores, seqtype)
        cleaned = pxclsq(DIR, alignment, 0.1, seqtype)
        if len(read_fasta_file(DIR + cleaned)) >= 4:
            tree = raxml(DIR, cleaned, num_cores, seqtype)
        else:
            print "Less than 4 taxa in", cleaned
Code example #2
def refine(query_fasta, start_fasta, deep_paralog_cutoff, num_cores):
    gene_name = get_filename_from_path(query_fasta)[1].split(".")[0]
    outdir, fasta = get_filename_from_path(start_fasta)
    #print outdir,fasta
    deep_paralog_cutoff = float(deep_paralog_cutoff)
    query_ids = [s.name for s in seq.read_fasta_file(query_fasta)]
    new_fasta = []  # list of output refined fasta files
    print outdir, fasta

    # make a tree from the start_fasta
    tree = fasta_to_tree.fasta_to_tree(outdir, fasta, num_cores, "aa")
    if tree is None: return []
    with open(tree, "r") as infile:
        intree = newick3.parse(infile.readline())
    root = trim_tips.trim(intree,
                          relative_cutoff=deep_paralog_cutoff,
                          absolute_cutoff=deep_paralog_cutoff * 2)
    if os.path.exists(outdir + fasta + ".pasta.aln-cln"):
        clnfile = outdir + fasta + ".pasta.aln-cln"
    else:
        clnfile = outdir + fasta + ".mafft.aln-cln"
    root = mask_tips_by_taxonID_transcripts.mask(root,
                                                 clnfile=clnfile,
                                                 para="y",
                                                 ignore=GENOMES)
    if root is not None:
        with open(tree + ".tt.mm", "w") as outfile:
            outfile.write(newick3.tostring(root) + "\n")
        subtrees = cut_long_internal_branches.cut_long_internal_branches(
            root, cutoff=deep_paralog_cutoff)
        count = 0
        base_name = fasta.split(".")[0]
        seqDICT = {}  # key is seqid, value is seq
        for s in seq.read_fasta_file(start_fasta):
            seqDICT[s.name] = s.seq
        for tree in subtrees:
            if tree is None: continue
            label_set = set(tree_utils.get_front_labels(tree))
            if len(label_set) > 4 and len(label_set & set(query_ids)) > 0:
                count += 1
                with open(outdir + base_name + "_" + str(count) + ".subtree",
                          "w") as outfile:
                    outfile.write(newick3.tostring(tree) + ";\n")
                with open(outdir + base_name + "_" + str(count) + ".fa",
                          "w") as outfile:
                    for seqid in tree_utils.get_front_labels(tree):
                        try:
                            outfile.write(">" + seqid + "\n" + seqDICT[seqid] +
                                          "\n")
                        except KeyError:
                            print seqid, "not found in fasta file"
                new_fasta.append(outdir + base_name + "_" + str(count) + ".fa")

    return new_fasta
Code example #3
def mcl_to_fasta(all_fasta, mcl_outfile, minimal_taxa, outdir):
    print "Reading mcl output file"
    clusterDICT = {}  #key is seqID, value is clusterID
    count = 0
    if outdir[-1] != "/": outdir += "/"
    with open(mcl_outfile, "rU") as infile:
        for line in infile:
            if len(line) < 3: continue  #ignore empty lines
            spls = line.strip().split('\t')
            if len(set(i.split("@")[0] for i in spls)) >= minimal_taxa:
                count += 1
                clusterID = str(count)
                for seqID in spls:
                    clusterDICT[seqID] = clusterID
    print count, "clusters with at least", minimal_taxa, "taxa read"

    print "Reading the fasta file", all_fasta
    #handle = open(all_fasta,"rU")
    #for record in SeqIO.parse(handle,"fasta"):
    for s in read_fasta_file(all_fasta):
        #seqid,seq = str(record.id),str(record.seq)
        seqid, seq = s.name, s.seq
        try:
            clusterID = clusterDICT[seqid]
            with open(outdir + "cluster" + clusterID + ".fa", "a") as outfile:
                outfile.write(">" + seqid + "\n" + seq + "\n")
        except KeyError:
            pass  # seqs that did not go in a cluster with enough taxa
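
For reference, each line of an MCL output file is one cluster: tab-separated sequence IDs of the form taxonID@seqID, which is why the taxon count is taken from i.split("@")[0]. A hypothetical two-line input illustrates the filter:

# taxonA@1	taxonB@12	taxonC@7    <- 3 distinct taxa; kept when minimal_taxa <= 3, seqs appended to cluster1.fa
# taxonA@3	taxonA@9                <- 1 distinct taxon; its seqs are silently skipped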
Code example #4
def ortho_to_aln(alndir, tredir, outdir, ortho_tree_file_ending=".tre"):
    """
	Read final homolog
	write individual alignment files for each ortholog
	Shorten seq id to taxon id
	"""
    if alndir[-1] != "/": alndir += "/"
    if tredir[-1] != "/": tredir += "/"
    if outdir[-1] != "/": outdir += "/"
    filecount = 0
    for i in os.listdir(tredir):
        if i.endswith(ortho_tree_file_ending):
            filecount += 1
            print i
            #read in the alignment into a dictionary
            seqDICT = {}  #key is seqID, value is seq
            for s in read_fasta_file(alndir + i.split(".")[0] +
                                     ".fa.mafft.aln"):
                seqDICT[s.name] = s.seq

            #read in tree tips and write output alignment
            with open(tredir + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            labels = tree_utils.get_front_labels(intree)
            with open(outdir + i.replace(ortho_tree_file_ending, ".aln"),
                      "w") as outfile:
                for lab in labels:
                    outfile.write(">" + tree_utils.get_name(lab) + "\n" +
                                  seqDICT[lab] + "\n")
    assert filecount > 0,\
     "No file ends with "+ortho_tree_file_ending+" was found in "+tredir
Code example #5
def phyutility(DIR, alignment, min_col_occup, seqtype, min_chr=10):
    """
	remove columns with occupancy lower than MIN_COLUMN_OCCUPANCY
	remove seqs shorter than MIN_CHR after filter columns
	"""
    if DIR[-1] != "/": DIR += "/"
    cleaned = alignment + "-cln"
    if os.path.exists(DIR + cleaned): return cleaned
    assert alignment.endswith(".aln"),\
     "phyutility infile "+alignment+" not ends with .aln"
    assert os.stat(DIR + alignment).st_size > 0, DIR + alignment + "empty"
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"

    if seqtype == "aa":
        cmd = ["phyutility","-aa","-clean",str(min_col_occup),"-in",\
            DIR+alignment,"-out",DIR+alignment+"-pht"]
    else:
        cmd = ["phyutility","-clean",str(min_col_occup),"-in",\
            DIR+alignment,"-out",DIR+alignment+"-pht"]
    print " ".join(cmd)
    os.system(" ".join(cmd))
    assert os.path.exists(DIR + alignment + "-pht"), "Error phyutility"

    #remove empty and very short seqs
    outfile = open(DIR + cleaned, "w")
    for s in read_fasta_file(DIR + alignment + "-pht"):
        if len(s.seq.replace("-", "")) >= min_chr:
            outfile.write(s.get_fasta())
    outfile.close()
    os.remove(DIR + alignment + "-pht")
    return cleaned
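
A usage sketch with hypothetical file names (fasta_to_bs_tree below calls this wrapper with an occupancy of 0.2):

# keep columns with >= 30% occupancy, then drop seqs with fewer than 10 unambiguous characters
cleaned = phyutility("aln_dir", "cluster1.fa.mafft.aln", 0.3, "aa")
# returns the file name "cluster1.fa.mafft.aln-cln"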
Code example #6
def main(fasta, treDIR, tree_file_ending, outDIR):
    if treDIR[-1] != "/": treDIR += "/"
    if outDIR[-1] != "/": outDIR += "/"
    print "Reading fasta file", fasta
    seqDICT = {}  #key is seqID, value is seq
    for s in read_fasta_file(fasta):
        seqDICT[s.name] = s.seq
    print "Writing fasta files"
    filecount = 0
    for i in os.listdir(treDIR):
        if i.endswith(tree_file_ending):
            print i
            filecount += 1
            with open(treDIR + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            clusterID = tree_utils.get_clusterID(i)
            if clusterID.endswith("rr"):
                outname = outDIR + clusterID + "_rr.fa"
            else:
                outname = outDIR + clusterID + "rr.fa"
            with open(outname, "w") as outfile:
                for label in tree_utils.get_front_labels(intree):
                    outfile.write(">" + label + "\n" + seqDICT[label] + "\n")
    assert filecount > 0,\
     "No file ends with "+tree_file_ending+" found in "+treDIR
Code example #7
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING):
    if treDIR[-1] != "/": treDIR += "/"
    if clnDIR[-1] != "/": clnDIR += "/"
    assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)"
    mask_para = True if para == "y" else False
    filecount = 0

    filematch = {}  #key is clusterID, value is the .aln-cln file
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            clusterID = get_clusterID(i)
            assert clusterID not in filematch, \
             "The clusterID "+clusterID+" repeats in "+clnDIR
            filematch[clusterID] = i

    for i in os.listdir(treDIR):
        if i.endswith(intree_file_ending):
            with open(treDIR + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            print i
            clusterID = get_clusterID(i)
            filecount += 1
            chrDICT = {}  #key is seqid, value is number of unambiguous chrs
            for s in read_fasta_file(clnDIR + filematch[clusterID]):
                for ch in ['-', 'X', "x", "?", "*"]:
                    s.seq = s.seq.replace(ch, "")  #ignore gaps, xs and Xs
                chrDICT[s.name] = len(s.seq)
            curroot = mask_monophyletic_tips(intree, chrDICT)
            if mask_para: curroot = mask_paraphyletic_tips(curroot, chrDICT)
            with open(treDIR + i + ".mm", "w") as outfile:
                outfile.write(newick3.tostring(curroot) + ";\n")
    assert filecount > 0, \
     "No file ends with "+intree_file_ending+" found in "+treDIR
Code example #8
def raxml_bs(DIR, cleaned, num_cores, seqtype, replicates=100):
    assert cleaned.endswith(".aln-cln"),\
     "raxml infile "+cleaned+" not ends with .aln-cln"
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    assert len(read_fasta_file(DIR+cleaned)) >= 4,\
     "less than 4 sequences in "+DIR+cleaned
    clusterID = cleaned.split(".")[0]
    tree = DIR + clusterID + ".raxml_bs.tre"
    raw_tree = "RAxML_bipartitions." + cleaned
    model = "PROTCATWAG" if seqtype == "aa" else "GTRCAT"
    if not os.path.exists(tree) and not os.path.exists(raw_tree):
        # raxml crashes if input file starts with .
        infasta = cleaned if DIR == "./" else DIR + cleaned
        cmd = ["raxml","-T",str(num_cores),\
            "-f","a","-x","12345","-#",str(replicates),\
            "-p","12345","-s",infasta,"-n",cleaned,"-m",model]
        print " ".join(cmd)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out = p.communicate()
        assert p.returncode == 0, "Error raxml " + out[0]
        try:
            os.rename(raw_tree, tree)
            os.rename("RAxML_bootstrap." + cleaned,
                      DIR + clusterID + ".raxml_bs.trees")
            os.remove("RAxML_bestTree." + cleaned)
            os.remove("RAxML_info." + cleaned)
            os.remove("RAxML_log." + cleaned)
            os.remove("RAxML_parsimonyTree." + cleaned)
            os.remove("RAxML_result." + cleaned)
            os.remove("RAxML_bipartitionsBranchLabels." + cleaned)
            os.remove(DIR + cleaned + ".reduced")
        except:
            pass  # no need to worry about extra intermediate files
        os.remove("RAxML_bipartitionsBranchLabels." + cleaned)
    return tree
Code example #9
def get_fasta_size(fasta):
    """
	given a fasta file
	output the number of seqs and the length of the longest seq
	"""
    longest = 0
    seqlist = read_fasta_file(fasta)
    for s in seqlist:
        longest = max(longest, len(s.seq.replace("-", "")))
    return len(seqlist), longest
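
A quick illustration with a hypothetical two-sequence alignment: gaps are stripped before measuring, so the reported maximum is the longest ungapped sequence.

# seqs.fa (hypothetical):
#   >a
#   ATG---CC    -> ungapped length 5
#   >b
#   ATGTTTCC    -> ungapped length 8
# get_fasta_size("seqs.fa") returns (2, 8)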
Code example #10
def fasta_ok(fasta, min_count=MIN_FASTA):
    """count number of non-empty fasta sequences"""
    if not os.path.exists(fasta):
        return False
    fasta_count = 0
    for i in seq.read_fasta_file(fasta):
        if len(i.seq) > 0: fasta_count += 1
    print fasta, "contains", fasta_count, "non-empty sequences"
    return fasta_count >= min_count
Code example #11
def mask(curroot, clnfile, para, ignore=[]):
    chrDICT = {}  #key is seqid, value is number of unambiguous chrs
    for s in read_fasta_file(clnfile):
        for ch in ['-', 'X', "x", "?", "*"]:
            s.seq = s.seq.replace(ch, "")  #ignore gaps, xs and Xs
        chrDICT[s.name] = len(s.seq)
    curroot = mask_monophyletic_tips(curroot, chrDICT, ignore)
    if para:
        curroot = mask_paraphyletic_tips(curroot, chrDICT, ignore)
    return curroot
Code example #12
def fasta_to_tree(DIR,
                  fasta,
                  num_cores,
                  seqtype,
                  num_seq_cutoff=NUM_SEQ_CUTOFF):
    """
	given a fasta file
	align, trim alignment and build a tree
	choose appropriate tools depending on size of the fasta file
	"""
    if DIR[-1] != "/": DIR += "/"
    seqcount, maxlen = get_fasta_size(DIR + fasta)
    assert seqcount >= 3, "Less than three sequences in " + DIR + fasta
    print fasta, seqcount, "sequences"
    if seqcount >= num_seq_cutoff:  # large cluster
        alignment = pasta(DIR, fasta, num_cores, seqtype)
        #cleaned = phyutility(DIR, alignment, 0.01, seqtype)
        # use trimal instead (added by Tao); the trimal wrapper still needs to be defined
        cleaned = trimal(DIR, alignment, 0.5, 0.001)
        if len(read_fasta_file(DIR + cleaned)) >= 3:
            tree = fasttree(DIR, cleaned, seqtype)
        else:
            print "Less than 3 taxa in", cleaned
    else:  # small cluster
        alignment = mafft(DIR, fasta, num_cores, seqtype)
        #cleaned = phyutility(DIR, alignment, 0.1, seqtype)  # phyutility can only trim gaps (added by Tao)
        cleaned = trimal(DIR, alignment, 0.5, 0.001)  # use trimal instead; see note above
        seqcount, maxlen = get_fasta_size(DIR + cleaned)
        print cleaned, seqcount, "sequences"
        if len(read_fasta_file(DIR + cleaned)) >= 3:
            tree = fasttree(DIR, cleaned, seqtype)
            #tree = raxml(DIR, cleaned, num_cores, seqtype)
            #if len(read_fasta_file(DIR + cleaned)) == 3:  # added by Tao
            #    tree = fasttree(DIR, cleaned, seqtype)
        else:
            print "Less than 3 taxa in", cleaned
Code example #13
def fasta_to_bs_tree(DIR, fasta, num_cores, seqtype):
    """
	given a fasta file for the final homolog
	align, trim alignment and build a tree with bootstrap support
	"""
    if DIR[-1] != "/": DIR += "/"
    seqcount, maxlen = get_fasta_size(DIR + fasta)
    assert seqcount >= 4, "Less than four sequences in " + DIR + fasta
    print fasta, seqcount, "sequences"
    alignment = mafft(DIR, fasta, num_cores, seqtype)
    cleaned = phyutility(DIR, alignment, 0.2, seqtype)
    if len(read_fasta_file(DIR + cleaned)) >= 4:
        tree = raxml_bs(DIR, cleaned, num_cores, seqtype)
    else:
        print "Less than 4 taxa in", cleaned
Code example #14
File: mafft_wrapper.py  Project: NatJWalker-Hale/DODA
def mafft(DIR, fasta, thread, seqtype):
    if DIR[-1] != "/": DIR += "/"
    alignment = fasta + ".mafft.aln"
    if os.path.exists(DIR + alignment) and os.stat(DIR + alignment).st_size > 0:
        return alignment
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    seqlist = read_fasta_file(DIR + fasta)
    seqcount = len(seqlist)
    maxlen = 0
    for s in seqlist:
        maxlen = max(maxlen, len(s.seq))
    assert seqcount >= 4, "less than 4 sequences in " + DIR + fasta

    if seqtype == "dna":
        infasta = DIR + fasta
        seq = "--nuc"
    else:
        infasta = DIR + fasta + ".temp"
        seq = "--amino"
        with open(infasta, "w") as outfile:
            for s in seqlist:
                #replace U, which is usually not in the aa alphabet, with X
                s.seq = s.seq.replace("U", "X")
                s.seq = s.seq.replace("u", "x")
                #remove stop codon and seq after it
                if "*" in s.seq:
                    s.seq = s.seq[:s.seq.find("*")]
                outfile.write(s.get_fasta())

    if seqcount >= 2000 or maxlen >= 10000:
        alg = ["--auto"]  #so that the run actually finishes!
    else:
        alg = ["--genafpair", "--maxiterate", "1000"]

    cmd = ["mafft"] + alg + [seq, "--thread", str(thread)]
    #com += ["--anysymbol"] # when there are "U"s in aa sequences
    cmd += [infasta]
    print " ".join(cmd)
    out = open(DIR + alignment, 'w')
    p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=out)
    out.close()
    p.communicate()
    assert p.returncode == 0, "Error mafft"
    if seqtype == "aa": os.remove(DIR + fasta + ".temp")
    return alignment
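
To illustrate the peptide preprocessing above on a hypothetical sequence: U (selenocysteine) is masked to X, and everything from the first stop codon onward is dropped before alignment.

s = "MUKT*AA"
s = s.replace("U", "X").replace("u", "x")
if "*" in s:
    s = s[:s.find("*")]
print s  # -> MXKT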
Code example #15
def mcl_to_fasta(all_fasta,mcl_outfile,minimal_taxa,outdir):
    #print "Reading mcl output file"
    clusterDICT = {} #key is seqID, value is clusterID
    minimal_taxa = int(minimal_taxa)
    if outdir[-1] != "/": outdir += "/"
    count = 0
    with open(mcl_outfile,"r") as infile:
        for line in infile:
            if len(line) < 3: continue #ignore empty lines
            spls = line.strip().split('\t')
            if len(set(get_name(i) for i in spls)) >= minimal_taxa:
                count += 1
                for seqID in spls:
                    clusterDICT[seqID] = str(count)
                    
    #print "Reading the fasta file",all_fasta
    for s in read_fasta_file(all_fasta):
        try:
            clusterID = clusterDICT[s.name]
            with open(outdir+"cluster"+clusterID+".fa","a") as outfile:
                outfile.write(">"+s.name+"\n"+s.seq+"\n")
        except KeyError: pass # seqs that did not go in a cluster with enough taxa
Code example #16
def blastp(query_fasta, DIR, num_cores, max_num_hits=20, min_bitscore=20.0):
    """
	same as swipe but using blastp
	"""
    if DIR[-1] != "/": DIR += "/"
    max_num_hits = int(max_num_hits)
    min_bitscore = float(min_bitscore)

    # blastp with each taxon
    pepfiles = [
        i for i in os.listdir(DIR)
        if (i.endswith(".pep.fa") or i.endswith(".cdhit"))
    ]
    datasets = [i.split(".")[0] for i in pepfiles]
    print len(pepfiles), "input peptide files read"
    assert len(set(datasets)) == len(datasets),\
     "dataset name repeats. remove duplicated sets"
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            if not os.path.exists(DIR + i + ".psd"):
                os.system("makeblastdb -in " + DIR + i +
                          " -parse_seqids -dbtype prot -out " + DIR + i)
            blastp_outname = DIR + i + "." + get_filename_from_path(
                query_fasta).split(".")[0] + ".blastp"
            if not os.path.exists(blastp_outname):
                cmd = "blastp -db " + DIR + i
                cmd += " -query " + query_fasta
                cmd += " -num_threads " + str(num_cores)
                cmd += " -out " + blastp_outname
                cmd += " -evalue 10"
                cmd += " -max_target_seqs " + str(max_num_hits)
                cmd += " -outfmt '6 qseqid qlen sseqid slen frames pident nident length mismatch gapopen qstart qend sstart send evalue bitscore'"
                print cmd
                os.system(cmd)
            assert os.path.exists(blastp_outname), \
             "blastp did not finish correctly"
            """
			blastp output colums are:
			0-qseqid 2-sseqid 15-bitscore'"
			"""
            # summarize the hit seq ids
            if not os.path.exists(blastp_outname + ".hits"):
                hit_tuples = []  # a list of tuples (hit, bitscore)
                with open(blastp_outname, "r") as infile:
                    for line in infile:
                        if len(line) < 3: continue  # skip empty lines
                        spls = line.strip().split("\t")
                        query, hit, bitscore = spls[0], spls[2], float(
                            spls[-1])
                        if query != hit and bitscore >= min_bitscore:
                            hit_tuples.append((hit, bitscore))

                out = []  # unique hit ids
                for hit, bitscore in sorted(hit_tuples,
                                            key=lambda x: x[1],
                                            reverse=True):
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits:
                        break
                if len(out) == 0: print "Warning: No hits found"
                with open(blastp_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write output fasta
    outname = query_fasta.replace(".pep.fa", "_blastp.fa")
    print "Writing output fasta", outname
    outfile = open(outname, "w")
    query_seqids = []  # avoid seq id repeats
    with open(query_fasta, "r") as infile:
        for line in infile:
            outfile.write(line)  # copy over query seqs
            if line[0] == ">":
                query_seqids.append(line.strip()[1:])
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            seqDICT = {}  # key is seq name, value is seq
            for s in seq.read_fasta_file(DIR + i):
                seqDICT[s.name] = s.seq
            with open(
                    DIR + i + "." +
                    get_filename_from_path(query_fasta).split(".")[0] +
                    ".blastp.hits", "r") as infile:
                for line in infile:
                    line = line.strip()
                    if len(line) > 0 and line not in query_seqids:
                        outfile.write(">" + line + "\n" + seqDICT[line] + "\n")
    outfile.close()
Code example #17
        required=False,
        default=False)
    parser.add_argument("-t",
                        "--threads",
                        help="how many threads for iqtree?",
                        required=False,
                        type=int,
                        default=2)
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    args = parser.parse_args()

    seqfile = args.seqfile
    print("running " + seqfile, file=sys.stderr)
    seqs = seq.read_fasta_file(seqfile)
    sw = args.sw  #sliding window
    step = args.increment  #steps
    numthreads = args.threads
    a_len = len(seqs[0].seq)
    if a_len < sw:
        print("sequence is less than " + str(args.sw), file=sys.stderr)
        sys.exit(0)
    if a_len < sw + step:
        print("wouldn't make more than one segment: " + str(a_len) + "<" +
              str(sw + step),
              file=sys.stderr)
        sys.exit(0)
    x, segc = make_trees(seqs, seqfile, a_len, sw, step, numthreads)
    y = run_iqtree(seqfile, numthreads)
Code example #18
def swipe(query_fasta,
          pepdir,
          outdir,
          num_cores,
          max_num_hits=10,
          min_bitscore=30.0):
    """
	get the initial fasta files
	swipe query_fasta against each data set in DIR with peptide fasta files that
	either end with .pep.fa or.cdhit, swipe on each one
	write output in outdir
	take the top num_to_bait hits ranked by bitscore
	evalue set to 10
	ignore all hits with bitscore less than 0.1 of the highest hit from the query
	"""
    if pepdir[-1] != "/": pepdir += "/"
    if outdir[-1] != "/": outdir += "/"
    num_cores = str(num_cores)
    max_num_hits = int(max_num_hits)
    gene_name = get_filename_from_path(query_fasta).split(".")[0]

    # swipe with each taxon
    pepfiles = [
        i for i in os.listdir(pepdir)
        if (i.endswith(".pep.fa") or i.endswith(".cdhit"))
    ]
    datasets = [i.split(".")[0] for i in pepfiles]
    print len(pepfiles), "input peptide files read"
    assert len(set(datasets)) == len(datasets),\
     "dataset name repeats. remove duplicated sets"
    temp_files = []  # will remove these later
    for i in os.listdir(pepdir):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            if not os.path.exists(pepdir + i + ".psd"):
                os.system("makeblastdb -in " + pepdir + i +
                          " -parse_seqids -dbtype prot -out " + pepdir + i)
            swipe_outname = pepdir + i + "." + get_filename_from_path(
                query_fasta).split(".")[0] + ".swipe"
            temp_files.append(swipe_outname)
            temp_files.append(swipe_outname + ".hits")
            if not os.path.exists(swipe_outname):
                cmd = "swipe -d " + pepdir + i + " -i " + query_fasta + " -a " + num_cores
                cmd += " -p blastp -o " + swipe_outname + " -m 8 -e 10"
                print cmd
                os.system(cmd)
            assert os.path.exists(swipe_outname), \
             "swipe did not finish correctly"
            """
			swipe output colums are:
			Query id, Subject id, % identity, alignment length, mismatches, 
			gap openings, q. start, q. end, s. start, s. end, e-value, bit score
			"""
            # summarize the hit seq ids
            if not os.path.exists(swipe_outname + ".hits"):
                hit_tuples = []  # a list of tuples (hit, bitscore)
                with open(swipe_outname, "r") as infile:
                    for line in infile:
                        if len(line) < 3: continue  # skip empty lines
                        spls = line.strip().split("\t")
                        query, hit, bitscore = spls[0], spls[1].replace(
                            "lcl|", ""), float(spls[-1])
                        if query != hit and bitscore >= min_bitscore:
                            hit_tuples.append((hit, bitscore))

                out = []  # unique hit ids
                highest = 0.0  # store the highest bitscore
                for hit, bitscore in sorted(hit_tuples,
                                            key=lambda x: x[1],
                                            reverse=True):
                    if highest == 0.0:
                        highest = bitscore  # record the first hit
                    if bitscore < 0.05 * highest: break  # stop recording
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits: break
                if len(out) == 0: print "Warning: No hits found"
                with open(swipe_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write output summary files .rawswipe, .filetered_hits, .swipe.fa
    print "Writing output fasta"
    outfile1 = open(outdir + gene_name + ".rawswipe", "w")
    outfile2 = open(outdir + gene_name + ".filetered_hits", "w")
    outfile3 = open(outdir + gene_name + ".swipe.fa", "w")
    query_seqids = []  # avoid seq id repeats
    with open(query_fasta, "r") as infile:
        for line in infile:
            outfile3.write(line)  # copy over query seqs
            if line[0] == ">":
                query_seqids.append(line.strip()[1:])
    for i in os.listdir(pepdir):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            # write the gene_name.rawswipe file to outdir
            with open(
                    pepdir + i + "." +
                    get_filename_from_path(query_fasta).split(".")[0] +
                    ".swipe", "r") as infile:
                for line in infile:
                    outfile1.write(line)
            seqDICT = {
            }  # a dict for each taxon, key is seq name, value is seq
            for s in seq.read_fasta_file(pepdir + i):
                seqDICT[s.name] = s.seq
            with open(
                    pepdir + i + "." +
                    get_filename_from_path(query_fasta).split(".")[0] +
                    ".swipe.hits", "r") as infile:
                for line in infile:
                    line = line.strip()
                    if len(line) == 0: continue
                    outfile2.write(line + "\n")
                    if line not in query_seqids:
                        outfile3.write(">" + line + "\n" + seqDICT[line] +
                                       "\n")
    outfile1.close()
    outfile2.close()
    outfile3.close()

    # remove intermediate files
    for i in temp_files:
        os.remove(i)
Code example #19
    curfas = sys.argv[1] + "/notinchildren.fas"
    if not os.path.isfile(curfas):
        sys.exit(0)
    outclu = sys.argv[2]
    LOGFILE = sys.argv[3]
    log = Logger(LOGFILE)
    log.a()

    tempdir = "./"
    if len(sys.argv) == 5:
        tempdir = sys.argv[4]
        if tempdir[-1] != "/":
            tempdir += "/"

    slen = 0
    seqs = seq.read_fasta_file(curfas)
    if len(seqs) > 0:
        seqd = {}
        for i in seqs:
            seqd[i.name] = i
        log.w("UNCLUSTERED SEQS IN " + sys.argv[1])
        #make blastdb of the cluster dir
        # see if there are clusters yet, if so
        numfiles = 0
        for i in os.listdir(outclu):
            numfiles += 1
            break
        if numfiles > 0:
            make_blast_db_from_cluster(outclu, tempdir)
            blast_file_against_db(sys.argv[1], "notinchildren.fas", tempdir)
            dclus, clus = filter_blast.process_blast_out(tempdir + tempname +
Code example #20
def combine_seqs(seq1,seq2):
    finalseq = ""
    for i,j in zip(seq1,seq2):
        if i == "-":
            finalseq += j
        else:
            finalseq += i
    return finalseq
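
A quick check of combine_seqs with hypothetical aligned fragments: positions that are gaps in seq1 are filled from seq2, and non-gap characters in seq1 always win.

# combine_seqs("AT-G-", "ATCGC")  -> "ATCGC"
# combine_seqs("ATAG-", "ATCGC")  -> "ATAGC"  (seq1's A wins at position 3)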

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "python "+sys.argv[0]+" infile.aln outfile.aln"
        sys.exit()

    seqs = seq.read_fasta_file(sys.argv[1])
    seqs_sample = {}
    seqs_d = {}
    keep_seqs = {}
    for i in seqs:
        seqs_d[i.name] = i
        keep_seqs[i.name] = i
        spls = i.name.split("@")[0]
        try:
            seqs_sample[spls].append(i)
        except KeyError:
            seqs_sample[spls] = [i]

        
Code example #21
def concatenate(clnDIR,numofsitesFilter,numoftaxaFilter,outname):
	"""filter cleaned alignments and concatenate"""
	if clnDIR[-1] != "/": clnDIR += "/"
	sites_filter = int(numofsitesFilter)
	taxa_filter = int(numoftaxaFilter)
	
	print "Filtering ortholog matrixes"
	selected = [] # list of alignment file names that pass the filters
	for i in os.listdir(clnDIR):
		if i.endswith(".aln-cln"):
			seqlist = read_fasta_file(clnDIR+i)
			num_seq = len(seqlist)
			num_col = len(seqlist[0].seq)
			if num_seq >= taxa_filter and num_col >= sites_filter:
				selected.append(i)
	print len(selected),"matrices passed the filter"

	print "Getting matrix occupancy stats"
	taxon_occupancy = {}
	#key is taxon name, value is [times present in a matrix,total length for this taxon]
	total_aligned_len = 0 #record how long the final concatenated matrix is
	
	cmd = "pxcat"+" -o "+outname+".fa"+" -p "+outname+".model"+" -s "
	for i in selected:
		cmd += clnDIR+i+" "
		seqlist = read_fasta_file(clnDIR+i)
		total_aligned_len += len(seqlist[0].seq)
		for s in seqlist:
			taxonid = get_name(s.name)
			if taxonid not in taxon_occupancy:
				taxon_occupancy[taxonid] = [0,0]
			taxon_occupancy[taxonid][0] += 1
			taxon_occupancy[taxonid][1] += len((s.seq.replace("-","")).replace("?",""))
	cmd += "\n"
	
	total_ortho = len(selected)
	with open(outname+"_taxon_occupancy_stats","w") as outfile:
		outfile.write("taxon\t#orthologs\t#total_charactors\tperc_orthologs\tperc_charactors\n")
		sum_char = 0
		for taxon in taxon_occupancy:
			times,chars = taxon_occupancy[taxon][0],taxon_occupancy[taxon][1]
			sum_char += chars
			out = taxon+"\t"+str(times)+"\t"+str(chars)+"\t"
			out += str(times/float(total_ortho))+"\t"+str(chars/float(total_aligned_len))+"\n"
			outfile.write(out)
		total_taxa = len(taxon_occupancy)
		out = "\nSupermatrix dimension "+str(total_taxa)+" taxa, "
		out += str(total_ortho)+" loci and "+str(total_aligned_len)+" aligned columns\n"
		out += "Overall matrix occupancy "+str(sum_char/float(total_taxa*total_aligned_len))+"\n"
		outfile.write(out)

	print "Supermatrix taxon occupancy stats written to",outname+"_taxon_occupancy_stats"
	print "Waiting for concatenation to finish. This may take several minutes..."
	with open(outname+".temp.sh","w") as f: f.write(cmd)
	os.system("bash "+outname+".temp.sh")	
	
	#writes phy file
	cmd_pxs2phy = ["pxs2phy","-o",outname+".phy","-s",outname+".fa"]
	print (" ".join(cmd_pxs2phy))
	os.system(" ".join(cmd_pxs2phy))
	
	#writes nex file
	cmd_pxs2nex = ["pxs2nex","-o",outname+".nex","-s",outname+".fa"]
	print (" ".join(cmd_pxs2nex))
	os.system(" ".join(cmd_pxs2nex))

	
	assert os.path.exists(outname+".phy") and os.path.exists(outname+".nex") and os.path.exists(outname+".fa"),  "error concatenate"
	os.system("rm "+outname+".temp.sh")
	print "outfiles written",outname+".phy",outname+".model"
Code example #22
def swipe(query_fasta, DIR, num_cores, max_num_hits=20, min_bitscore=20.0):
    """
	given a DIR with peptide fasta files that either end with .pep.fa or 
	.cdhit, swipe on each one
	return a fasta file with hits from each taxon plus the queries added
	"""
    if DIR[-1] != "/": DIR += "/"
    max_num_hits = int(max_num_hits)
    min_bitscore = float(min_bitscore)

    # swipe with each taxon
    pepfiles = [
        i for i in os.listdir(DIR)
        if (i.endswith(".pep.fa") or i.endswith(".cdhit"))
    ]
    datasets = [i.split(".")[0] for i in pepfiles]
    print len(pepfiles), "input peptide files read"
    assert len(set(datasets)) == len(datasets),\
     "dataset name repeats. remove duplicated sets"
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            if not os.path.exists(DIR + i + ".psd"):
                os.system("makeblastdb -in " + DIR + i +
                          " -parse_seqids -dbtype prot -out " + DIR + i)
            swipe_outname = DIR + i + "." + get_filename_from_path(
                query_fasta).split(".")[0] + ".swipe"
            if not os.path.exists(swipe_outname):
                cmd = "swipe -d " + DIR + i + " -i " + query_fasta + " -a " + str(
                    num_cores)
                cmd += " -p blastp -o " + swipe_outname + " -m 8 -e 10"
                print cmd
                os.system(cmd)
            assert os.path.exists(swipe_outname), \
             "swipe did not finish correctly"
            """
			swipe output colums are:
			Query id, Subject id, % identity, alignment length, mismatches, 
			gap openings, q. start, q. end, s. start, s. end, e-value, bit score
			"""
            # summarize the hit seq ids
            if not os.path.exists(swipe_outname + ".hits"):
                hit_tuples = []  # a list of tuples (hit, bitscore)
                with open(swipe_outname, "r") as infile:
                    for line in infile:
                        if len(line) < 3: continue  # skip empty lines
                        spls = line.strip().split("\t")
                        query, hit, bitscore = spls[0], spls[1].replace(
                            "lcl|", ""), float(spls[-1])
                        if query != hit and bitscore >= min_bitscore:
                            hit_tuples.append((hit, bitscore))

                out = []  # unique hit ids
                for hit, bitscore in sorted(hit_tuples,
                                            key=lambda x: x[1],
                                            reverse=True):
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits:
                        break
                if len(out) == 0: print "Warning: No hits found"
                with open(swipe_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write output fasta
    outname = query_fasta.replace(".pep.fa", "_swipe.fa")
    print "Writing output fasta", outname
    outfile = open(outname, "w")
    query_seqids = []  # avoid seq id repeats
    with open(query_fasta, "r") as infile:
        for line in infile:
            outfile.write(line)  # copy over query seqs
            if line[0] == ">":
                query_seqids.append(line.strip()[1:])
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            seqDICT = {}  # key is seq name, value is seq
            for s in seq.read_fasta_file(DIR + i):
                seqDICT[s.name] = s.seq
            with open(
                    DIR + i + "." +
                    get_filename_from_path(query_fasta).split(".")[0] +
                    ".swipe.hits", "r") as infile:
                for line in infile:
                    line = line.strip()
                    if len(line) > 0 and line not in query_seqids:
                        outfile.write(">" + line + "\n" + seqDICT[line] + "\n")
    outfile.close()
Code example #23
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits: break
                if len(out) == 0: print "Warning: No hits found"
                with open(swipe_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

        # write output summary files .rawswipe, .filetered_hits, .swipe.fa
        print "Writing output fasta"
        outfile1 = open(outdir + gene_name + ".rawswipe", "w")
        outfile2 = open(outdir + gene_name + ".filetered_hits", "w")
        outfile3 = open(outdir + gene_name + ".swipe.fa", "w")
        seqids_written = []  # avoid seq id repeats
        for s in read_fasta_file(query_fasta):
            seqid, seq = str(s.name), str(s.seq)
            if seqid not in seqids_written:
                outfile3.write(">" + seqid + "\n" + seq + "\n")
                seqids_written.append(seqid)
        for s in read_fasta_file(existing_homolog):
            seqid, seq = str(s.name), str(s.seq)
            if seqid not in seqids_written:
                outfile3.write(">" + seqid + "\n" + seq + "\n")
                seqids_written.append(seqid)
        for i in pepfiles:
            # write the gene_name.rawswipe file to outdir
            with open(pepdir + i + "." + gene_name + ".swipe", "r") as infile:
                for line in infile:
                    outfile1.write(line)
            seqDICT = {
Code example #24
def concatenate(clnDIR, numofsitesFilter, numoftaxaFilter, seqtype, outname):
    """filter cleaned alignments and concatenate"""
    if clnDIR[-1] != "/": clnDIR += "/"
    sites_filter = int(numofsitesFilter)
    taxa_filter = int(numoftaxaFilter)
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    model = "AUTO" if seqtype == "aa" else "DNA"

    print "Filtering ortholog matrixes"
    selected = []  # list of alignment file names that pass the filters
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            seqlist = read_fasta_file(clnDIR + i)
            num_seq = len(seqlist)
            num_col = len(seqlist[0].seq)
            if num_seq >= taxa_filter and num_col >= sites_filter:
                selected.append(i)
    print len(selected), "matrices passed the filter"

    print "Getting matrix occupancy stats"
    taxon_occupancy = {}
    #key is taxon name, value is [times present in a matrix,total length for this taxon]
    total_aligned_len = 0  #record how long the final concatenated matrix is
    if seqtype == "aa":
        cmd = "phyutility -concat -aa -out " + outname + ".nex -in "
    else:
        cmd = "phyutility -concat -out " + outname + ".nex -in "
    for i in selected:
        cmd += clnDIR + i + " "
        seqlist = read_fasta_file(clnDIR + i)
        total_aligned_len += len(seqlist[0].seq)
        for s in seqlist:
            taxonid = get_name(s.name)
            if taxonid not in taxon_occupancy:
                taxon_occupancy[taxonid] = [0, 0]
            taxon_occupancy[taxonid][0] += 1
            taxon_occupancy[taxonid][1] += len(
                (s.seq.replace("-", "")).replace("?", ""))
    cmd += "\n"

    total_ortho = len(selected)
    with open(outname + "_taxon_occupancy_stats", "w") as outfile:
        outfile.write(
            "taxon\t#orthologs\t#total_characters\tperc_orthologs\tperc_characters\n"
        )
        sum_char = 0
        for taxon in taxon_occupancy:
            times, chars = taxon_occupancy[taxon][0], taxon_occupancy[taxon][1]
            sum_char += chars
            out = taxon + "\t" + str(times) + "\t" + str(chars) + "\t"
            out += str(times / float(total_ortho)) + "\t" + str(
                chars / float(total_aligned_len)) + "\n"
            outfile.write(out)
        total_taxa = len(taxon_occupancy)
        out = "\nSupermatrix dimension " + str(total_taxa) + " taxa, "
        out += str(total_ortho) + " loci and " + str(
            total_aligned_len) + " aligned columns\n"
        out += "Overall matrix occupancy " + str(
            sum_char / float(total_taxa * total_aligned_len)) + "\n"
        outfile.write(out)

    print "Supermatrix taxon occupancy stats written to", outname + "_taxon_occupancy_stats"
    print "Waiting for concatenation to finish. This may take several minutes..."
    with open(outname + ".temp.sh", "w") as f:
        f.write(cmd)
    os.system(cmd)

    #convert the .nex file to .fasta and .model files for raxml
    infile = open(outname + ".nex", "r")
    outfile = open(outname + ".phy", "w")
    for line in infile:
        line = line.strip()
        if len(line) < 10: continue
        if line[0] == "#" or line[:
                                  5] == "BEGIN" or line[:
                                                        6] == "MATRIX" or line == "END;" or line[:
                                                                                                 6] == "FORMAT":
            continue
        if line[0] == "[":
            line = line.replace("[", model + ",")
            line = line.replace(" ]", "")
            line = line.replace(" cluster", "\n" + model + ",cluster")
            #line = line.replace(" homolog","\n"+model+",homolog")
            line = line.replace(" ", "=")
            with open(outname + ".model", "w") as outfile2:
                outfile2.write(line.strip() + "\n")
                #make sure that wc -l will get how many partitions
        elif line[:10] == "DIMENSIONS":
            ntax = (line.split("NTAX=")[1]).split(" ")[0]
            nchar = (line.split("NCHAR=")[1]).replace(";", "")
            outfile.write(ntax + " " + nchar + "\n")
        else:
            spls = line.split("\t")
            outfile.write(spls[0] + " " + spls[1] + "\n")
    infile.close()
    outfile.close()
    assert os.path.exists(outname + ".phy"), "error concatenate"
    os.system("rm " + outname + ".temp.sh")
    print "outfiles written", outname + ".phy", outname + ".model"
    os.system("rm " + outname + ".nex")  #remove intermediate .nex file