def main():
    """Bootstrap the input alignments and write one file per replicate.

    Reads the command-line options via ``interface()``, loads the alignments
    with ``make_align_list`` and resamples them with ``phylo.bootstrap``.
    For every replicate, the columns of each resampled alignment are
    resampled once more and the result is appended to
    ``<output_dir>/bootrep_<n>.<ext>`` in ``options.output_file_format``.

    Raises:
        ValueError: if ``options.output_file_format`` is neither ``'nexus'``
            nor ``'phylip'`` (previously this surfaced as an unbound
            ``fname`` inside the loop).
    """
    options = interface()
    array_alignments = make_align_list(options.input_dir, options.input_file_format)

    # Resolve the output extension once and fail fast on unsupported formats.
    extensions = {'nexus': 'nex', 'phylip': 'phylip'}
    if options.output_file_format not in extensions:
        raise ValueError(
            "unsupported output format %r (expected 'nexus' or 'phylip')"
            % (options.output_file_format,))
    extension = extensions[options.output_file_format]

    # DO BOOTSTRAPPING AND WRITE REPLICATES TO OUTFILES
    boot_alignments = phylo.bootstrap(array_alignments, options.bootstrap_reps)
    for rep_count, boot_rep in enumerate(boot_alignments):
        fname = 'bootrep_%s.%s' % (rep_count, extension)
        final_path = os.path.join(options.output_dir, fname)

        # Append mode is kept from the original: an existing replicate file
        # is extended, not truncated.  ``with`` closes it even on errors.
        with open(final_path, 'a') as fout:
            for align in boot_rep:
                seqs = copy(align[0])                        # copies keep the source arrays untouched
                ids = copy(align[1])
                bases_by_col = np.column_stack(seqs)         # flip rows and columns
                bs_bases = phylo.bootstrap(bases_by_col, 1)  # bootstrap the bases within the replicate
                # [0] unwraps the single replicate returned by phylo.bootstrap;
                # column_stack flips the columns back into rows.
                bs_bases = np.column_stack(bs_bases[0]).copy()
                biopy_align = [strarray2biopy([bs_bases, ids])]
                AlignIO.write(biopy_align, fout, options.output_file_format)
                fout.write('\n')
Beispiel #2
0
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Convert alignments by piping them through EMBOSS seqret.

    The alignments are written to seqret's stdin in ``old_format`` and its
    stdout is parsed as ``new_format``.

    Returns:
        A list of the parsed alignments.  The output is consumed eagerly (not
        returned as a generator) so that the pipe can be closed before
        returning.

    Any exception raised while writing to or parsing from the child process
    is propagated after all pipes have been closed.
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    cline = SeqretCommandline(exes["seqret"],
                              sformat=old_format,
                              osformat=new_format,
                              auto=True,  # no prompting
                              filter=True)
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        try:
            AlignIO.write(alignments, child.stdin, old_format)
        finally:
            # Closing stdin signals end-of-input to seqret; stderr is not
            # inspected, so it is closed right away as before.
            child.stdin.close()
            child.stderr.close()
        return list(AlignIO.parse(child.stdout, new_format))
    finally:
        child.stdout.close()
        # Reap the child so no zombie process is left behind.
        child.wait()
def taxit_create(taxit_executable_loc,
                 aln_fasta,
                 hmm_file,
                 tree_file,
                 tree_stats,
                 pfam_acc,
                 output_location,
                 aln_stockholm):
    '''
    Calls ``taxit create`` and converts the FASTA alignment to Stockholm.

    Equivalent command line (example):
        taxit create --clobber --aln-fasta ./PF14424.dedup.fasta
            --profile ./PF14424.wholefam.hmm --tree-file ./PF14424.dedup.nh
            --locus PF14424 --package-name PF14424.pplacer

    After taxit succeeds, ``aln_fasta`` is re-written in Stockholm format to
    ``aln_stockholm``.

    Raises:
        subprocess.CalledProcessError: if taxit exits with a non-zero status.
    '''
    import shlex

    # Pass an argument list instead of a shell string so that paths with
    # spaces or shell metacharacters are handed to taxit verbatim.  The
    # executable itself is split shell-style so values such as
    # "python /path/to/taxit" keep working.
    cmd = shlex.split(taxit_executable_loc) + [
        "create", "--clobber",
        "--aln-fasta", aln_fasta,
        "--profile", hmm_file,
        "--tree-file", tree_file,
        "--tree-stats", tree_stats,
        "--locus", pfam_acc,
        "--package-name", output_location,
    ]
    subprocess.check_call(cmd)

    # AlignIO.convert opens and closes both files itself.
    AlignIO.convert(aln_fasta, "fasta", aln_stockholm, "stockholm")
Beispiel #4
0
    def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5):
        """Infer a tree for ``self.aln`` with FastTree, optionally refined by RAxML.

        All work happens inside ``self.run_dir``, which is created with
        ``make_dir`` and removed with ``remove_dir`` at the end.  The final
        newick file is handed to ``self.tt_from_file(out_fname, root)``.

        Parameters:
            root: passed through unchanged to ``self.tt_from_file``.
            raxml: when true, run RAxML branch-length optimisation on the
                tree (preceded by a time-limited topology search if
                ``raxml_time_limit`` > 0); otherwise the FastTree result is
                used as is.
            raxml_time_limit: wall-clock limit, in hours, for the RAxML
                topology search; 0 or less skips that search.

        Side effects: sets ``seq.name = seq.id`` on every record of
        ``self.aln`` and sets ``self.is_timetree`` to False.
        """
        from Bio import Phylo, AlignIO
        import subprocess, glob, shutil
        # Remember where we started so we return there even if run_dir is
        # nested or absolute (a plain chdir('..') would not).
        start_dir = os.getcwd()
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        for seq in self.aln: seq.name=seq.id
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        tree_cmd = ["fasttree"]
        if self.nuc: tree_cmd.append("-nt")
        tree_cmd.extend(["temp.fasta", ">", "initial_tree.newick"])
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"
        if raxml:
            # Both RAxML invocations below read temp.phyx, so it has to exist
            # even when the time-limited topology search is skipped.
            AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
            if raxml_time_limit>0:
                tmp_tree = Phylo.read('initial_tree.newick','newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                # RAxML is started from a bifurcating tree; retry a bounded
                # number of times in case one pass leaves polytomies.
                while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                    resolve_iter+=1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
                print( "RAxML tree optimization with time limit", raxml_time_limit,  "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit*3600)
                process = subprocess.Popen("exec raxml -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
                while (time.time() < end_time):
                    if os.path.isfile('RAxML_result.topology'):
                        break
                    time.sleep(10)
                process.terminate()

                # glob returns files in arbitrary order; sort by modification
                # time so the last entry really is the newest checkpoint.
                checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"),
                                          key=os.path.getmtime)
                if os.path.isfile('RAxML_result.topology'):
                    checkpoint_files.append('RAxML_result.topology')
                if len(checkpoint_files) > 0:
                    last_tree_file = checkpoint_files[-1]
                    shutil.copy(last_tree_file, 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            try:
                print("RAxML branch length optimization")
                os.system("raxml -f e -T " + str(self.nthreads) + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
                shutil.copy('RAxML_result.branches', out_fname)
            except EnvironmentError:
                # os.system does not raise; the copy fails with an
                # IOError/OSError/shutil.Error when RAxML produced no result.
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)
        else:
            shutil.copy('initial_tree.newick', out_fname)
        self.tt_from_file(out_fname, root)
        os.chdir(start_dir)
        remove_dir(self.run_dir)
        self.is_timetree=False
Beispiel #5
0
def AlignClean(file, out):
    """Cut runs of ``N`` out of a FASTA alignment and write the remainder.

    Reads the alignment in ``file``, records the start/stop of every run of
    ``N`` found in any record (after ``re.sub('N[-]*N', repl, ...)`` has
    filled gaps lying between Ns), merges the intervals with
    ``combinelists`` and writes the columns between those intervals to
    ``out`` in FASTA format.

    Side effects: sets the module globals ``total_len`` (length of the last
    record read) and ``countN`` (``len`` of the merged interval list).
    """
    global countN, total_len
    #search pattern
    match = re.compile(r'(N)\1*')
    #create list to append start/stop to
    Ns = []
    with open(file, 'rU') as handle:
        alignment = AlignIO.read(handle, 'fasta')
    for rec in alignment:
        total_len = len(rec.seq)
        string = str(rec.seq).upper()
        filled = re.sub('N[-]*N', repl, string) #replace gaps between N's with N's for the next regex step
        for m in match.finditer(filled):
            Ns.append( [m.start(),m.end()] )
    Ns.sort(key=lambda x: x[0])
    #now run the combinelist function as many times as necessary
    run1 = combinelists(Ns)
    flat = flatten(run1)
    flat.insert(0,0)
    flat.append(total_len)

    # Pair consecutive boundaries: (0, start1), (stop1, start2), ... and
    # concatenate the corresponding column slices left to right.  This
    # replaces building a Python expression string and eval()-ing it.
    edited = None
    for start, stop in zip(*[iter(flat)] * 2):
        piece = alignment[:, start:stop]
        edited = piece if edited is None else edited + piece

    with open(out, 'w') as outhandle:
        AlignIO.write(edited, outhandle, 'fasta')
    countN = len(run1)
def main():
    """Rewrite nexus alignments with abbreviated sequence names.

    Collects every sequence name found in the input nexus files, obtains a
    name mapping from ``abbreviator`` and writes each input file to
    ``args.output`` (same base name) with the mapped names.
    """
    args = get_args()
    files = get_files(args.nexus)
    # First pass: gather every sequence name used in any input alignment.
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            old_names.update(seq.name for seq in align)
    name_map = abbreviator(old_names)
    # Second pass: rebuild each file's alignment under the new names.
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # ``with`` makes sure the output is flushed and closed.
            with open(outf, 'w') as outfile:
                AlignIO.write(new_align, outfile, 'nexus')
        except ValueError:
            # NOTE(review): drops into the debugger on a write error, kept
            # from the original; consider reporting the file and re-raising.
            pdb.set_trace()
        print(count)
Beispiel #7
0
def mcmc(alignment, num_imp, dem_ratios, directory, length, burnin):
	acceptances = 0
	# Build first state of Markov chain
	print 'Imputing first alignment...'
	current = impute.imp_align(num_imp, alignment, dem_ratios)
	current.loglik = loglik(current)
	print '\t Log likelihood %2f' % current.loglik
	if not burnin: AlignIO.write(current, '%s/%d.fasta' % (directory,0), 'fasta')
	# Run chain
	for i in xrange(1,length+1):
		proposal = impute.imp_align(1, minoneimp(current, num_imp), dem_ratios)
		proposal.loglik = loglik(proposal)
		p = proposal.loglik-current.loglik
		print 'Current LLH: %2f; Proposed LLH: %2f' % (current.loglik, proposal.loglik)
		print '\tAcceptance probability %e' % math.exp(p)
		if p>0:
			current = proposal
			acceptances += 1
			print '\tAccepted'
		elif random.random()<math.exp(p):
			current = proposal
			acceptances += 1
			print '\tAccepted'
		else: print '\tNot accepted'
		if i > burnin:
			AlignIO.write(current, '%s/%d.fasta' % (directory,i-burnin), 'fasta')
	return float(acceptances)/length
Beispiel #8
0
def mcmc_ks(alignment, num_imp, dem_ratios, directory, length, burnin):
	acceptances = 0
	d = transprobs(TRANSITIONS, MARGINAL)
	pd = pdn(alignment)
	mins = np.array([sorted(i)[1] for i in pd])
	# Build first state of Markov chain
	print 'Imputing first alignment...'
	start = impute.imp_align(num_imp, alignment, dem_ratios)
	current = deepcopy(start)
	current.loglik = loglik(current)+math.log(dist_ks(current, num_imp, mins, 1000))
	print '\t Log likelihood %2f' % current.loglik
	if not burnin: AlignIO.write(current, '%s/%d.fasta' % (directory,0), 'fasta')
	# Run chain
	for i in xrange(1,length+1):
		proposal = propose(current,num_imp,max(norm(loc=2,scale=1).rvs(),1), d)
		l1 = loglik(proposal)
		l2 = math.log(dist_ks(proposal, num_imp, mins, 1000))
		proposal.loglik = l1+l2
		p = proposal.loglik-current.loglik
		print 'Current LLH: %2f; Proposed LLH: %2f' % (current.loglik, proposal.loglik)
		print '\tPhylogeny component: %2f; Distance component: %2f' % (l1, l2)
		print '\tAcceptance probability %e' % math.exp(p)
		if random.random()<math.exp(p):
			current = proposal
			acceptances += 1
			print '\tAccepted'
		else: print '\tNot accepted'
		if i > burnin:
			AlignIO.write(current, '%s/%d.fasta' % (directory,i-burnin), 'fasta')
	return float(acceptances)/length, start
Beispiel #9
0
def filter_alignment(args):
	"""Keep only the biallelic columns of an alignment and write the result.

	Reads ``args.input`` (format ``args.type``), walks over every column and
	keeps those that pass ``is_biallelic`` and ``filter_cv``.  For a kept
	column either the original bases (``args.skip`` true) or the output of
	``combine_cv`` is appended to each sequence.  Duplicate sequences are
	dropped via ``remove_duplicate_seqs`` and the alignment is written to
	``args.output`` in format ``args.outtype``.
	"""
	filein = args.input
	fileout = args.output
	filetype = args.type
	outtype = args.outtype
	skip = args.skip

	seqs = AlignIO.read(filein,filetype)
	# One list of column fragments per sequence; joined once at the end
	# instead of re-building every string for every kept column.
	fragments = [[] for _ in range(len(seqs))]
	for i in range(seqs.get_alignment_length()):
		baselist = list(seqs[:,i])
		if not is_biallelic(baselist): continue
		#Since we have a maximum of 1 mutation, at ambiguous sites we have a constant site and a variable site
		c, v = diploidify(baselist) #turn baselist into 2 baselists, expanding using IUPAC notation
		c, v = filter_cv(c,v,args)
		# filter_cv signals "drop this column" by returning (None, None).
		if c is None and v is None: continue
		combined = baselist if skip else combine_cv(c,v)
		for j, bases in enumerate(combined):
			fragments[j].append(bases)

	newseqobjs = [SeqRecord(Seq(''.join(fragments[l]), IUPAC.unambiguous_dna), id=seqs[l].id, description='') for l in range(len(seqs))]
	newalnobj = MultipleSeqAlignment(newseqobjs)
	newalnobj = remove_duplicate_seqs(newalnobj)
	AlignIO.write(newalnobj,fileout,outtype)
Beispiel #10
0
def biolikplot(alignment, num_imp, dem_ratios, length, threshold):
	acceptances = 0
	seq_len = len(alignment[0])
	al_len = len(alignment)
	clusters, logliks = [], []
	d = transprobs(TRANSITIONS, MARGINAL)
	
	#Get statistics for input alignment
	pd = pdn(alignment)
	mins = np.array(sorted([sorted(i)[1] for i in pd]))
	clusters.append(clustering(seq_len, mins, threshold))
	logliks.append(loglik(alignment))
	print 'Original alignment (len %dx%d) has clustering %.2f and LLH %2f' % (len(alignment), len(alignment[0]), clusters[-1], logliks[-1])
	
	#Delete some sequences so we can re-impute for xval
	alignment = MultipleSeqAlignment(random.sample(alignment,len(alignment)-num_imp))
	#Get statistics for "deletions" alignment
	pd = pdn(alignment)
	mins = np.array(sorted([sorted(i)[1] for i in pd]))
	clusters.append(clustering(seq_len, mins, threshold))
	logliks.append(loglik(alignment))
	print 'Deleted alignment (len %dx%d) has clustering %.2f and LLH %2f' % (len(alignment), len(alignment[0]), clusters[-1], logliks[-1]) 

	pssm = SummaryInfo(alignment).pos_specific_score_matrix()
	probs = 1-np.array([max(pssm[i].values()) for i in xrange(seq_len)])/al_len	#Weight site selection by empirical probability of mutation at that site
	probs /= sum(probs)
	
	# Build first state of Markov chain
	print 'Imputing first alignment...'
	current = impute.imp_align(num_imp, alignment, dem_ratios)
	current.loglik = loglik(current)
	current.distarray = np.array([list(s.seq) for s in current])
	current.pd = pdn(current)
	curmins = np.array(sorted([sorted(i)[1] for i in current.pd]))
	clusters.append(clustering(seq_len, curmins, threshold))
	logliks.append(current.loglik)
	print '\t Log likelihood %2f' % current.loglik
#	if not burnin: AlignIO.write(current, '%s/%d.fasta' % (directory,0), 'fasta')
	# Run chain
	for i in xrange(1,length):
		proposal = propmat(current,num_imp,max(norm(loc=2,scale=1).rvs(),1), d, probs)[1]
		proposal.loglik = loglik(proposal)
		for m,n in itertools.product(range(proposal.pd.shape[0]), range(proposal.pd.shape[1])):
			if (proposal.pd[m][n] < 10) and m!=n: proposal.loglik = -sys.maxint-1; print m,n, proposal.pd[m][n]
		p = proposal.loglik-current.loglik
		print 'Current LLH: %2f; Proposed LLH: %2f; Acceptance probability %e' % (current.loglik, proposal.loglik, math.exp(p))
		if random.random()<math.exp(p):
			current = proposal
			acceptances += 1
			print '\tAccepted'
		else: print '\tNot accepted'
		curmins = np.array(sorted([sorted(i)[1] for i in current.pd]))
		clusters.append(clustering(seq_len, curmins, threshold))
		logliks.append(current.loglik)
#		if i > burnin:
#			AlignIO.write(current, '%s/%d.fasta' % (directory,i-burnin), 'fasta')
	r=random.randint(0,1000000)
	print r
	AlignIO.write(current, '%d.fasta'%r, 'fasta')
	return np.vstack((logliks,clusters))
Beispiel #11
0
def main():
	"""Split every FASTA alignment in ``args.in_dir`` into two row subsets.

	For each non-hidden file the rows 0-1, 6-9 and 14-15 are written to
	``args.out_dir1 + "trans_" + <file>`` and the rows 2-5 and 10-13 to
	``args.out_dir2 + "cis_" + <file>``.  Both sub-alignments are printed.

	The working directory is changed to ``args.in_dir``, so relative output
	prefixes are resolved against the input directory.
	"""
	args = get_args()
	# Resolve the input directory before chdir so that a relative in_dir, or
	# one given without a trailing slash, still points at the right files.
	in_dir = os.path.abspath(args.in_dir)
	# Remove hidden files
	files = [name for name in os.listdir(in_dir) if not name.startswith('.')]
	os.chdir(in_dir)
	for fname in files:
		print(fname)
		alignment = AlignIO.read(os.path.join(in_dir, fname), "fasta")
		alignment1 = alignment[:2,:]
		for row in (6, 7, 8, 9, 14, 15):
			alignment1.append(alignment[row,:])
		alignment2 = alignment[2:6,:]
		for row in (10, 11, 12, 13):
			alignment2.append(alignment[row,:])
		print(alignment1)
		print(alignment2)
		AlignIO.write(alignment1, "{0}trans_{1}".format(args.out_dir1, fname), "fasta")
		AlignIO.write(alignment2, "{0}cis_{1}".format(args.out_dir2, fname), "fasta")
Beispiel #12
0
  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
      """Write the input FASTA alignment without the unwanted taxa.

      The taxa reported by ``self.taxa_missing_too_much_data()`` are always
      dropped; when ``remove_identical_sequences`` is 1 or more, the taxa
      from ``self.taxa_of_duplicate_sequences()`` are dropped as well.  The
      remaining records of ``self.input_filename`` are written to
      ``output_filename`` as one FASTA alignment.

      Exits the program via ``sys.exit`` when at most one record is left.

      Returns the list of removed taxa.
      """
      if remove_identical_sequences >= 1:
          taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
      else:
          taxa_to_remove = self.taxa_missing_too_much_data()

      with open(self.input_filename) as input_handle, open(output_filename, "w+") as output_handle:
          kept_records = [
              record
              for alignment in AlignIO.parse(input_handle, "fasta")
              for record in alignment
              if record.id not in taxa_to_remove
          ]

          if len(kept_records) <= 1:
              sys.exit("Not enough sequences are left after removing duplicates.Please check you input data.")

          AlignIO.write(MultipleSeqAlignment(kept_records), output_handle, "fasta")
      return taxa_to_remove
def main():
    """Drop records exceeding the gap cutoffs, trim the rest and write it.

    Reads the FASTA alignment ``infile``, removes every record rejected by
    ``leftGapOperations`` or ``rightGapOperations``, trims the remaining
    alignment with ``trimSelection`` and writes it to ``outfile``.  The ids
    of the removed records are written, one per line, to a ``.rem`` file
    next to ``outfile``.
    """
    import os

    alignment = AlignIO.read(infile, 'fasta')
    new_align_list = []
    removed_list = []
    for record in alignment:
        # The explicit comparisons with True/False are kept from the
        # original because the helpers' return types are not visible here.
        if leftGapOperations(record) != True:
            removed_list.append(record.id)
        elif rightGapOperations(record) == False:
            removed_list.append(record.id)
        else:
            new_align_list.append(record)

    # Strip the extension from the file name only; splitting the whole path
    # on '.' breaks for paths such as "./out.fasta" or "../run.1/out.fasta".
    out_dir, out_name = os.path.split(outfile)
    removed_outfile_name = os.path.join(out_dir, out_name.split('.')[0] + ".rem")
    with open(removed_outfile_name, 'w') as removed_handle:
        removed_handle.write('\n'.join(removed_list))

    new_align = MultipleSeqAlignment(new_align_list, alphabet=IUPAC.extended_dna)
    trim_align = trimSelection(new_align)
    print("Trimmed %i left and %i right" % (getLeftTerminalCutoff(),getRightTerminalCutoff()*-1))
    print("Removed %i sequences due to exceeding gap limits" % (len(removed_list)))
    AlignIO.write(trim_align, outfile, 'fasta')
Beispiel #14
0
def convert(infile, type, outtype, outfile):
	"""Convert an alignment file from one format to another via Bio.AlignIO.

	Every alignment parsed from ``infile`` (format ``type``) is written to
	``outfile`` in format ``outtype``.
	"""
	from Bio import AlignIO

	AlignIO.write(AlignIO.parse(infile, type), outfile, outtype)
Beispiel #15
0
 def constructor(self, recalculate):
     """Write the MSA restricted to the columns where QUERY has no gap.

     Loads the FASTA alignment behind ``pdb_chain_msa_file_wrapper``, finds
     the record whose id is ``'QUERY'`` and keeps only the columns in which
     that record is not ``'-'``.  The reduced alignment is written in FASTA
     format to ``self.get_file_location()``.

     Raises:
         ValueError: if the alignment has no record with id ``'QUERY'``
             (previously the last record was silently used instead).
     """
     f = global_stuff.the_file_manager.get_file_handle(pdb_chain_msa_file_wrapper(self.params), recalculate)
     msa = AlignIO.read(f.name, 'fasta')
     # search for the query sequence
     idx = -1
     for i, record in enumerate(msa):
         if record.id == 'QUERY':
             idx = i
             break
     if idx == -1:
         raise ValueError("no record with id 'QUERY' in %s" % f.name)
     # find the first non-insertion column
     i = 0
     while msa[idx,i] == '-':
         i = i + 1
     to_return = msa[:,i:(i+1)]
     print('EEEEEEEEEEEEEEE')
     # add in all the other columns
     for k in range(i+1, msa.get_alignment_length()):
         if msa[idx,k] != '-':
             to_return = to_return + msa[:,k:(k+1)]
     # ``with`` makes sure the output is flushed and closed.
     with open(self.get_file_location(),'w') as out_handle:
         AlignIO.write(to_return, out_handle, 'fasta')
Beispiel #16
0
 def save(cls, alignments, filename, schema=None):
     """Write ``alignments`` to ``filename``; report success as a boolean.

     The output format comes from ``cls.schema(filename, schema)``.  Any
     exception is caught, reported on stdout and turned into ``False``.
     """
     try:
         AlignIO.write(alignments, filename, cls.schema(filename, schema))
     except Exception as e:
         print('Unable to save alignments to: %s\n%s' % (filename, str(e)))
         return False
     return True
Beispiel #17
0
 def write_alignment(self, filename, file_format, interleaved=None):
     """
     Write ``self._msa`` to ``filename`` with Bio.AlignIO.

     A requested ``'phylip'`` format is written as ``'phylip-relaxed'``;
     every other format name is passed through unchanged.  ``interleaved``
     is accepted but not used.
     """
     out_format = 'phylip-relaxed' if file_format == 'phylip' else file_format
     AlignIO.write(self._msa, filename, out_format)
Beispiel #18
0
def main():
   """Replace alignment ids with the full FASTA descriptions.

   Each non-blank line of ``indexfile.txt`` names a FASTA sequence file and
   a Clustal alignment file.  For every alignment record whose id matches a
   sequence id in the FASTA file, the id is replaced by that sequence's full
   description.  The result is written to ``'new_' + <alignment file>`` in
   Clustal format.
   """
   with open('indexfile.txt','r') as indexfile:
      for line in indexfile:
         files = line.split()
         if not files:
            continue  # tolerate blank lines in the index file
         seqs = SeqIO.to_dict(SeqIO.parse(files[0],'fasta'))
         align = AlignIO.read(files[1],'clustal')
         # sequence id -> full description line
         name_idx = dict((s, seqs[s].description) for s in seqs)
         # alignment id -> row index (the last row wins on duplicate ids)
         aln_dict = dict((align[x].id, x) for x in range(len(align)))
         for sname in name_idx:
            if sname in aln_dict:
               align[aln_dict[sname]].id = name_idx[sname]

         with open('new_'+files[1],"w") as newalign:
            AlignIO.write(align,newalign,'clustal')
Beispiel #19
0
def tree(alignment,
         run_id = 'T%05i' % (0,),
         bionj = False):
  """Build a tree for ``alignment`` by running the external ``phyml`` tool.

  The alignment is written in phylip format to ``infile<run_id>`` inside
  ``config.dataPath('phyml')`` (created if missing), phyml is run on it and
  the resulting ``<infile>_phyml_tree.txt`` is parsed as newick.

  Parameters:
    alignment: alignment to write with ``aio.write``.
    run_id: suffix that makes the work files of this call unique.
    bionj: when true phyml gets ``-o n``, otherwise ``-o tlr``.

  Returns the tree read by ``phylo.read``.  The working directory is
  restored before returning, also when an error occurs.
  """
  old_cwd = os.getcwd()
  new_wd = config.dataPath('phyml')
  if not os.path.isdir(new_wd): os.mkdir(new_wd)
  os.chdir(new_wd)
  try:
    infilepath = 'infile{0}'.format(run_id)
    with open(infilepath,'w') as infile:
      aio.write(alignment, infile, 'phylip')

    command = 'phyml --quiet -i {0} -o {1} '.format(infilepath, 'n' if bionj else 'tlr' )
    print(command)
    # The output is not used.  Sending it to a pipe nobody reads can block
    # phyml once the pipe buffer is full, so discard it instead.
    with open(os.devnull, 'w') as devnull:
      subprocess.call(command,
                      shell = True,
                      stdout = devnull)
    treefilepath = infilepath + '_phyml_tree.txt'
    with open(treefilepath) as treefile:
      result = phylo.read(treefile, 'newick')
  finally:
    os.chdir(old_cwd)
  return result
Beispiel #20
0
    def get_newick_tree(self):
        """Run ``quicktree`` on this alignment and return its raw output.

        quicktree needs a Stockholm file.  If the alignment is already
        stored locally in Stockholm format that file is used directly;
        otherwise ``self.biopy_alignment`` is written to a temporary
        Stockholm file first.

        Returns whatever quicktree printed on stdout (expected to be a
        newick tree; the exit status is not checked).
        """
        temp = None
        try:
            # quicktree expects a stockholm format input file
            if self.local_file.name and self.format == "stockholm":
                fname = self.local_file.path
            else:
                temp = tempfile.NamedTemporaryFile()
                print("writing stockholm format file...")
                AlignIO.write([self.biopy_alignment], temp, "stockholm")
                temp.flush()
                fname = temp.name

            print("opening quicktree on stockholm format file %s" % fname)
            quicktree_out = os.popen('quicktree %s' % fname)   # subprocess.Popen hangs the Django dev server
            try:
                # there should be some elementary error checking here...
                newick_tree = quicktree_out.read()
            finally:
                quicktree_out.close()
            print("quicktree finished")
        finally:
            if temp is not None:
                # The temporary file is deleted when it is closed, so close
                # it only after quicktree has finished reading it (i.e. after
                # read(), not just after popen()).
                temp.close()

        return newick_tree
def main():
    """Strip the locus (file name) prefix from taxon names in nexus files.

    For every input file, each sequence name has a leading
    ``<file base name>`` plus any following underscores removed.  The
    renamed records are written to ``args.output`` under the same file name.
    After each file the set of all taxon names seen so far must contain
    exactly ``args.taxa`` names.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set()
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        # The prefix depends only on the file, so build the pattern once.
        # re.escape keeps '.', '+' etc. in a file name from being read as
        # regex syntax.
        fname = os.path.splitext(os.path.basename(f))[0]
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # ``with`` makes sure the output is flushed and closed.
            with open(outf, 'w') as outfile:
                AlignIO.write(new_align, outfile, 'nexus')
        except ValueError:
            # NOTE(review): drops into the debugger on a write error, kept
            # from the original; consider reporting the file and re-raising.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(all_taxa)))
Beispiel #22
0
def main(args):
    """Remove gap-only columns from a FASTA alignment.

    Reads ``args.fasta``, drops every column that consists solely of ``-``
    and writes the result to ``<basename of args.fasta>_degapped.fasta`` in
    the current directory.  Progress is reported through ``logging.info``.
    """

    def is_gap_only(column):
        return column == '-' * len(column)

    with open(args.fasta, 'r') as handle:
        align = AlignIO.read(handle, "fasta")

    old_length = align.get_alignment_length()
    logging.info('Examining {} columns of aligned fasta file'.format(old_length))
    to_delete = [pos for pos in range(old_length) if is_gap_only(align[:, pos])]

    if to_delete:
        logging.info('Removing {} gap-only columns: {}'.format(len(to_delete), to_delete))
        # Work from the right so earlier removals do not shift the positions
        # that are still to be removed.
        for pos in reversed(to_delete):
            align = align[:, :pos] + align[:, pos+1:]
        new_length = align.get_alignment_length()
        logging.info('Done! Old length: {}  New length: {}  Difference: {}'.
                     format(old_length, new_length, old_length-new_length))

    output_filename = os.path.basename(args.fasta) + '_degapped.fasta'
    with open(output_filename, 'w') as handle:
        AlignIO.write(align, handle, "fasta")
Beispiel #23
0
def add(alignment, sequence, timeout, logger, wd, threads):
    """Align sequence(s) to an alignment using mafft (external program).

    ``alignment`` and ``sequence`` are written as FASTA files into ``wd``,
    ``mafft --add`` is run there through ``TerminationPipe`` with the given
    ``timeout`` and the resulting alignment is read back.  All temporary
    files are removed again, except the output file when it cannot be
    parsed.

    Returns:
        The new alignment, or -- when the pipe reports a failure/timeout --
        ``genNonAlignment(len(alignment) + 1, <alignment length>)``.

    Raises:
        MafftError: if mafft finished but its output cannot be read.
    """
    alignment_file = "alignment_in.fasta"
    sequence_file = "sequence_in.fasta"
    output_file = "alignment_out.fasta.fasta"
    command_line = "{0} --auto --thread {1} --add {2} {3} > {4}".format(
        mafft, threads, sequence_file, alignment_file, output_file
    )
    sequence_path = os.path.join(wd, sequence_file)
    alignment_path = os.path.join(wd, alignment_file)
    output_path = os.path.join(wd, output_file)
    with open(sequence_path, "w") as handle:
        SeqIO.write(sequence, handle, "fasta")
    with open(alignment_path, "w") as handle:
        AlignIO.write(alignment, handle, "fasta")
    pipe = TerminationPipe(command_line, timeout=timeout, cwd=wd)
    try:
        pipe.run()
    finally:
        # The inputs are no longer needed, whether or not the run worked.
        os.remove(alignment_path)
        os.remove(sequence_path)
    if pipe.failure:
        logger.debug(".... add timeout ....")
        # get_alignment_length() already returns the number of columns;
        # wrapping it in len() raised a TypeError on this path.
        return genNonAlignment(len(alignment) + 1, alignment.get_alignment_length())
    try:
        res = AlignIO.read(output_path, "fasta")
    except Exception:
        logger.info(pipe.output)
        raise MafftError()
    os.remove(output_path)
    return res
Beispiel #24
0
def split_family_seqs():
    """Split ``rfam/Rfam.seed`` into one FASTA and one pickle file per family.

    For every Stockholm record in the seed file the ``#`` annotation lines
    preceding the first sequence line are collected into a dict keyed by
    the two characters at ``line[5:7]`` (text from ``line[8:]`` on,
    concatenated per key).  The alignment is written to
    ``rfam/family_alis/<AC>.fa`` and the dict is pickled to
    ``rfam/family_metas/<AC>.pickle``, ``<AC>`` being the stripped ``AC``
    annotation.
    """
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')

    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen,'stockholm')
        while 1:
            infos = {}
            # Peek at the annotation header of the next record, then rewind
            # so the Stockholm parser sees the record from its start.
            start = fopen.tell()
            while 1:
                l = fopen.readline()
                if l == '': break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey,'') + l[8:]
                else:
                    if l.strip() != '': break

            fopen.seek(start)
            # next() with a default: the parser signals the end of the file
            # with StopIteration, which previously escaped from this function.
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname+'.fa'),'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            # Pickles are binary data; 'wb' is correct on every platform.
            with open(os.path.join(meta_dir, rfname+'.pickle'),'wb') as metafile:
                pickle.dump(infos, metafile)
Beispiel #25
0
def load_tree(seqfname):
    """Load an alignment, build & prep a tree, return the tree object.

    ``seqfname`` must be a Clustal alignment (``.aln``) or an unaligned
    FASTA file (``.fasta``); the latter is aligned with MAFFT first.  The
    tree is built with ``fasttree`` from the blocks returned by
    ``alnutils.blocks(aln, 0.4)``.  Clades whose confidence is below the
    mean confidence are collapsed, the tree is ladderized and the root
    branch length is set to 0.

    Raises:
        ValueError: if ``seqfname`` has neither of the supported extensions.
        subprocess.CalledProcessError: if mafft or fasttree fails.
    """
    if seqfname.endswith('.aln'):
        aln = AlignIO.read(seqfname, 'clustal')
    elif seqfname.endswith('.fasta'):
        # Run MAFFT quickly
        alndata = subprocess.check_output(['mafft', '--quiet', '--auto',
                                           seqfname])
        aln = AlignIO.read(StringIO(alndata), 'fasta')
    else:
        raise ValueError("Input sequences must be a Clustal alignment (.aln) "
                         "or unaligned FASTA (.fasta)")

    # Use conserved (less-gappy) blocks to build the tree
    aln = alnutils.blocks(aln, 0.4)
    with tempfile.NamedTemporaryFile(mode='w') as tmp:
        AlignIO.write(aln, tmp, 'fasta')
        tmp.flush()
        treedata = subprocess.check_output(['fasttree',
                                            '-pseudo', '-gamma', '-wag',
                                            tmp.name])
    tree = Phylo.read(StringIO(treedata), 'newick')

    # Collapse weakly supported splits
    confs = [c.confidence
             for c in tree.find_clades()
             if c.confidence is not None]
    # ENH: accept min_confidence as an option
    # Without any confidence values there is nothing to average (and nothing
    # to collapse); skip the step instead of dividing by zero.
    if confs:
        min_confidence = math.fsum(confs) / len(confs)
        tree.collapse_all(lambda c: c.confidence < min_confidence)
    tree.ladderize(reverse=True)
    tree.root.branch_length = 0.0
    return tree
Beispiel #26
0
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    The alignment is written as relaxed phylip to ``<work_dir>/work/aln.phy``
    and RAxML is run on it (algorithm "a", GTRGAMMA, 100 replicates) inside
    ``<work_dir>/work/raxml_work``.  Extra keyword arguments are passed to
    ``RaxmlCommandline``.

    Returns ``(outfile, phy_file)`` on success, with the bipartitions tree
    copied to ``outfile``, or ``None`` when RAxML produced no tree.  The
    RAxML working directory is removed in both cases.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # Close the file explicitly so it is completely on disk before RAxML
    # reads it (and avoid the Python-2-only ``file`` builtin).
    with open(phy_file, "w") as phy_handle:
        AlignIO.write(alignment, phy_handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"),
        sequences=phy_file, algorithm="a", model="GTRGAMMA",
        parsimony_seed=12345, rapid_bootstrap_seed=12345,
        num_replicates=100, name="aln",
        working_dir=raxml_work, **kwargs)

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    raxml_cl()  # RAxML's stdout/stderr are not needed here

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
Beispiel #27
0
def writing(seqs, seq_descs, seq_ids, filename):
    """Write protein sequences as a FASTA alignment file.

    The output directory is read from ``sys.argv[3]``; a sub-folder named
    after ``sys.argv[1]`` is created inside it and the alignment is written
    to ``<outdir>/<sys.argv[1]>/<filename>.output``.

    Args:
        seqs: sequence strings, one per record.
        seq_descs: record descriptions, parallel to `seqs`.
        seq_ids: record identifiers, parallel to `seqs`.
        filename: base name (without extension) of the output file.
    """
    outdir = sys.argv[3]                    # output directory
    if os.path.isdir(outdir):
        print("Directory exists. New directory not created")
    else:
        # os.makedirs avoids building a shell command from the path, so
        # directory names with spaces or shell metacharacters are safe.
        os.makedirs(outdir)

    # outpath is the sub-folder the results are stored in
    outpath = outdir + '/' + sys.argv[1]
    if not os.path.isdir(outpath):
        os.makedirs(outpath)

    output_file = outpath + '/' + filename + '.' + 'output'

    # Index the parallel lists explicitly so that a length mismatch still
    # raises IndexError instead of silently dropping records.
    align = MultipleSeqAlignment([])
    for i, seq in enumerate(seqs):
        align.append(SeqRecord(Seq(seq, generic_protein),
                               id=seq_ids[i],
                               description=seq_descs[i]))

    AlignIO.write(align, output_file, "fasta")
Beispiel #28
0
    def __init__(self,aln,treef,cmd=None):
        if os.path.isfile(aln):
            self.alnfile=aln
            self.aln = AlignIO.read(open(self.alnfile),'fasta')
        else:
            self.aln=aln
            self.alnfile = tempfile.NamedTemporaryFile()
            AlignIO.write(aln,self.alnfile,'fasta')
            self.alnfile.flush()
        if not cmd:
            import sys
            if sys.maxint==9223372036854775807: #64 bit
                cmd='rate4site64'
            else:
                cmd='rate4site'
        if isinstance(treef,dendropy.Tree):
            parent_tree=treef
        elif os.path.isfile(treef):
            parent_tree=dendropy.Tree.get_from_path(treef,'newick')
        self.tree = narrow_tree(parent_tree,self.aln)
        self.treefile = tempfile.NamedTemporaryFile()
#        self.tree.write(self.treefile,'newick',internal_labels=False)
        self.treefile.write(self.tree.as_string('newick',internal_labels=False)[5:])
        self.treefile.flush()
        self.cmd=cmd
Beispiel #29
0
  def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
    """Drop sequences whose share of missing data exceeds a threshold.

    Every record of every FASTA alignment in `input_filename` is inspected.
    The characters ``n``, ``N`` and ``-`` count as missing data. Records
    whose missing-data percentage is at most `filter_percentage` are written
    to `output_filename` as a single FASTA alignment.

    Parameters
    ----------
    input_filename : str
        path of the FASTA alignment(s) to read
    output_filename : str
        path of the FASTA file to write; it is only opened once enough
        records have passed the filter
    filter_percentage : number
        maximum allowed percentage of missing data per sequence
    verbose
        accepted for interface compatibility; not used by this function

    Returns
    -------
    list
        ids of the records that were excluded

    Exits the program (``sys.exit``) when one or fewer records are kept.
    """
    kept_records = []
    taxa_removed = []

    # The context manager guarantees the input handle is closed even when
    # parsing fails or the function exits early.
    with open(input_filename, "r") as input_handle:
        for alignment in AlignIO.parse(input_handle, "fasta"):
            for record in alignment:
                number_of_gaps = sum(record.seq.count(symbol)
                                     for symbol in ('n', 'N', '-'))
                sequence_length = len(record.seq)

                if sequence_length == 0:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because there werent enough bases in it")
                    continue

                percent_gaps = number_of_gaps*100/sequence_length
                if percent_gaps <= filter_percentage:
                    kept_records.append(record)
                else:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because it had " + str(percent_gaps) +" percentage gaps while a maximum of "+ str(filter_percentage) +" is allowed")

    if len(kept_records) <= 1:
        sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")

    with open(output_filename, "w+") as output_handle:
        AlignIO.write(MultipleSeqAlignment(kept_records), output_handle, "fasta")
    return taxa_removed
def read_alignment(alignment, informat, outformat, start, stop):
    """Extract a column range from an alignment file and write it out.

    The alignment is read from the file `alignment` in format `informat`,
    columns ``start:stop`` are sliced out, and the sub-alignment is written
    in format `outformat` to
    ``<input basename>_pos<start>to<end>.<outformat>``, where ``<end>`` is
    `stop` clamped to the alignment length. Progress messages and a preview
    of the sub-alignment are printed to stdout.

    Parameters
    ----------
    alignment : str
        path of the input alignment file
    informat : str
        format name of the input file
    outformat : str
        format name of the output; also used as the output file extension
    start : int
        first column of the slice (0-based)
    stop : int
        end column of the slice (exclusive)

    Exits with status 1 when ``stop < start`` or ``start < 0``.
    """
    align = AlignIO.read(alignment, informat, alphabet=generic_dna)
    out_basename = os.path.splitext(alignment)[0]
    algn_length = align.get_alignment_length()
    print("\nInput alignment is "+str(algn_length)+" characters.")
    end_pos = stop
    if stop > algn_length:
        print("\nNB: you have requested an end position beyond the "
              "length of the alignment.  ")
        end_pos = algn_length
    if stop < start or start < 0:
        print("\nFatal: your begin and end positions need re-assessment."
              "  Exiting now.")
        print("")
        # A fatal condition must not report success to the calling shell.
        sys.exit(1)
    outname = out_basename+"_pos"+str(start)+"to"+str(end_pos)+"."+outformat
    with open(outname, "w") as output_handle:
        # Slicing with the clamped end matches the name of the output file;
        # Python slicing would clamp an oversized stop to the same columns.
        algn = align[:, start:end_pos]
        AlignIO.write(algn, output_handle, outformat)
        print("\nExtracted "+outformat+"-formatted sub-alignment from "
              "positions "+str(start)+" to "+str(end_pos)+" and written it to "
              +outname+".  Here is a preview:")
        print("")
        print(algn)
        print("")
Beispiel #31
0
        # Expected - check the error
        assert "Repeated name 'longsequen'" in str(e)

check_phylip_reject_duplicate()


# Check parsers can cope with an empty file
for t_format in AlignIO._FormatToIterator:
    handle = StringIO()
    alignments = list(AlignIO.parse(handle, t_format))
    assert len(alignments) == 0

# Check writers can cope with no alignments
for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter):
    handle = StringIO()
    assert 0 == AlignIO.write([], handle, t_format), \
           "Writing no alignments to %s format should work!" \
           % t_format

# Check writers reject non-alignments
# (a plain list of records is passed instead of an alignment object)
with open("Clustalw/opuntia.aln") as clustal_handle:
    list_of_records = list(AlignIO.read(clustal_handle, "clustal"))
for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter):
    handle = StringIO()
    try:
        AlignIO.write([list_of_records], handle, t_format)
    except (TypeError, AttributeError, ValueError):
        # Expected: the writer refused the non-alignment input
        pass
    else:
        # Raised explicitly so the check is not stripped under ``python -O``
        raise AssertionError(
            "Writing non-alignment to %s format should fail!" % t_format)
    del handle
del list_of_records, t_format, clustal_handle
# Run ClustalW; the command-line wrapper returns the captured stdout and stderr
clustaltextout, clustaltexterr = clustalw_cline()

if len(clustaltexterr) > 0:
    print("error:\n%s"%(clustaltexterr))
    # Stop with a non-zero status so callers can detect the failure; the
    # interactive ``exit()`` helper is not guaranteed to exist in scripts.
    raise SystemExit(1)

print ("clustalw output:\n %s"%(clustaltextout))

# read in the alignment file   and create a MultipleSeqAlignment object
clustalalignment = AlignIO.read("p53_homologous.aln", "clustal")

#write the alignment to a format that can be read by PhyML

alignout_filename = "p53_homologous.out"
AlignIO.write(clustalalignment,alignout_filename,"phylip-relaxed")

print("Making tree (takes around 75 seconds): ")

# specify the location of the phyml executable (this depends on your machine)
phyml_exe_path = r"D:\SCHOOL\fall 2020\Biological Models in Python\Week 7\PhyML-3.1_win32.exe"

# check that the path exists; an explicit test is used instead of ``assert``
# because assertions are removed when Python runs with -O
if not os.path.isfile(phyml_exe_path):
    raise SystemExit("PhyML executable missing")

# create an instance of a Bio.AlignApplication that can be called like a function and runs phyml
phymlcmd = PhymlCommandline(cmd=phyml_exe_path,input=alignout_filename)
phymltextout,phymltexterr = phymlcmd()

print ("PhyML output:\n%s"%(phymltextout))
Beispiel #33
0
def simplex(params,
            out_prefix=None,
            yule=True,
            n_model=5,
            n_seqgen=5,
            JC=False,
            alphabet='nuc_nogap',
            alpha=1.0,
            rate_alpha=1.5,
            W_dirichlet_alpha=2.0):
    """Generate a tree and random GTR models with frequency parameters sampled
    from a Dirichlet distribution on the simplex, then evolve sequences.

    One tree is generated. For each of `n_model` random site-specific GTR
    models, sequences are evolved along that tree `n_seqgen` times. Nothing
    is returned; results are only saved when `out_prefix` is given.

    Parameters
    ----------
    params : dict
        dictionary with parameters of the evolutionary process, sample size
        etc. The keys ``'n'``, ``'L'`` and ``'m'`` are read. The dictionary
        is modified in place: ``'model'`` and ``'seqgen'`` are set to the
        current model and replicate index before output names are built.
    out_prefix : None, optional
        save the generated data using this prefix and otherwise standardized
        file names
    yule : bool, optional
        generate a Yule tree instead of a Kingman Coalescent tree
    n_model : int, optional
        number of distinct models to draw for each tree
    n_seqgen : int, optional
        number of times sequences are evolved for each tree/model combination
    JC : bool, optional
        Use a Jukes Cantor model for the preference but include rate variation
    alphabet : str, optional
        alphabet of the GTR model
    alpha : float, optional
        parameter of the Dirichlet distribution for frequencies
    rate_alpha : float, optional
        parameter of the rate distribution (Gamma)
    W_dirichlet_alpha : float, optional
        parameter of the Dirichlet distribution of W matrix elements

    Notes
    -----
    The tree is always built with ``betatree(..., alpha=2.0)``; the `alpha`
    argument of this function only affects the frequency distribution.
    """
    import subprocess

    from Bio import AlignIO
    # generate a model
    T = betatree(params['n'], alpha=2.0)
    T.yule = yule
    T.coalesce()
    # ladderize the tree and name internal nodes via loading into TreeAnc
    T.BioTree.ladderize()
    tt = TreeAnc(tree=T.BioTree)
    if out_prefix:
        Phylo.write(tt.tree, tree_name(out_prefix, params), 'newick')

    for mi in range(n_model):
        params['model'] = mi
        if JC:
            myGTR = GTR_site_specific.random(L=params['L'],
                                             alphabet=alphabet,
                                             pi_dirichlet_alpha=False,
                                             W_dirichlet_alpha=False,
                                             mu_gamma_alpha=rate_alpha)
        else:
            myGTR = GTR_site_specific.random(
                L=params['L'],
                alphabet=alphabet,
                pi_dirichlet_alpha=alpha,
                mu_gamma_alpha=rate_alpha,
                W_dirichlet_alpha=W_dirichlet_alpha)

        # scale the site-specific rates by the overall rate from params
        myGTR.mu *= params['m']

        if out_prefix:
            save_model(myGTR, model_name(out_prefix, params))

        for si in range(n_seqgen):
            params['seqgen'] = si
            # generate sequences
            mySeq = SeqGen(params['L'], gtr=myGTR, tree=T.BioTree)
            mySeq.evolve()

            if out_prefix:
                save_mutation_count(mySeq,
                                    mutation_count_name(out_prefix, params))
                raw_alignment = alignment_name_raw(out_prefix, params)
                with open(raw_alignment, 'wt') as fh:
                    AlignIO.write(mySeq.get_aln(), fh, 'fasta')
                reconstruct_tree(out_prefix, params, aa='aa' in alphabet)
                # Compress the raw alignment after tree reconstruction. An
                # argument list (no shell) keeps prefixes containing spaces
                # or shell metacharacters from being misinterpreted.
                subprocess.call(['gzip', raw_alignment])
Beispiel #34
0
from Bio import AlignIO
import sys

# Convert the FASTA alignment(s) in argv[1] to PHYLIP format in argv[2].
# The context managers close both files even if parsing or writing fails.
# Plain "r" is used because the "U" mode flag was removed in Python 3.11.
with open(sys.argv[1], "r") as input_handle, \
        open(sys.argv[2], "w") as output_handle:
    alignments = AlignIO.parse(input_handle, "fasta")
    AlignIO.write(alignments, output_handle, "phylip")