Example #1
0
def main():
    """Relabel Clustal alignment records with their full FASTA descriptions.

    Reads ``indexfile.txt`` from the current directory.  Each non-blank line
    holds a FASTA sequence file name followed by a Clustal alignment file
    name.  Every alignment record whose id equals a FASTA record id gets that
    record's full description as its new id, and the alignment is written in
    Clustal format to ``new_<alignment file name>``.
    """
    with open('indexfile.txt', 'r') as indexfile:
        for line in indexfile:
            files = line.split()
            if not files:
                # A blank line (e.g. trailing newline) carries no work.
                continue
            seqs = SeqIO.to_dict(SeqIO.parse(files[0], 'fasta'))
            align = AlignIO.read(files[1], 'clustal')

            # Map each alignment id to its row index; for duplicate ids the
            # last row wins, exactly as in a plain assignment loop.
            aln_index = {}
            for row, record in enumerate(align):
                aln_index[record.id] = row

            for name, record in seqs.items():
                # ``in`` replaces dict.has_key(), which Python 3 removed.
                if name in aln_index:
                    align[aln_index[name]].id = record.description

            with open('new_' + files[1], "w") as newalign:
                AlignIO.write(align, newalign, 'clustal')
Example #2
0
    def check_AlignIO_to_EMBOSS(self, in_filename, in_format, skip_formats=(),
                                alphabet=None):
        """Check that seqret can read back what Bio.AlignIO writes.

        The input file is parsed with AlignIO, written out in each candidate
        format, piped through seqret to PHYLIP, and the result is compared
        with the original alignments.  Formats listed in ``skip_formats`` are
        not attempted.  A ValueError raised by the comparison is re-raised
        with the file and format added to the message.
        """
        parse_args = [in_filename, in_format]
        if alphabet:
            parse_args.append(alphabet)
        old_aligns = list(AlignIO.parse(*parse_args))

        # FASTA and NEXUS are only tried when there is a single alignment.
        candidates = ["clustal", "phylip"]
        if len(old_aligns) == 1:
            candidates += ["fasta", "nexus"]

        for temp_format in candidates:
            if temp_format in skip_formats:
                continue
            # PHYLIP is used as the seqret output format because it
            # explicitly supports multiple alignments (unlike FASTA).
            try:
                converted = emboss_piped_AlignIO_convert(old_aligns,
                                                         temp_format,
                                                         "phylip")
                new_aligns = list(converted)
            except ValueError:
                # e.g. "Need a DNA, RNA or Protein alphabet" when writing
                # Nexus files; such formats are simply skipped.
                continue
            try:
                self.assertTrue(compare_alignments(old_aligns, new_aligns))
            except ValueError as err:
                details = (in_format, in_filename, temp_format, err)
                raise ValueError("Disagree on file %s %s in %s format: %s"
                                 % details)
Example #3
0
 def setUp(self):
     """Build a codon alignment for every test fixture.

     Each fixture is indexed as ``fixture[0]`` (a sequence of file paths)
     and ``fixture[1]`` (a mode string: 'parse', 'index' or 'id').  The
     resulting codon alignments are stored in ``self.alns``.
     """
     def quiet_build(protein_aln, nucleotides, **extra):
         # Build the codon alignment with all warnings silenced.
         with warnings.catch_warnings():
             warnings.simplefilter('ignore')
             return codonalign.build(
                 protein_aln, nucleotides,
                 alphabet=codonalign.default_codon_alphabet, **extra)

     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     dna = IUPAC.IUPACUnambiguousDNA
     built = []
     for fixture in self.aln_file:
         mode = fixture[1]
         if mode == 'parse':
             nucl = SeqIO.parse(fixture[0][0], 'fasta', alphabet=dna())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             caln = quiet_build(prot, nucl)
         elif mode == 'index':
             nucl = SeqIO.index(fixture[0][0], 'fasta', alphabet=dna())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             caln = quiet_build(prot, nucl, max_score=20)
         elif mode == 'id':
             nucl = SeqIO.parse(fixture[0][0], 'fasta', alphabet=dna())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             # The third file maps ids: first field -> second field per line.
             with open(fixture[0][2]) as handle:
                 corr_map = {row.split()[0]: row.split()[1] for row in handle}
             caln = quiet_build(prot, nucl, corr_dict=corr_map)
         built.append(caln)
         nucl.close()  # Close the indexed FASTA file
     self.alns = built
def read_alignment(alignment, informat, outformat, start, stop):
    """Extract columns ``start:stop`` of an alignment into a new file.

    Args:
        alignment: path of the input alignment file.
        informat: AlignIO format name of the input.
        outformat: AlignIO format name of the output; also used as the
            output file extension.
        start: first column (0-based, inclusive) to keep.
        stop: column at which to stop (exclusive); clamped to the alignment
            length when it lies beyond the end.

    The sub-alignment is written next to the input as
    ``<basename>_pos<start>to<end>.<outformat>`` and previewed on stdout.
    Exits the process with status 1 when ``stop < start`` or ``start < 0``.
    """
    align = AlignIO.read(alignment, informat, alphabet=generic_dna)
    out_basename = os.path.splitext(alignment)[0]
    algn_length = align.get_alignment_length()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\nInput alignment is " + str(algn_length) + " characters.")
    end_pos = stop
    if stop > algn_length:
        print("\nNB: you have requested an end position beyond the "
              "length of the alignment.  ")
        end_pos = algn_length
    if stop < start or start < 0:
        print("\nFatal: your begin and end positions need re-assessment."
              "  Exiting now.")
        print("")
        # A fatal input error must not report success to the caller's shell.
        sys.exit(1)
    outname = (out_basename + "_pos" + str(start) + "to" + str(end_pos)
               + "." + outformat)
    with open(outname, "w") as output_handle:
        # end_pos equals stop unless stop ran past the end, where slicing
        # would clamp anyway; using it keeps the slice and file name in step.
        algn = align[:, start:end_pos]
        AlignIO.write(algn, output_handle, outformat)
        print("\nExtracted " + outformat + "-formatted sub-alignment from "
              "positions " + str(start) + " to " + str(end_pos)
              + " and written it to " + outname + ".  Here is a preview:")
        print("")
        print(algn)
        print("")
Example #5
0
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Pipe alignments through seqret and return the converted alignments.

    The alignments are written to seqret's stdin in ``old_format`` and its
    stdout is parsed as ``new_format``.  The result is fully read into a
    list (not a lazy generator) so that the pipe can be closed before
    returning.  Any exception from writing or parsing propagates after the
    pipe handles have been closed.
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    cline = SeqretCommandline(exes["seqret"],
                              sformat=old_format,
                              osformat=new_format,
                              auto=True,  # no prompting
                              filter=True)
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        try:
            AlignIO.write(alignments, child.stdin, old_format)
        finally:
            # stdin and stderr are closed before stdout is read, whether or
            # not the write succeeded; nested so that a failing stdin close
            # cannot leave stderr open.
            try:
                child.stdin.close()
            finally:
                child.stderr.close()
        return list(AlignIO.parse(child.stdout, new_format))
    finally:
        child.stdout.close()
Example #6
0
 def save(cls, alignments, filename, schema=None):
     """Write alignments to ``filename``; report success as a boolean.

     The output format is resolved by ``cls.schema(filename, schema)``.
     Any exception raised while resolving the format or writing is printed
     and turned into a ``False`` return value; otherwise returns ``True``.
     """
     try:
         AlignIO.write(alignments, filename, cls.schema(filename, schema))
     # "except X as e" and print(...) are valid on Python 2.6+ and Python 3;
     # the old "except X, e" / print-statement forms are Python 2 only.
     except Exception as e:
         print('Unable to save alignments to: %s\n%s' % (filename, str(e)))
         return False
     return True
def main():
    """Convert an alignment file to another format from the command line.

    Expects exactly three arguments: input file, input format and output
    format.  The output is written next to the input, with the input's last
    extension replaced by the output format name.

    * 'nexus' output is converted with an ambiguous-DNA alphabet.
    * 'mega' output is written by hand; the input is always parsed as
      'phylip-relaxed' in that case, regardless of the stated input format.
    * every other format goes straight through ``AlignIO.convert``.
    """
    if len(sys.argv) != 4:
        print("Please provide file, the file format, and the desired file format ")
        sys.exit(1)

    f = sys.argv[1]
    # Drop only the last extension; joining with '.' keeps any earlier dots
    # in the name ("a.b.phy" -> "a.b" rather than "ab").
    fout = ".".join(f.split('.')[:-1])
    formatin = sys.argv[2]
    formatout = sys.argv[3]
    outname = fout + '.' + formatout

    # The three cases are mutually exclusive: with separate "if" statements
    # a nexus conversion also fell into the generic branch and ran twice.
    if formatout == 'nexus':
        AlignIO.convert(f, formatin, outname, formatout,
                        alphabet=IUPAC.ambiguous_dna)
    elif formatout == 'mega':
        # NOTE(review): the 'U' open flag was removed in Python 3.11; it is
        # kept here because the rest of this script targets Python 2.
        with open(f, "rU") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))

        with open(outname, 'w') as outfile:
            outfile.write('#mega' + "\n")
            outfile.write('!Title Mytitle;' + "\n")
            outfile.write('!Format DataType=DNA indel=-;' + "\n\n")

            for n in record_dict:
                outfile.write('#' + n + "\n")
                # Sequences are wrapped to 60 characters per line.
                for s in wrap(str(record_dict[n].seq), 60):
                    outfile.write(s + "\n")
    else:
        AlignIO.convert(f, formatin, outname, formatout)
Example #8
0
 def write_alignment(self, filename, file_format, interleaved=None):
     """
     Write ``self._msa`` to ``filename`` with Bio.AlignIO.

     A requested format of 'phylip' is written as 'phylip-relaxed'; every
     other format name is passed through unchanged.  The ``interleaved``
     argument is accepted but not used.
     """
     target_format = 'phylip-relaxed' if file_format == 'phylip' else file_format
     AlignIO.write(self._msa, filename, target_format)
Example #9
0
 def conversion(self, prank_number, prank_ext, format):
     """Run a PRANK conversion and verify the output against the input.

     ``prank_number`` is PRANK's numeric format code, ``prank_ext`` the
     extension PRANK appends to the output name, and ``format`` the AlignIO
     name used to read the converted file back.
     """
     converted = "%s.%s" % (self.output, prank_ext)
     # Remove any stale output so the existence check below is meaningful.
     if os.path.isfile(converted):
         os.remove(converted)

     cmdline = PrankCommandline(prank_exe, d=self.input,
                                convert=True, f=prank_number,
                                o='"%s"' % self.output)
     expected_cmd = (_escape_filename(prank_exe)
                     + ' -d=%s' % self.input
                     + ' -o="%s"' % self.output
                     + ' -f=%i' % prank_number
                     + ' -convert')
     self.assertEqual(str(cmdline), expected_cmd)
     # The command line object must survive a repr/eval round trip.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

     message, error = cmdline()
     self.assertTrue("PRANK" in message, message)
     progress_note = "converting '%s' to '%s'" % (self.input, converted)
     self.assertTrue(progress_note in message, message)
     self.assertEqual(error, "")
     self.assertTrue(os.path.isfile(converted))

     original = AlignIO.read(self.input, "fasta")
     # Hack: ids are cut to 10 characters before comparing PHYLIP output.
     if format == "phylip":
         for rec in original:
             rec.id = rec.id[:10]
     roundtrip = AlignIO.read(converted, format)
     self.assertEqual(len(original), len(roundtrip))
     for before, after in zip(original, roundtrip):
         self.assertEqual(before.id, after.id)
         self.assertEqual(str(before.seq), str(after.seq))
     os.remove(converted)
Example #10
0
 def __init__(self, file_name=None, data = None, format='fasta'):
     """Initialise from a file, from in-memory text, or empty.

     ``file_name`` takes precedence over ``data``; when neither is given
     the parent class is initialised with an empty list.
     """
     if file_name:
         source = AlignIO.read(file_name, format)
     elif data:
         # NOTE(review): this passes the AlignIO.parse iterator (which yields
         # whole alignments), whereas the file branch passes the single
         # alignment from AlignIO.read -- confirm this is intended.
         source = AlignIO.parse(StringIO(data), format)
     else:
         source = []
     super(Alignment, self).__init__(source)
Example #11
0
    def get_newick_tree(self):
        """Run quicktree on this alignment and return its output as text.

        quicktree expects a Stockholm file.  When the object already has a
        local Stockholm file it is used directly; otherwise the alignment is
        written to a temporary Stockholm file first.  The temporary file and
        the pipe to quicktree are closed even if an error occurs.
        """
        temp = None
        try:
            # quicktree expects a stockholm format input file
            if self.local_file.name and self.format == "stockholm":
                fname = self.local_file.path
            else:
                temp = tempfile.NamedTemporaryFile()
                print("writing stockholm format file...")
                AlignIO.write([self.biopy_alignment], temp, "stockholm")
                temp.flush()
                fname = temp.name

            print("opening quicktree on stockholm format file %s" % fname)
            # os.popen is used on purpose: subprocess.Popen hangs the Django
            # dev server.
            # NOTE(review): fname is interpolated into a shell command
            # unquoted; confirm paths can never contain shell metacharacters.
            quicktree_out = os.popen('quicktree %s' % fname)
            try:
                # there should be some elementary error checking here...
                newick_tree = quicktree_out.read()
            finally:
                quicktree_out.close()
            print("quicktree finished")
        finally:
            if temp is not None:
                # A NamedTemporaryFile is deleted when closed, so close it
                # only after quicktree has finished reading (i.e. after
                # read(), not just after popen()).
                temp.close()

        return newick_tree
Example #12
0
def tree(alignment,
         run_id = 'T%05i' % (0,),
         bionj = False):
    """Build a tree for ``alignment`` by running the external ``phyml``.

    The alignment is written in PHYLIP format to ``infile<run_id>`` inside
    the 'phyml' data directory, phyml is run on it, and the resulting
    ``<infile>_phyml_tree.txt`` is parsed as Newick and returned.

    Args:
        alignment: alignment to write and analyse.
        run_id: suffix making the work file names unique.
        bionj: when true phyml gets ``-o n``, otherwise ``-o tlr``.

    The process working directory is changed for the duration of the run
    and restored afterwards, including when an error occurs.
    """
    old_cwd = os.getcwd()
    new_wd = config.dataPath('phyml')
    if not os.path.isdir(new_wd):
        os.mkdir(new_wd)
    os.chdir(new_wd)
    try:
        infilepath = 'infile{0}'.format(run_id)
        with open(infilepath, 'w') as infile:
            aio.write(alignment, infile, 'phylip')

        command = 'phyml --quiet -i {0} -o {1} '.format(
            infilepath, 'n' if bionj else 'tlr')
        print(command)
        # The output is discarded.  A PIPE that nobody reads can block the
        # child once the pipe buffer fills, so send it to the null device.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(command, shell=True, stdout=devnull)

        treefilepath = infilepath + '_phyml_tree.txt'
        with open(treefilepath) as treefile:
            return phylo.read(treefile, 'newick')
    finally:
        os.chdir(old_cwd)
Example #13
0
def design_primers(source_dir, target_dir, settings, logfile):
    """Copy readable FASTA alignments to ``target_dir`` and design primers.

    ``target_dir`` is purged first.  Every ``*.fasta`` file in
    ``source_dir`` that AlignIO can read is copied over; any failure while
    reading or copying is logged as a possible empty alignment.  PriFi is
    then run on each copied file and all messages go to ``logfile``.
    """
    def log(message):
        print(message, file=logfile)

    log("\nDesigning primers using PriFi...\n")
    # get rid of previous files
    utils.purge_dir(target_dir)
    candidates = glob(os.path.join(source_dir, '*.fasta'))
    log("\tChecking for empty alignments...")
    for path in candidates:
        try:
            # Parsed only to confirm the file holds a readable alignment.
            AlignIO.read(path, 'fasta')
            destination = os.path.join(target_dir, os.path.basename(path))
            shutil.copyfile(path, destination)
        except Exception:
            log("[WARNING] Empty alignment file?! (%s)" % path)

    # call PriFi for actual primer design
    for path in glob(os.path.join(target_dir, '*.fasta')):
        aln = AlignIO.read(path, 'fasta')
        summary = AlignInfo.SummaryInfo(aln)
        n_columns = aln.get_alignment_length()
        primerpairs = prifipy.findprimers(0, list(aln), summary, n_columns,
                                          settings, logfile)
        if primerpairs:
            log('%s: Found %d primer pair suggestions. Writing primer files:'
                % (path, len(primerpairs)))
            prifipy.writePrimersToFiles(path, primerpairs, 1, logfile)
        else:
            log("%s: No valid primer pair found" % path)
Example #14
0
def main(args):
    """Remove gap-only columns from the FASTA alignment in ``args.fasta``.

    The cleaned alignment is written to the current directory as
    ``<input file name>_degapped.fasta``; progress is reported via logging.
    """
    with open(args.fasta, 'r') as handle:
        align = AlignIO.read(handle, "fasta")

    def only_gaps(col):
        return col == '-' * len(col)

    old_length = align.get_alignment_length()
    logging.info('Examining {} columns of aligned fasta file'.format(old_length))
    to_delete = [pos for pos in range(old_length) if only_gaps(align[:, pos])]

    if to_delete:
        logging.info('Removing {} gap-only columns: {}'.format(len(to_delete), to_delete))
        # Delete from the right so earlier column indices stay valid.
        for pos in sorted(to_delete, reverse=True):
            left, right = align[:, :pos], align[:, pos+1:]
            align = left + right
        new_length = align.get_alignment_length()
        summary = 'Done! Old length: {}  New length: {}  Difference: {}'
        logging.info(summary.format(old_length, new_length,
                                    old_length - new_length))

    output_filename = os.path.basename(args.fasta) + '_degapped.fasta'
    with open(output_filename, 'w') as handle:
        AlignIO.write(align, handle, "fasta")
Example #15
0
def multiple_alignment(fasta_dict, alignment_type=SeqTypeData().TYPE_DEFAULT):
    """Align the sequences of ``fasta_dict`` with an external aligner.

    The sequences are serialised as FASTA, piped through the command that
    ``SeqTypeData().type2cmd`` maps to ``alignment_type``, and the Clustal
    output is parsed.  Returns a deep copy of ``fasta_dict`` in which every
    aligned record has been stored via ``set(id, sequence)``; the input
    object is left untouched.
    """
    in_handle = StringIO()
    fasta_tools.write_fasta_handle(in_handle, fasta_dict)
    fasta_text = in_handle.getvalue()

    muscle_cmd = SeqTypeData().type2cmd[alignment_type]
    child = subprocess.Popen(str(muscle_cmd), stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"))

    # The pipes carry bytes on Python 3 and plain str on Python 2.
    is_py3 = sys.version_info[0] == 3
    payload = fasta_text.encode('utf-8') if is_py3 else fasta_text
    # communicate() feeds stdin and drains stdout and stderr together, so
    # the child cannot stall on a full pipe, and all three pipes get closed.
    stdout_data, _ = child.communicate(payload)
    if is_py3:
        stdout_data = stdout_data.decode()
    align = AlignIO.read(StringIO(stdout_data), "clustal")

    fd = copy.deepcopy(fasta_dict)
    for a in align:
        fd.set(a.id, str(a.seq))

    return fd
Example #16
0
def split_family_seqs():
    """Split the Rfam seed file into per-family alignment and metadata files.

    For each alignment in ``rfam/Rfam.seed`` the leading '#' lines are
    collected into a dict and the alignment is parsed as Stockholm.  The
    alignment is written as FASTA to ``rfam/family_alis/<AC>.fa`` and the
    dict is pickled to ``rfam/family_metas/<AC>.pickle``, where ``<AC>`` is
    the stripped value stored under the 'AC' key.
    """
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')

    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen, 'stockholm')
        while True:
            infos = {}
            # Remember where this record starts: the header is scanned by
            # hand first, then the handle is rewound for the parser.
            start = fopen.tell()
            while True:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    # Characters 5-6 are used as the key and everything from
                    # column 8 on as the value (presumably Stockholm
                    # "#=GF <tag> <text>" lines -- TODO confirm).  Repeated
                    # keys are concatenated.
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                elif l.strip() != '':
                    # First non-blank, non-comment line ends the header.
                    break

            fopen.seek(start)
            # next() with a default ends the loop at end of input; calling
            # alis.next() raised StopIteration there instead (and does not
            # exist on Python 3 iterators).
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()

            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            # Pickle data is binary; 'wb' works on Python 2 and 3.
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'wb') as metafile:
                pickle.dump(infos, metafile)
def main():
    """Strip the file-name prefix from taxon names in NEXUS alignments.

    For every input file, each record's name has a leading
    ``<file base name>`` plus any following underscores removed.  The
    renamed records are collected into one alignment per file and written
    as NEXUS to ``args.output`` under the same file name.  After each file
    the number of distinct taxon names seen so far must equal
    ``args.taxa``.

    Raises:
        AssertionError: if the taxon-name count differs from ``args.taxa``.
    """
    args = get_args()
    # iterate through all the files to determine the longest alignment
    files = get_files(args.input)
    all_taxa = set([])
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        fname = os.path.splitext(os.path.basename(f))[0]
        # Escape the name so characters such as '.' or '+' in a file name
        # are matched literally rather than as regex syntax.
        prefix_pattern = "^{}_*".format(re.escape(fname))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                new_seq_name = re.sub(prefix_pattern, "", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        # Raised explicitly so the check is not stripped under "python -O".
        if len(all_taxa) != args.taxa:
            raise AssertionError("Taxon names are not identical")
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # NOTE(review): a debugger trap left in place on purpose to keep
            # current behaviour; consider logging and re-raising instead.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
Example #18
0
def _input(a):

    """The function converts alignments to matrix for further use.
  
    Arguments:
    -a- alignment file
    
    Example:
    >>>import os
    >>>import numpy as np
    >>>from Bio import AlignIO
    >>>_input("example.fasta")"""
        
    fileName, fileExtension = os.path.splitext(a)
    
    if fileExtension == ".phylip":
        try:
            l = list(AlignIO.read(a,"phylip"))
        except (ValueError):
            l = list(AlignIO.read(a,"phylip-relaxed"))
        except:
            pass
    elif fileExtension == ".fasta":
        l = list(AlignIO.read(a,"fasta"))
    else:
        raise Exception("Wrong format. Choose accepted format.")

    p = [[i for i in str(l[j].seq)] for j in range(0,len(l))]
    y = np.array(p)
    return(y)
Example #19
0
 def conversion(self, prank_number, prank_ext, format):
     """Get PRANK to do a conversion, and check it with AlignIO.

     ``prank_number`` is PRANK's numeric format code, ``prank_ext`` the
     extension PRANK appends to the output name, and ``format`` the AlignIO
     name used to read the converted file back.
     """
     filename = "%s.%s" % (self.output, prank_ext)
     if os.path.isfile(filename):
         os.remove(filename)
     cmdline = PrankCommandline(prank_exe, d=self.input,
                                convert=True, f=prank_number,
                                o='"%s"' % self.output)
     self.assertEqual(str(cmdline), prank_exe
                      + ' -d=%s' % self.input
                      + ' -o="%s"' % self.output
                      + ' -f=%i' % prank_number
                      + ' -convert')
     # The command line object must survive a repr/eval round trip.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     message = stdout.read().strip()
     # assertTrue replaces the deprecated assert_ alias.
     self.assertTrue(("PRANK: converting '%s' to '%s'"
                      % (self.input, filename)) in message, message)
     self.assertEqual(stderr.read(), "")
     self.assertEqual(str(result._cl), str(cmdline))
     self.assertTrue(os.path.isfile(filename))
     with open(self.input) as handle:
         old = AlignIO.read(handle, "fasta")
     # Hack: ids are cut to 10 characters before comparing PHYLIP output.
     if format == "phylip":
         for record in old:
             record.id = record.id[:10]
     with open(filename) as handle:
         new = AlignIO.read(handle, format)
     # A unittest assertion (unlike a bare assert) is not stripped by -O.
     self.assertEqual(len(old), len(new))
     for old_r, new_r in zip(old, new):
         self.assertEqual(old_r.id, new_r.id)
         self.assertEqual(str(old_r.seq), str(new_r.seq))
     os.remove(filename)
Example #20
0
def add(alignment, sequence, timeout, logger, wd, threads):
    """Align sequence(s) to an alignment using mafft (external program).

    Args:
        alignment: existing alignment the sequences are added to.
        sequence: sequence record(s) to add.
        timeout: passed to ``TerminationPipe`` as its timeout.
        logger: logger used for diagnostic output.
        wd: working directory for the temporary input and output files.
        threads: value for mafft's ``--thread`` option.

    Returns:
        The alignment read from mafft's output, or the result of
        ``genNonAlignment`` when the pipe reports a failure.

    Raises:
        MafftError: if mafft's output file cannot be read as FASTA.
    """
    alignment_file = "alignment_in.fasta"
    sequence_file = "sequence_in.fasta"
    output_file = "alignment_out.fasta" + ".fasta"
    command_line = "{0} --auto --thread {1} --add {2} {3} > {4}".format(
        mafft, threads, sequence_file, alignment_file, output_file
    )
    sequence_path = os.path.join(wd, sequence_file)
    alignment_path = os.path.join(wd, alignment_file)
    output_path = os.path.join(wd, output_file)
    with open(sequence_path, "w") as handle:
        SeqIO.write(sequence, handle, "fasta")
    with open(alignment_path, "w") as handle:
        AlignIO.write(alignment, handle, "fasta")
    pipe = TerminationPipe(command_line, timeout=timeout, cwd=wd)
    pipe.run()
    os.remove(alignment_path)
    os.remove(sequence_path)
    if pipe.failure:
        logger.debug(".... add timeout ....")
        # get_alignment_length() already returns the column count; wrapping
        # it in len() raised TypeError on this path.
        return genNonAlignment(len(alignment) + 1,
                               alignment.get_alignment_length())
    try:
        res = AlignIO.read(output_path, "fasta")
    except Exception:
        logger.info(pipe.output)
        raise MafftError()
    os.remove(output_path)
    return res
Example #21
0
 def check_bootstrap(self, filename, format, align_type="d"):
     """Check we can use fseqboot to pseudosample an alignment.

     The align_type argument is passed to the command line object to set
     the output format to use (from [D]na, [p]rotein and [r]na).

     Raises ValueError if fseqboot exits with a non-zero return code.
     """
     # assertTrue replaces the deprecated assert_ alias.
     self.assertTrue(os.path.isfile(filename), "Missing %s" % filename)
     cline = FSeqBootCommandline(exes["fseqboot"],
                                 sequence=filename,
                                 outfile="test_file",
                                 seqtype=align_type,
                                 reps=2,
                                 auto=True, filter=True)
     return_code = run_command(cline)
     if return_code != 0:
         raise ValueError("Return code %s from:\n%s"
                          % (return_code, str(cline)))
     # the resultant file should have 2 alignments...
     with open("test_file", "r") as handle:
         bs = list(AlignIO.parse(handle, format))
     self.assertEqual(len(bs), 2)
     # ..and each name in the original alignment...
     with open(filename, "r") as handle:
         a_names = [s.name.replace(" ", "_")
                    for s in AlignIO.read(handle, format)]
     # ...should be in each alignment in the bootstrapped file
     for a in bs:
         self.assertEqual(a_names, [s.name.replace(" ", "_") for s in a])
Example #22
0
 def conversion(self, prank_number, prank_ext, format):
     """Get PRANK to do a conversion, and check it with AlignIO.

     ``prank_number`` is PRANK's numeric format code, ``prank_ext`` the
     extension PRANK appends to the output name, and ``format`` the AlignIO
     name used to read the converted file back.
     """
     filename = "%s.%s" % (self.output, prank_ext)
     if os.path.isfile(filename):
         os.remove(filename)
     cmdline = PrankCommandline(prank_exe, d=self.input, convert=True, f=prank_number, o='"%s"' % self.output)
     self.assertEqual(
         str(cmdline),
         prank_exe + " -d=%s" % self.input + ' -o="%s"' % self.output + " -f=%i" % prank_number + " -convert",
     )
     # The command line object must survive a repr/eval round trip.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     child = subprocess.Popen(
         str(cmdline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform != "win32")
     )
     # communicate() drains both pipes while waiting; wait() alone can
     # deadlock once the child fills a pipe buffer.  It also closes the pipes.
     stdout_data, stderr_data = child.communicate()
     self.assertEqual(child.returncode, 0)
     message = stdout_data.strip()
     # assertTrue replaces the deprecated assert_ alias.
     self.assertTrue(("PRANK: converting '%s' to '%s'" % (self.input, filename)) in message, message)
     self.assertEqual(stderr_data, "")
     self.assertTrue(os.path.isfile(filename))
     with open(self.input) as handle:
         old = AlignIO.read(handle, "fasta")
     # Hack: ids are cut to 10 characters before comparing PHYLIP output.
     if format == "phylip":
         for record in old:
             record.id = record.id[:10]
     with open(filename) as handle:
         new = AlignIO.read(handle, format)
     # A unittest assertion (unlike a bare assert) is not stripped by -O.
     self.assertEqual(len(old), len(new))
     for old_r, new_r in zip(old, new):
         self.assertEqual(old_r.id, new_r.id)
         self.assertEqual(str(old_r.seq), str(new_r.seq))
     os.remove(filename)
Example #23
0
def load_tree(seqfname):
    """Load an alignment, build & prep a tree, return the tree object.

    ``seqfname`` must end in '.aln' (read as a Clustal alignment) or
    '.fasta' (unaligned sequences, aligned first with MAFFT).  The tree is
    built with fasttree from the alignment blocks returned by
    ``alnutils.blocks(aln, 0.4)``.  Clades whose confidence is below the
    mean confidence are collapsed, then the tree is ladderized and the root
    branch length set to 0.0.

    Raises:
        ValueError: if ``seqfname`` has neither of the accepted extensions.
    """
    if seqfname.endswith('.aln'):
        aln = AlignIO.read(seqfname, 'clustal')
    elif seqfname.endswith('.fasta'):
        # Run MAFFT quickly
        alndata = subprocess.check_output(['mafft', '--quiet', '--auto',
                                           seqfname])
        aln = AlignIO.read(StringIO(alndata), 'fasta')
    else:
        raise ValueError("Input sequences must be a Clustal alignment (.aln) "
                         "or unaligned FASTA (.fasta)")

    # Use conserved (less-gappy) blocks to build the tree
    aln = alnutils.blocks(aln, 0.4)
    with tempfile.NamedTemporaryFile(mode='w') as tmp:
        AlignIO.write(aln, tmp, 'fasta')
        # Flush so fasttree sees the complete file on disk.
        tmp.flush()
        treedata = subprocess.check_output(['fasttree',
                                            '-pseudo', '-gamma', '-wag',
                                            tmp.name])
    tree = Phylo.read(StringIO(treedata), 'newick')

    # Collapse weakly supported splits
    confs = [c.confidence
             for c in tree.find_clades()
             if c.confidence is not None]
    # With no confidence values at all there is no mean to compare against
    # (it would be a division by zero), so nothing is collapsed.
    if confs:
        # ENH: accept min_confidence as an option
        min_confidence = math.fsum(confs) / len(confs)
        tree.collapse_all(lambda c: c.confidence < min_confidence)
    tree.ladderize(reverse=True)
    tree.root.branch_length = 0.0
    return tree
Example #24
0
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    """Concatenate matching sequences from ten FASTA alignments.

    For each record of the first alignment, the record with the same strain
    name is looked up in the other nine.  When all nine lookups return a
    truthy match, the ten sequences are joined in file order.

    Returns a list of ``[strain_name, concatenated_sequence]`` pairs.
    """
    paths = (file1, file2, file3, file4, file5,
             file6, file7, file8, file9, file10)
    alignments = [AlignIO.read(path, 'fasta') for path in paths]
    leading, followers = alignments[0], alignments[1:]

    complete_sequences = []
    for lead_record in leading:
        strain_name = util.get_strain_name(lead_record)
        partners = [
            util.get_matching_sequence(other, strain_name=strain_name)
            for other in followers
        ]
        # Strains missing from any of the other alignments are skipped.
        if not all(partners):
            continue
        joined = lead_record.seq
        for partner in partners:
            joined = joined + partner.seq
        complete_sequences.append([util.get_strain_name(lead_record), joined])

    return complete_sequences
Example #25
0
def writing(seqs,seq_descs,seq_ids, filename):
    """Write sequences as a FASTA alignment below the output directory.

    Args:
        seqs: sequence strings.
        seq_descs: description for each sequence, in the same order.
        seq_ids: id for each sequence, in the same order.
        filename: base name of the output file.

    The output directory is taken from ``sys.argv[3]`` and the subfolder
    name from ``sys.argv[1]``; the alignment is written to
    ``<outdir>/<subfolder>/<filename>.output``.
    """
    outdir = sys.argv[3]                    # Output directory
    if os.path.isdir(outdir):
        print("Directory exists. New directory not created")
    else:
        # Directories are created through the os module rather than by
        # pasting command-line text into a shell "mkdir" command.
        os.makedirs(outdir)

    # outpath is the subfolder the results are stored in
    outpath = outdir + '/' + sys.argv[1]
    if not os.path.isdir(outpath):
        os.makedirs(outpath)

    # write the result to output
    align = MultipleSeqAlignment([])
    output_file = outpath + '/' + filename + '.' + 'output'

    for i, seq in enumerate(seqs):
        align.append(SeqRecord(Seq(seq, generic_protein),
                               id=seq_ids[i], description=seq_descs[i]))

    AlignIO.write(align, output_file, "fasta")
Example #26
0
    def __init__(self,aln,treef,cmd=None):
        """Prepare the alignment and tree inputs for a rate4site run.

        Args:
            aln: path to a FASTA alignment file, or an alignment object
                (which is then written to a temporary FASTA file).
            treef: a ``dendropy.Tree`` or a path to a Newick tree file.
            cmd: executable name; defaults to 'rate4site64' when
                ``sys.maxsize`` is 2**63 - 1 and to 'rate4site' otherwise.

        Raises:
            ValueError: if ``treef`` is neither a tree nor an existing file.
        """
        try:
            aln_is_path = os.path.isfile(aln)
        except TypeError:
            # os.path.isfile raises TypeError for objects that are not
            # path-like, e.g. an in-memory alignment.
            aln_is_path = False
        if aln_is_path:
            self.alnfile=aln
            with open(self.alnfile) as handle:
                self.aln = AlignIO.read(handle,'fasta')
        else:
            self.aln=aln
            self.alnfile = tempfile.NamedTemporaryFile()
            AlignIO.write(aln,self.alnfile,'fasta')
            self.alnfile.flush()
        if not cmd:
            import sys
            # sys.maxsize exists on Python 2.6+ and 3; sys.maxint is
            # Python 2 only.
            if sys.maxsize==9223372036854775807: #64 bit
                cmd='rate4site64'
            else:
                cmd='rate4site'
        if isinstance(treef,dendropy.Tree):
            parent_tree=treef
        elif os.path.isfile(treef):
            parent_tree=dendropy.Tree.get_from_path(treef,'newick')
        else:
            # Fail here with a clear message instead of a NameError below.
            raise ValueError("treef must be a dendropy.Tree or the path of "
                             "an existing Newick file: %r" % (treef,))
        self.tree = narrow_tree(parent_tree,self.aln)
        self.treefile = tempfile.NamedTemporaryFile()
        # The first five characters of the Newick string are dropped
        # (presumably dendropy's rooting prefix -- TODO confirm).
        self.treefile.write(self.tree.as_string('newick',internal_labels=False)[5:])
        self.treefile.flush()
        self.cmd=cmd
Example #27
0
File: ks.py Project: ascendo/jcvi
def muscle_align_protein(recs, work_dir, outfmt="fasta", inputorder=True):
    """
    Align given proteins with muscle.

    recs is an iterable of Biopython sequence records.  They are written to
    ``prot-start.fasta`` in work_dir and muscle writes its strict Clustal
    output to ``prot.aln`` there.  With inputorder set, the muscle output
    is reordered via ``muscle_inputorder`` and then re-read as FASTA.

    Returns the alignment formatted as outfmt ("fasta" or "clustal"), an
    empty string if reordering raises ValueError, and None for any other
    outfmt value.
    """
    fasta_file = op.join(work_dir, "prot-start.fasta")
    align_file = op.join(work_dir, "prot.aln")
    # Close the file before muscle runs so all records are on disk.
    with open(fasta_file, "w") as handle:
        SeqIO.write(recs, handle, "fasta")

    muscle_cl = MuscleCommandline(cmd=MUSCLE_BIN("muscle"),
            input=fasta_file, out=align_file, seqtype="protein",
            clwstrict=True)
    muscle_cl()
    alignment = AlignIO.read(muscle_cl.out, "clustal")

    if inputorder:
        try:
            muscle_inputorder(muscle_cl.input, muscle_cl.out)
        except ValueError:
            return ""
        alignment = AlignIO.read(muscle_cl.out, "fasta")

    # sys.stderr.write works on Python 2 and 3, unlike "print >>".
    sys.stderr.write("\tDoing muscle alignment: %s\n" % muscle_cl)
    if outfmt in ("fasta", "clustal"):
        return alignment.format(outfmt)
    return None
def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Check that AlignIO.convert agrees with a parse-then-write round trip.

    The reference output comes from AlignIO.parse plus AlignIO.write.  It
    is compared with AlignIO.convert given the file name, and again given
    an open handle.  A ValueError from any writer counts as zero
    alignments written.
    """
    # Reference result via parse/write
    reference = StringIO()
    aligns = list(AlignIO.parse(in_filename, in_format, None, alphabet))
    try:
        count = AlignIO.write(aligns, reference, out_format)
    except ValueError:
        count = 0

    def same_as_reference(other_count, other_handle):
        assert count == other_count
        assert reference.getvalue() == other_handle.getvalue()

    # convert(), passing filename and handle
    from_name = StringIO()
    try:
        converted = AlignIO.convert(in_filename, in_format, from_name,
                                    out_format, alphabet)
    except ValueError:
        converted = 0
    same_as_reference(converted, from_name)

    # convert(), passing handle and handle
    from_handle = StringIO()
    try:
        with open(in_filename) as source:
            converted = AlignIO.convert(source, in_format, from_handle,
                                        out_format, alphabet)
    except ValueError:
        converted = 0
    same_as_reference(converted, from_handle)
Example #29
0
  def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
    """Drop sequences that contain too much missing data.

    Each record of the FASTA input is kept when the percentage of 'n', 'N'
    and '-' characters (rounded down to a whole number) is at most
    ``filter_percentage``.  Empty sequences are always dropped.  The kept
    records are written as one FASTA alignment to ``output_filename``.
    The ``verbose`` argument is accepted but not used.

    Returns the list of ids of the removed records.  Exits the process
    with an error message when fewer than two records remain.
    """
    output_alignments = []
    taxa_removed = []
    # Both files are opened up front, so the output file is created even
    # when the run ends early; "with" closes them on every exit path.
    # NOTE(review): the 'U' open flag was removed in Python 3.11.
    with open(input_filename, "rU") as input_handle, \
         open(output_filename, "w+") as output_handle:
        for alignment in AlignIO.parse(input_handle, "fasta"):
            for record in alignment:
                number_of_gaps = sum(record.seq.count(symbol)
                                     for symbol in ('n', 'N', '-'))
                sequence_length = len(record.seq)

                if sequence_length == 0:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because there werent enough bases in it")
                    continue

                # Floor division keeps the whole-number percentage that the
                # Python 2 "/" operator produced for integers.
                gap_percentage = number_of_gaps * 100 // sequence_length
                if gap_percentage <= filter_percentage:
                    output_alignments.append(record)
                else:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because it had " + str(gap_percentage) + " percentage gaps while a maximum of " + str(filter_percentage) + " is allowed")

        if len(output_alignments) <= 1:
            sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")

        AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
    return taxa_removed
Example #30
0
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    The alignment is written as relaxed PHYLIP to ``<work_dir>/work/aln.phy``
    and RAxML is run on it inside ``<work_dir>/work/raxml_work``.  The
    ``RAxML_bipartitions.aln`` file it produces is copied to ``outfile`` and
    the RAxML working directory is removed afterwards.

    Args:
        alignment: alignment object accepted by ``AlignIO.write``.
        outfile: destination path for the resulting tree file.
        work_dir: directory under which the ``work`` folder is created.
        **kwargs: extra options forwarded to ``RaxmlCommandline``.

    Returns:
        ``(outfile, phy_file)`` on success, ``None`` when RAxML did not
        produce the expected tree file.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # Close (and therefore flush) the PHYLIP file before RAxML reads it.
    # open() replaces the Python 2 only file() builtin.
    with open(phy_file, "w") as phy_handle:
        AlignIO.write(alignment, phy_handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"), \
        sequences=phy_file, algorithm="a", model="GTRGAMMA", \
        parsimony_seed=12345, rapid_bootstrap_seed=12345, \
        num_replicates=100, name="aln", \
        working_dir=raxml_work, **kwargs)

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    # The captured stdout/stderr are not needed; success is judged by the
    # presence of the tree file below.
    raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
Example #31
0
    def test_read_write_clustal(self):
        """Check parsing of the opuntia Clustal file and the AlignInfo summaries of it.

        The alignment is read from ``Clustalw/opuntia.aln`` below the current
        working directory.  The test then pins, in this order: the seven
        records (description and gapped sequence), the alignment length, the
        dumb consensus, the replacement dictionary, three position specific
        score matrices, several information content values and the text
        written by ``print_info_content``.
        """
        path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
        alignment = AlignIO.read(path,
                                 "clustal",
                                 alphabet=Alphabet.Gapped(
                                     IUPAC.unambiguous_dna))
        # --- the seven records, in file order ---
        self.assertEqual(len(alignment), 7)
        seq_record = alignment[0]
        self.assertEqual(seq_record.description,
                         "gi|6273285|gb|AF191659.1|AF191")
        # Only this first record is compared against a Seq object; the others
        # are compared against plain strings.
        self.assertEqual(
            seq_record.seq,
            Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA"
                ))
        seq_record = alignment[1]
        self.assertEqual(seq_record.description,
                         "gi|6273284|gb|AF191658.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[2]
        self.assertEqual(seq_record.description,
                         "gi|6273287|gb|AF191661.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[3]
        self.assertEqual(seq_record.description,
                         "gi|6273286|gb|AF191660.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[4]
        self.assertEqual(seq_record.description,
                         "gi|6273290|gb|AF191664.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[5]
        self.assertEqual(seq_record.description,
                         "gi|6273289|gb|AF191663.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA"
        )
        seq_record = alignment[6]
        self.assertEqual(seq_record.description,
                         "gi|6273291|gb|AF191665.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        # --- summary information: length, consensus, replacement counts ---
        self.assertEqual(alignment.get_alignment_length(), 156)
        align_info = AlignInfo.SummaryInfo(alignment)
        consensus = align_info.dumb_consensus()
        self.assertIsInstance(consensus, Seq)
        self.assertEqual(
            consensus,
            "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        # 16 entries expected: every ordered pair of the letters A, C, G and T.
        dictionary = align_info.replacement_dictionary(["N"])
        self.assertEqual(len(dictionary), 16)
        self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
        self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
        self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
        self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1)
        self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1)
        self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1)
        self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1)
        # --- position specific score matrix, rows labelled by the consensus ---
        matrix = align_info.pos_specific_score_matrix(consensus, ["N"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
X  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
T  0.0 0.0 0.0 3.0
A  3.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
X  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")

        # --- same matrix requested without an explicit axis sequence ---
        matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
X  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
T  0.0 0.0 0.0 3.0
A  3.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
X  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")

        # --- matrix with rows labelled by the second record (gaps included) ---
        second_seq = alignment[1].seq
        matrix = align_info.pos_specific_score_matrix(second_seq, ["N"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
-  0.0 0.0 0.0 3.0
-  3.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")
        # --- information content: a column range, then all columns, then all
        # columns again with an explicit uniform expected-frequency table ---
        value = align_info.information_content(5, 50, chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 88.42, places=2)
        value = align_info.information_content(chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 287.55, places=2)
        e_freq = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25}
        e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ,
                                           IUPAC.unambiguous_dna)
        value = align_info.information_content(e_freq_table=e_freq_table,
                                               chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 287.55, places=2)
        # Spot-check two columns and their entries in ic_vector.
        self.assertEqual(align_info.get_column(1), "AAAAAAA")
        self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2)
        self.assertEqual(align_info.get_column(7), "TTTATTT")
        self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2)
        # --- text report: one "<index> <letter> <value>" line per column ---
        handle = StringIO()
        AlignInfo.print_info_content(align_info, fout=handle)
        self.assertEqual(
            handle.getvalue(), """\
0 T 2.000
1 A 2.000
2 T 2.000
3 A 2.000
4 C 2.000
5 A 2.000
6 T 2.000
7 T 1.408
8 A 2.000
9 A 2.000
10 A 2.000
11 G 2.000
12 A 1.015
13 A 2.000
14 G 2.000
15 G 2.000
16 G 2.000
17 G 2.000
18 G 2.000
19 A 2.000
20 T 2.000
21 G 2.000
22 C 2.000
23 G 2.000
24 G 2.000
25 A 2.000
26 T 2.000
27 A 2.000
28 A 2.000
29 A 2.000
30 T 2.000
31 G 2.000
32 G 2.000
33 A 2.000
34 A 2.000
35 A 2.000
36 G 2.000
37 G 2.000
38 C 2.000
39 G 2.000
40 A 2.000
41 A 2.000
42 A 2.000
43 G 2.000
44 A 2.000
45 A 2.000
46 A 2.000
47 G 2.000
48 A 2.000
49 A 2.000
50 T 2.000
51 A 2.000
52 T 2.000
53 A 2.000
54 T 2.000
55 A 2.000
56 - 0.682
57 - 0.682
58 - 0.333
59 - 0.333
60 - -0.115
61 - -0.115
62 - -0.115
63 - -0.115
64 - -0.115
65 - -0.115
66 A 2.000
67 T 2.000
68 A 2.000
69 T 2.000
70 A 2.000
71 T 2.000
72 T 2.000
73 T 2.000
74 C 1.408
75 A 1.408
76 A 2.000
77 A 2.000
78 T 2.000
79 T 2.000
80 T 1.015
81 C 2.000
82 C 2.000
83 T 2.000
84 T 2.000
85 A 2.000
86 T 2.000
87 A 2.000
88 T 2.000
89 A 2.000
90 C 1.137
91 C 2.000
92 C 2.000
93 A 2.000
94 A 2.000
95 A 2.000
96 T 2.000
97 A 2.000
98 T 2.000
99 A 2.000
100 A 2.000
101 A 2.000
102 A 2.000
103 A 2.000
104 T 2.000
105 A 2.000
106 T 2.000
107 C 2.000
108 T 2.000
109 A 2.000
110 A 2.000
111 T 2.000
112 A 2.000
113 A 2.000
114 A 2.000
115 T 2.000
116 T 2.000
117 A 2.000
118 G 2.000
119 A 2.000
120 T 2.000
121 G 2.000
122 A 2.000
123 A 2.000
124 T 2.000
125 A 2.000
126 T 2.000
127 C 2.000
128 A 2.000
129 A 2.000
130 A 2.000
131 G 2.000
132 A 2.000
133 A 2.000
134 T 2.000
135 C 2.000
136 C 1.408
137 A 2.000
138 T 2.000
139 T 2.000
140 G 2.000
141 A 2.000
142 T 2.000
143 T 2.000
144 T 2.000
145 A 2.000
146 G 2.000
147 T 2.000
148 G 1.408
149 T 2.000
150 A 2.000
151 C 2.000
152 C 2.000
153 A 2.000
154 G 2.000
155 A 2.000
""")
 def read_gene(self, gene):
     """Read ``<self.path>/binning.fulltree/<gene>/<gene>.fasta`` and return it passed through ``ali_upper``."""
     fasta_name = gene + '.fasta'
     fasta_path = os.path.join(self.path, 'binning.fulltree', gene, fasta_name)
     gene_alignment = AlignIO.read(fasta_path, 'fasta')
     return ali_upper(gene_alignment)
 def save_atree(self, t, a, gene):
     """Save tree ``t`` and alignment ``a`` of ``gene`` below ``<self.path>/genes``.

     The tree is written to ``<gene>.nwk`` in newick schema through
     ``t.write`` and the alignment to ``<gene>.fasta`` in FASTA format
     through ``AlignIO.write``.
     """
     bfn = os.path.join(self.path, 'genes', gene)
     t.write(path=bfn + '.nwk', unquoted_underscores=True, schema='newick')
     # ``with`` closes the FASTA handle even when AlignIO.write raises.
     with open(bfn + '.fasta', 'w') as f:
         AlignIO.write(a, f, 'fasta')
Example #34
0
from Bio import AlignIO
# Load the Stockholm alignment and report how many columns it has.
alignment = AlignIO.read("PF05371_seed.sth", "stockholm")
alignment_length = alignment.get_alignment_length()
print("Aligment length %i" % alignment_length)

# One line per record: the aligned sequence followed by the record id.
for record in alignment:
    line = "%s - %s" % (record.seq, record.id)
    print(line)

print("Luz Maria Rosas Salcedo")

Example #35
0
    from Bio import AlignIO
    import re
    from common import is_valid_file

    parser = argparse.ArgumentParser(
        description='Run Phylip bootstrap analysis on protein alignments')
    parser.add_argument('file',
                        type=lambda f: is_valid_file(f, parser),
                        help='Protein alignment file in FASTA format')
    parser.add_argument('num_replicates',
                        help='Number of bootstrap replicates to run')
    args = parser.parse_args()

    short_acc_to_long_acc = {}

    alignment = AlignIO.read(args.file, 'fasta')
    tempdir = tempfile.TemporaryDirectory()
    os.chdir(tempdir.name)
    infile = tempfile.NamedTemporaryFile(mode='w+t')
    # Remove zero padding
    for align in alignment:
        compressed_accession = re.sub(r'(?<=\D)0+', '', align.id)
        short_acc_to_long_acc[compressed_accession[:10]] = align.id
        align.id = compressed_accession
    AlignIO.write(alignment, infile, 'phylip')
    infile.seek(0)

    seqboot = pexpect.spawn('seqboot', encoding='utf-8')
    seqboot.expect(str('Please enter a new file name'))
    seqboot.sendline(infile.name)
    seqboot.expect('Y to accept')
Example #36
0
# @Project: M_BioPy
# @Last modified time: 2019-04-15T11:53:16+08:00
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO

# Every input and output file lives in the folder that holds the ClustalW binary
base = r"C:\Users\Nature\Desktop\M_BioPy\exp\material\BioPy_exp4"
clustalw_exe = base + r"\clustalw2.exe"
in_file = base + r"\inputFasta"
out_file = base + r"\OutFasta.aln"
annotationOutFile = base + r"\OutAnnotation.txt"
newAlignmentFile = base + r"\OutFasta2.aln"

# Run ClustalW on the input sequences
clustalw_cline = ClustalwCommandline(clustalw_exe, infile=in_file, outfile=out_file)
clustalw_cline()

# Parse the alignment ClustalW has just written
alignment = AlignIO.read(out_file, "clustal")

# Dump the text form of every aligned record, each followed by a blank line
with open(annotationOutFile, "wt") as outfile:
    outfile.writelines(str(record) + "\n\n" for record in alignment)

# Join the first ten and the last ten columns and save them as a new alignment
first_columns = alignment[:, :10]
last_columns = alignment[:, -10:]
newAlignment = first_columns + last_columns
AlignIO.write(newAlignment, newAlignmentFile, "clustal")
Example #37
0
 def test_format_conversion(self):
     """Check that the parsed opuntia alignment formats to the expected FASTA and Clustal text."""
     clustal_path = os.path.join(os.curdir, "Clustalw", "opuntia.aln")
     alignment = AlignIO.read(clustal_path, "clustal")
     expected_by_format = (
         ("fasta", opuntia_fasta),
         ("clustal", opuntia_clustal),
     )
     for format_name, expected_text in expected_by_format:
         self.assertEqual(format(alignment, format_name), expected_text)
Example #38
0
 def test_read_clustal1(self):
     """Check that opuntia.aln, once parsed, formats back to the expected Clustal text."""
     clustal_path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
     parsed = AlignIO.read(clustal_path, "clustal")
     rendered = format(parsed, "clustal")
     self.assertEqual(rendered, opuntia_clustal)
Example #39
0
    records = []
    for i, seq in enumerate(seqs):
        seqObj = Seq(seq, IUPAC.protein)
        name = 'test%d' % i
        recordObj = SeqRecord(seqObj, id=name, description='demo only')
        records.append(recordObj)

    outFileObj = open(fastaFileName, "w")
    SeqIO.write(records, outFileObj, "fasta")
    outFileObj.close()

    cmdArgs = [
        'clustalw', '-INFILE=' + fastaFileName, '-OUTFILE=' + alignFileName
    ]
    call(cmdArgs)

    fileObj = open(alignFileName)
    alignment = AlignIO.read(fileObj, "clustal")

    print('\nClustalW alignment\n')

    print("Alignment length %i" % alignment.get_alignment_length())
    for record in alignment:
        print(record.seq, record.id)

    alignments = [
        alignment,
    ]
    outputHandle = open("test2.phylip", "w")
    AlignIO.write(alignments, outputHandle, "phylip")
Example #40
0
 def test_read_clustal2(self):
     """Check that cw02.aln, once parsed, formats back to the expected Clustal text."""
     clustal_path = os.path.join(os.curdir, "Clustalw", "cw02.aln")
     parsed = AlignIO.read(clustal_path, "clustal")
     rendered = format(parsed, "clustal")
     self.assertEqual(rendered, cw02_clustal)
Example #41
0
            i += 1
    return align


def cut_gap_in_blocks(align=None):  # useful for the future, not for this script
    """Delete runs of consecutive gap-rich columns from an alignment.

    The columns are scanned from left to right.  Each maximal run of columns
    whose gap score exceeds 0.5 is removed in a single slicing step.

    Two defects of the previous version are fixed: ``align`` was an unbound
    local (every call raised ``UnboundLocalError``), so it is now an argument,
    and the ``return`` sat inside the loop, ending the scan after one step.

    Args:
        align: alignment supporting ``align[0]``, column access
            ``align[:, i]``, column slicing ``align[:, a:b]`` and ``+``.

    Returns:
        The alignment without the gap-rich columns.

    Raises:
        TypeError: when no alignment is given.
    """
    if align is None:
        raise TypeError("cut_gap_in_blocks() requires the alignment to trim")

    n = float(len(align[0]))  # number of columns still present
    i = 0
    while i < n:
        # Measure the run of gap-rich columns that starts at column i.
        # NOTE(review): the gap count is divided by the number of columns
        # (n), as in the original; dividing by the number of sequences
        # (len(align)) looks like the intent - confirm before relying on it.
        ct = 0
        while i + ct < n and align[:, i + ct].count('-') / n > 0.5:
            ct += 1

        if ct > 0:  # delete columns [i:i+ct]
            if i == 0:
                align = align[:, ct:]
            elif i + ct == n:
                align = align[:, :i]
            else:
                align = align[:, :i] + align[:, i + ct:]
            n -= ct  # seq. ct positions shorter
        else:  # nothing to delete, proceed
            i += 1
    return align


# Read the Stockholm alignment stored in "in.txt".
alignment = AlignIO.read("in.txt", "stockholm")
# print(alignment)
# Pass it through cut_a_gap, then save the result to "out.txt" in Clustal format.
out = cut_a_gap(alignment)

SeqIO.write(out, "out.txt", "clustal")
                            opt_file,
                            a_ratio=1.0,
                            m_ratio=1.0):

    m_base = _type[0]
    a_base = _type[1]

    opt = open(opt_file, 'w')
    for i in range(len(a_aln[1, :])):
        col_a_group_nogap = [i for i in a_aln[:, i]]
        col_m_group_nogap = [i for i in m_aln[:, i] if i != '-']
        a_ratio_est = col_a_group_nogap.count(a_base) / len(col_a_group_nogap)
        m_ratio_est = col_m_group_nogap.count(m_base) / correct_empty_lst(
            col_m_group_nogap)
        if (a_ratio_est >= a_ratio) and (m_ratio_est >= m_ratio):
            opt.write(str(i + 1) + '\n')
    opt.close()


if __name__ == "__main__":

    # Read the options, load the FASTA alignment and split it into two groups.
    pars = read_args(sys.argv)
    aln_ipt = AlignIO.read(pars['ipt_msa'], 'fasta')
    # Partition once and reuse the result instead of calling aln_partition
    # twice on the same input.
    # NOTE(review): assumes aln_partition returns the same result on each call - confirm.
    partitions = aln_partition(aln_ipt)
    a_aln_obj = partitions[0]
    m_aln_obj = partitions[1]

    # One scan per two-letter type, each writing its positions to its own file.
    suspicious_pos_detector(a_aln_obj, m_aln_obj, 'CT', pars['output_CT'],
                            pars['ancient_ratio'], pars['modern_ratio'])
    suspicious_pos_detector(a_aln_obj, m_aln_obj, 'GA', pars['output_GA'],
                            pars['ancient_ratio'], pars['modern_ratio'])
Example #43
0
def run_tree(fname,
             out_prefix,
             alphabet,
             true_tree=False,
             true_model=False,
             pc=0.1,
             true_rates=False):
    """
    Read a tree and an alignment and optimize the branch lengths using different types of models.
    The user can specify to either use the true model for optimization, just the true rates, or
    infer the entire model from the data. Either the true or an inferred tree topology can
    be used.

    Args:
        fname: alignment file name; parsed by ``parse_alignment_name`` into the
            parameter dict, and its directory is used as the input prefix.
        out_prefix: prefix handed to the helpers that name the output tree file.
        alphabet: alphabet passed on to ``TreeAnc``.
        true_tree: read the tree named by ``tree_name`` instead of the one named
            by ``reconstructed_tree_name``, and rescale/randomize its branch lengths.
        true_model: optimize with the loaded GTR model instead of starting from 'JC69'.
        pc: value stored in the parameters and passed as ``pc`` to ``optimize_tree``.
        true_rates: after the first optimization, set the model rate from the
            loaded model and optimize once more without inferring a GTR model.

    The number of iterations comes from the module-level name ``n_iter``.
    The re-optimized tree is written in newick format; nothing is returned.
    """
    params = parse_alignment_name(fname)
    params['pc'] = pc
    prefix = os.path.dirname(fname)
    m = params['m']
    tree_path = (tree_name(prefix, params)
                 if true_tree else reconstructed_tree_name(prefix, params))
    tree = Phylo.read(tree_path, 'newick')
    tree.root.branch_length = 0.001

    old_bl = []
    print(
        np.mean([x for c, x in tree.depths().items() if c.is_terminal()]) *
        (m if true_tree else 1.0))
    print(tree.root.clades[0].branch_length /
          tree.root.clades[1].branch_length)

    # randomize branch length of true tree to allow fair comparison
    for n in tree.find_clades():
        old_bl.append(n.branch_length)
        if true_tree:
            # rescale with mutation rate and multiply by a random number between 0.6 and 1.0
            n.branch_length *= m * (0.6 + 0.4 * np.random.random())

    print(np.sum(old_bl) * (m if true_tree else 1.0), m)
    # load true GTR model. Use this for inference if true_model=True, else start with Jukes Cantor
    true_GTR = load_model(model_name(prefix, params))
    if true_model:
        # ``model`` is the same object as ``true_GTR``, so this also rescales
        # ``true_GTR.mu``.
        # NOTE(review): with true_model and true_rates both set, the rate is
        # divided by m a second time below - confirm this is intended.
        model = true_GTR
        model.mu /= m
    else:
        model = 'JC69'

    with gzip.open(alignment_name(prefix, params), 'rt') as fh:
        aln = AlignIO.read(fh, 'fasta')

    tt = TreeAnc(tree=tree,
                 aln=aln,
                 gtr=model,
                 compress=False,
                 alphabet=alphabet,
                 verbose=3)

    # run the tree optimization of treetime. the damping parameter slows down the iterative
    # branch length optimization to avoid oscillations and run-away solutions
    # a site-specific GTR model is inferred if true_model is False
    tt.optimize_tree(branch_length_mode='marginal',
                     max_iter=n_iter,
                     infer_gtr=not true_model,
                     site_specific_gtr=True,
                     pc=pc,
                     damping=0.75)

    # if the true rates are to be used, replace those in the model and re-optimize
    if true_rates:
        tt.gtr.mu = true_GTR.mu / m
        tt.optimize_tree(branch_length_mode='marginal',
                         max_iter=n_iter,
                         infer_gtr=False,
                         site_specific_gtr=True,
                         pc=pc,
                         damping=0.75)

    new_bl = []
    for n in tt.tree.find_clades():
        new_bl.append(n.branch_length)

    # save new tree to file
    tt.tree.root_at_midpoint()
    # Choose the output name from this call's ``true_model`` argument; the
    # previous code read the global ``args.true_model`` here, which can
    # disagree with the argument the optimization above was run with.
    if true_model:
        tfname = reoptimized_tree_true_model(out_prefix, params)
    else:
        tfname = reoptimized_tree(out_prefix, params, true_rates=true_rates)
    Phylo.write(tt.tree, tfname, 'newick')

    print(tt.tree.total_branch_length(), tt.gtr.average_rate().mean())
    print(np.mean([x for c, x in tt.tree.depths().items() if c.is_terminal()]),
          tt.tree.total_branch_length())
    print(tt.tree.root.clades[0].branch_length /
          tt.tree.root.clades[1].branch_length)
    print(np.corrcoef(old_bl, new_bl)[0, 1])
# Check for unique ref names
# sys.exit is used rather than the exit() helper, which the site module only
# provides for interactive sessions.
if args.ref1name == args.ref2name:
    print('You cannot specify the same reference twice. Quitting.',
          file=sys.stderr)
    sys.exit(1)

# No commas in reference names
if ',' in args.ref1name or ',' in args.ref2name:
    print('Reference names may not contain commas. Rename the sequences in',
          args.alignment,
          'and try again. Quitting.',
          file=sys.stderr)
    sys.exit(1)

# Find the consensus and its ref
alignment = AlignIO.read(args.alignment, "fasta")
ref1seq = None
ref2seq = None
for seq in alignment:
    if seq.id == args.ref1name:
        if ref1seq != None:
            print('Found',
                  args.ref1name,
                  'twice in',
                  args.alignment + '. Quitting.',
                  file=sys.stderr)
            quit(1)
        ref1seq = str(seq.seq)
    if seq.id == args.ref2name:
        if ref2seq != None:
            print('Found',
Example #45
0
    "P8": [('13984498', '14019652')]
}

#List of species names in the MAF file
species_list = ["panPan_Y", "panTro_Y", "gorGor_Y", "ponAbe_Y", "hg_Y"]

#all Y chromosome alignment file.
#/nfs/brubeck.bx.psu.edu/scratch6/rahul/Bonobo_Y/analysis/palindrome/palindrome_coverage/multi_alignment_based
input_handle = open("msa/alignment.hg_Y_centric.20191126.maf", "rU")
output_handle = open(inputP + "_sizeNonRepeatBlock.tab", "w")

#Obtaining the palindrome of interest
Palindrome = Palindrome_coordinates[inputP]

#File handle of MAF
alignments = AlignIO.parse(input_handle, "maf")

#Block overlap cutoff

percent_cutoff = 0.95
pal_size = (int(Palindrome[0][1]) - int(Palindrome[0][0])) + 1
sequence_Bon = [0] * pal_size
sequence_Gor = [0] * pal_size
sequence_Orang = [0] * pal_size
sequence_Chimp = [0] * pal_size
#sequence_Human=[0]*pal_size
#Reading each block in the MAF file
for msa in alignments:
    for region in Palindrome:  #Obtain the location of palindromes
        region_start = region[0]
        region_end = region[1]
Example #46
0
from Bio import AlignIO
import glob
import statistics
#####################################################################
#here the actual program begins, i'm storing the file names and
#annotations in their respective lists
#####################################################################
# Parallel lists: entry i of every list describes the same *.align file.
nomes = []
sizes = []
scores = []
identities = []
similarities = []
gaps = []
for file in glob.glob("*.align"):
    # Close each handle as soon as the alignment has been parsed instead of
    # leaving one open file per input.
    with open(file) as handle:
        current = AlignIO.read(handle, "emboss")
    size = current.get_alignment_length()
    # append() extends the lists in place; rebuilding them with "+" copied
    # every list on every iteration.
    nomes.append(file)
    scores.append(current.annotations["score"])
    identities.append(current.annotations["identity"] / size * 100)
    similarities.append(current.annotations["similarity"] / size * 100)
    gaps.append(current.annotations["gaps"] / size * 100)  # stored as percentages
    sizes.append(size)
#####################################################################
#now i have to take the elements of these lists and make decrescent
#sorted ones (by the scores)
#####################################################################
nomes2 = []
scores2 = []
Example #47
0
        elif "NM_031542.2" == line.id:
            record = SeqRecord(line.seq, "Rat")
            fixed_sequences.append(record)
        elif "NM_204276.2" == line.id:
            record = SeqRecord(line.seq, "Chicken")
            fixed_sequences.append(record)

    SeqIO.write(fixed_sequences, output_handle, "fasta")

input_handle.close()
output_handle.close()


# Convert the fixed FASTA alignment to PHYLIP format for the program
from Bio import AlignIO
AlignIO.convert("BRCA2_family_fixed.fasta", "fasta", "BRCA2_family.phy", "phylip")

# Read the sequences and align
aln = AlignIO.read('BRCA2_family.phy', 'phylip')

# create a starting tree with NJ
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)
constructor = DistanceTreeConstructor(calculator, 'nj')
starting_tree = constructor.build_tree(aln)


# A substitution cost matrix, taken from the in-lecture exercise (penalty of 2 for a transversion or a gap,
# penalty of 1 for a transition)
cost_matrix = [[0],
            [2,0],
Example #48
0
    Tree_names[i]: Genomes_names[i]
    for i in range(0, len(Genomes_names))
}
# Pair every reference species with its tag, position by position.
Reference_species = Ref_tags['Species'].to_list()
Reference_idents = Ref_tags['Tag'].to_list()
Refs_tag_dict = dict(zip(Reference_species, Reference_idents))
"""
FUNCTIONS
"""

# Maps the id of every record without "PD" in its id to the list of "PD"
# record ids that were assigned to it.
refs_dict = {}

alignment_obj = AlignIO.read(cds_alignment, "clustal")
for record in alignment_obj:
    curr = record.id
    if "PD" not in curr:
        # A record without "PD" in its id opens a new, initially empty group.
        refs_dict[curr] = []
        continue

    # Reset for every record: otherwise a record whose id matches no tag
    # would silently reuse the genome found for an earlier record (or raise
    # NameError if it happened to be the first "PD" record).
    genome = None
    for key1, value1 in Refs_tag_dict.items():
        if curr.startswith(str(value1)):
            genome = key1
    if genome is None:
        # No tag is a prefix of this id, so there is no group to add it to.
        continue
    for key2, value2 in Species_tag_dict.items():
        if value2 == genome:
            refs_dict[key2].append(curr)

#Let's iterate and take the human sequence as our reference
human_aligned_cds = [
    record.seq for record in alignment_obj if record.id == "Homo_sapiens"
Example #49
0
def write(sequences, handle, format):
    """Write a complete set of sequences to a file and return the count.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or (if using
       Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError when ``format`` is not a string or when ``handle`` is a
    SeqRecord or a list, and ValueError when ``format`` is empty, not lower
    case, read-only or unknown.
    """
    from Bio import AlignIO

    # Validate the format argument up front so mistakes give clear messages.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Catch records passed in the handle position.
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    # A lone record is wrapped in a one-element list
    # (this raised an exception in older versions of Biopython).
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as stream:
        if format in _FormatToWriter:
            # A dedicated sequence writer exists for this format.
            count = _FormatToWriter[format](stream).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Alignment-only format: bundle all the records into a single
            # alignment and let Bio.AlignIO write it.
            msa = MultipleSeqAlignment(sequences)
            written = AlignIO.write([msa], stream, format)
            assert written == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %r" % written
            count = len(msa)
            del written, msa
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %r" \
            % (format, count)

    return count
Example #50
0
 def __init__(self, file):
     """Read a FASTA alignment from ``file`` and keep it on the instance.

     ``file`` is passed unchanged to ``AlignIO.read``; the result is stored
     in the private ``__alignment`` attribute.
     """
     self.__alignment = AlignIO.read(file, "fasta")
# Positional command-line arguments 3-5: a numeric threshold, an input file
# name and a column name, all forwarded to getActives() below.
inactiveThreshold = float(sys.argv[3])
activityInfile = sys.argv[4]
activityColumn = sys.argv[5]

# Optional sixth positional argument.
# NOTE(review): treeFile is only assigned here when that argument is present;
# unless it is initialised earlier in the script, later reads of treeFile
# raise NameError when the argument is omitted -- confirm.
if len(sys.argv) > 6:
    treeFile = sys.argv[6]

substMatrices = Bio.SubsMat.MatrixInfo.available_matrices
# The two calls differ only in threshold and comparison operator;
# presumably operator.lt selects values below activeThreshold and operator.gt
# values above inactiveThreshold -- verify against getActives().
actives = getActives(activeThreshold, activityInfile, activityColumn,
                     operator.lt)
inactives = getActives(inactiveThreshold, activityInfile, activityColumn,
                       operator.gt)

if not treeFile:
    from Bio import AlignIO
    pocketAlignment = AlignIO.read(open(pocketAlignmentFile), "fasta")
    print("Calculating Distance Matrix")
    for substMat in substMatrices:
        try:
            alnFile = pocketAlignmentFile.split("/")[-1]
            fname = "autotree/" + substMat + "_" + alnFile + ".pxml"
            print(fname)
            if (os.path.exists(fname)):
                tree = Phylo.read(fname, "phyloxml")
            else:
                calculator = DistanceCalculator(substMat)
                dm = calculator.get_distance(pocketAlignment)
                print("Building tree")
                constructor = DistanceTreeConstructor(calculator, 'nj')
                tree = constructor.build_tree(pocketAlignment)
                Phylo.write(tree, fname, "phyloxml")
Example #52
0
def parse(handle, format, alphabet=None):
    r"""Turns a sequence file into an iterator returning SeqRecords.

    Arguments:
     - handle   - handle to the file, or the filename as a string
       (note older versions of Biopython only took a handle).
     - format   - lower case string describing the file format.
     - alphabet - optional Alphabet object, useful when the sequence type
       cannot be automatically inferred from the file itself
       (e.g. format="fasta" or "tab")

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from Bio import SeqIO
    >>> from Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.

    Because this function is a generator, the argument checks below (which
    raise TypeError or ValueError) only run once iteration starts.
    """
    # NOTE - The above docstring has some raw \n characters needed
    # for the StringIO example, hence the whole docstring is in raw
    # string mode (see the leading r before the opening quote).
    import sys

    from Bio import AlignIO

    # Hack for SFF, will need to make this more general in future
    if format in _BinaryFormats:
        mode = 'rb'
    elif sys.version_info[0] >= 3:
        # Text mode already translates newlines on Python 3, and the 'U'
        # flag is rejected by open() from Python 3.11 onwards.
        mode = 'r'
    else:
        # Python 2 needs 'U' to get universal newline handling.
        mode = 'rU'

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not (isinstance(alphabet, Alphabet)
                                     or isinstance(alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %r" % alphabet)

    with as_handle(handle, mode) as fp:
        # Map the file format to a sequence iterator:
        if format in _FormatToIterator:
            iterator_generator = _FormatToIterator[format]
            if alphabet is None:
                i = iterator_generator(fp)
            else:
                try:
                    i = iterator_generator(fp, alphabet=alphabet)
                except TypeError:
                    # The iterator takes no alphabet argument, so the
                    # alphabet is applied to its records afterwards.
                    i = _force_alphabet(iterator_generator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            # Use Bio.AlignIO to read in the alignments and flatten them
            # into a single stream of records
            i = (r
                 for alignment in AlignIO.parse(fp, format, alphabet=alphabet)
                 for r in alignment)
        else:
            raise ValueError("Unknown format '%s'" % format)
        # Yield from inside the ``with`` block so the handle stays open for
        # as long as records are being consumed.
        # This imposes some overhead... wait until we drop Python 2.4 to fix it
        for r in i:
            yield r
Example #53
0
if __name__ == "__main__":
    print("Quick test")
    from Bio import AlignIO
    from Bio.Align.Generic import Alignment

    filename = "../../Tests/GFF/multi.fna"
    format = "fasta"
    expected = FreqTable.FreqTable({
        "A": 0.25,
        "G": 0.25,
        "T": 0.25,
        "C": 0.25
    }, FreqTable.FREQ, IUPAC.unambiguous_dna)

    alignment = AlignIO.read(open(filename), format)
    for record in alignment:
        print(record.seq)
    print("=" * alignment.get_alignment_length())

    summary = SummaryInfo(alignment)
    consensus = summary.dumb_consensus(ambiguous="N")
    print(consensus)
    consensus = summary.gap_consensus(ambiguous="N")
    print(consensus)
    print("")
    print(
        summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                          axis_seq=consensus))
    print("")
    # Have a generic alphabet, without a declared gap char, so must tell
Example #54
0
        seq_dict[key].append(
            textwrap.fill(str(records_dict[key].seq),
                          width=60))  # formated in fasta

# Align every sequence group with MAFFT, one FASTA/alignment file pair per key.
alignments = []
for k in seq_dict:
    out_fasta = os.path.join(snakemake.params['TMP_D'], k + '.fa')
    out_aln = os.path.join(snakemake.params['TMP_D'], k + '.aln')
    alignments.append(out_aln)
    with open(out_fasta, 'w') as out_fh:
        out_fh.write('\n'.join(seq_dict[k]))
    aln = MafftCommandline(quiet=True,
                           retree=1,
                           thread=cores,
                           nuc=True,
                           globalpair=True,
                           input=out_fasta)
    # Calling the command-line wrapper runs MAFFT and returns the pair
    # (stdout, stderr). Only stdout is the aligned FASTA, so stderr is kept
    # out of the alignment file.
    aln_stdout, _aln_stderr = aln()
    with open(out_aln, 'w') as out_fh:
        out_fh.write(aln_stdout)

# Concatenate the per-key alignments column-wise. Each alignment is sorted
# first so that rows line up before the alignments are added together.
one_big_aln = AlignIO.read(alignments[0], 'fasta')
one_big_aln.sort()
# Start from the second file: the first one already seeds one_big_aln and
# must not be appended to itself.
for f in alignments[1:]:
    aln = AlignIO.read(f, 'fasta')
    aln.sort()
    one_big_aln = one_big_aln + aln

with open(one_big_aln_f, 'w') as out_fh:
    AlignIO.write(one_big_aln, out_fh, 'fasta')
def main():
    """Filter pairwise MAF files down to collinear same-chromosome blocks.

    For every ``*.axt.chain.prenet.net.axt.maf`` file found under
    ``./ortholog_maf_clean`` the function:

    1. scans the first 200 alignment blocks and, among those whose two
       sequences carry the same chromosome number, takes the block with the
       smallest target start as the initial anchor;
    2. walks all blocks, keeping a block when both sequences are on the same
       chromosome, the target strand annotation equals ``'+1'``, the query
       start is not before the anchor, and the target distance from the
       anchor is below the query distance plus ``max_interval``; every kept
       block becomes the new anchor;
    3. writes the kept blocks to ``ortholog_maf_dm_clean/<file name>``.

    Progress lines and a per-chromosome summary are printed to stdout.
    Chromosome numbers are the trailing digits of the sequence ids; ids
    without trailing digits are reported as ``'NA'`` and never treated as a
    same-chromosome match.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input')
    parser.add_argument('-o', '--output')
    parser.add_argument('-v', dest='verbose', action='store_true')
    # The parsed options are not consulted below; parsing is kept so that
    # --help works and unknown options are still rejected.
    parser.parse_args()

    chrsize = defaultdict(int)  # chromosome -> 'srcSize' of the query sequence
    data = defaultdict(int)  # chromosome -> summed query size of kept blocks
    other = defaultdict(int)  # chromosome -> summed query size of rejected blocks
    outdir = 'ortholog_maf_dm_clean'
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    maf_raw = glob.glob('./ortholog_maf_clean/*.axt.chain.prenet.net.axt.maf')
    s = re.compile(r'(\d+)$')

    def chrom_of(seq_id):
        # Trailing digits of the sequence id, or 'NA' when it has none.
        found = s.search(seq_id)
        return found.group(1) if found else 'NA'

    def same_chrom(chr_a, chr_b):
        # 'NA' never matches anything; calling int('NA') used to abort the run.
        if chr_a == 'NA' or chr_b == 'NA':
            return False
        return int(chr_a) == int(chr_b)

    def rate(part, whole):
        # Fraction part/whole, reported as 0.0 instead of dividing by zero.
        whole = float(whole)
        return float(part) / whole if whole else 0.0

    print('Clean_maf\tNumOfAlign')
    for maf in sorted(maf_raw):
        aligns = AlignIO.parse(maf, 'maf')
        aligns_new = '%s/%s' % (outdir, os.path.split(maf)[-1])
        aligns_clean = []

        # determine start point of orthologous region on chromosome
        sample = 200
        start_q_0 = 20000000
        start_t_0 = 20000000
        for seen, align in enumerate(aligns):
            if seen >= sample:
                break
            chr_qry = chrom_of(align[0].id)
            chr_tar = chrom_of(align[1].id)
            start_q = align[0].annotations['start']
            start_t = align[1].annotations['start']
            if same_chrom(chr_qry, chr_tar) and int(start_t) < start_t_0:
                start_t_0 = int(start_t)
                start_q_0 = int(start_q)
        print('%s %s' % (start_q_0, start_t_0))

        # dynamic programming of identification of orthologous alignment
        # A block is accepted while the target moved less than the query
        # distance plus this slack (4,000,000 positions).
        max_interval = 4000000
        # Re-open the file: the sampling pass above consumed the iterator.
        aligns = AlignIO.parse(maf, 'maf')
        flag = 0  # allowed for backward
        for align in aligns:
            chr_qry = chrom_of(align[0].id)
            chr_tar = chrom_of(align[1].id)
            start_q = align[0].annotations['start']
            start_t = align[1].annotations['start']
            size_q = align[0].annotations['size']
            size_t = align[1].annotations['size']
            strand = align[1].annotations['strand']
            if chr_qry not in chrsize:
                chrsize[chr_qry] = align[0].annotations['srcSize']
            print('>%s\t%s\t%s\t%s\t%s' % (chr_qry, chr_tar, start_q, start_t,
                                           strand))

            if same_chrom(chr_qry, chr_tar) and strand == '+1' and int(
                    start_q) >= int(start_q_0):
                ref_d = abs(int(start_q) - int(start_q_0))
                tar_d = abs(int(start_t) - int(start_t_0))
                # orthologous alignment
                print('%s\t%s\t%s\t%s\t%s' % (chr_qry, chr_tar, start_q,
                                              start_t, flag))
                # small step, not allowed to backward
                if flag == 0:
                    if tar_d < ref_d + max_interval:
                        aligns_clean.append(align)
                        data[chr_qry] += size_q
                        start_q_0 = int(start_q)
                        start_t_0 = int(start_t)
                        flag = update_flag(int(flag), int(ref_d), int(tar_d))
                        print('flag0 yes')
                    else:
                        print('flag0 no')
                        other[chr_qry] += size_q
                # previous step is large, allowed for backward
                elif flag == 1:
                    if tar_d < ref_d + max_interval:
                        aligns_clean.append(align)
                        data[chr_qry] += size_q
                        # A backward step on the target resets the flag.
                        if int(start_t) < int(start_t_0):
                            flag = 0
                        start_q_0 = int(start_q)
                        start_t_0 = int(start_t)
                        print('flag1 yes')
                    else:
                        print('flag1 no')
                        other[chr_qry] += size_q
            # not orthologous alignment
            else:
                other[chr_qry] += size_q

        count = AlignIO.write(aligns_clean, aligns_new, 'maf')
        print('%s %s' % (aligns_new, count))
    print('Chr\tSize\tAlignedSize\tAlignedRate\tRawAlignedRate')
    total = 0
    aligned = 0
    aligned_o = 0
    for c in sorted(chrsize.keys()):
        print('Chr%s\t%s\t%s\t%s\t%s' % (
            c, chrsize[c], data[c], rate(data[c], chrsize[c]),
            rate(float(data[c]) + float(other[c]), chrsize[c])))
        total += int(chrsize[c])
        aligned += int(data[c])
        aligned_o += int(other[c])
    print('Total\t%s\t%s\t%s\t%s' % (
        total, aligned, rate(aligned, total),
        rate(float(aligned) + float(aligned_o), total)))
Example #56
0
from Bio.Alphabet import generic_dna
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq

# Four equally long DNA sequences, one record per oak species.
align = MultipleSeqAlignment([
    SeqRecord(Seq(bases, generic_dna), id=species)
    for species, bases in (
        ('quercus_robur', 'ACGTACGTACGTACGT'),
        ('quercus_ilex', 'ACGTCCGTACTTACGA'),
        ('quercus_rubra', 'CCGTCCGGACATACGA'),
        ('quercus_macrocarpa', 'AGGTCAGTACTTGCGA'),
    )
])
# ---- Begin exercise answer ----

from Bio import AlignIO

# Save the alignment in PHYLIP format ...
AlignIO.write(align, 'example_alignment.phylip', 'phylip')

# ... and load it back. Bio.AlignIO.read is for files holding a single
# alignment; Bio.AlignIO.parse is for files holding several.
test_alignment_read = AlignIO.read('example_alignment.phylip', 'phylip')

print(test_alignment_read)
print('End: Q6')

# 7. Write a function that BLAST-searches a given sequence in GenBank, prints out how many
# matches passed a certain threshold, and returns the blast search results.


def blastSearch(sequence_record, less_than_threshold):
    # Import required packages
    from Bio.Seq import Seq
def write_AlignIO_protein():
    """Convert hedgehog.aln to a phylip file.

    Raises AssertionError if the conversion does not report exactly one
    alignment.
    """
    # Run the conversion outside the assert statement: asserts are stripped
    # under ``python -O``, which would otherwise skip writing the file.
    count = AlignIO.convert("Clustalw/hedgehog.aln", "clustal",
                            "Phylip/hedgehog.phy", "phylip")
    assert count == 1, "Expected 1 alignment to be converted, got %r" % count
Example #58
0
def PPHMMDBConstruction (
	GenomeSeqFile,
	ShelveDir,
	
	ProteinLength_Cutoff		= 100,
	IncludeIncompleteGenomes	= True,
	
	BLASTp_evalue_Cutoff		= 1E-3,
	BLASTp_PercentageIden_Cutoff	= 50,
	BLASTp_QueryCoverage_Cutoff	= 75,
	BLASTp_SubjectCoverage_Cutoff	= 75,
	BLASTp_num_alignments		= 1000000,
	BLASTp_N_CPUs			= 20,
	
	MUSCLE_GapOpenCost		= -3.0,
	MUSCLE_GapExtendCost		= -0.0,
	
	ProtClustering_MCLInflation	= 2,
	
	N_AlignmentMerging		= 0,
	
	HHsuite_evalue_Cutoff		= 1E-6,
	HHsuite_pvalue_Cutoff		= 0.05,
	HHsuite_N_CPUs			= 10,
	HHsuite_QueryCoverage_Cutoff	= 85,
	HHsuite_SubjectCoverage_Cutoff	= 85,
	
	PPHMMClustering_MCLInflation	= 5,
	
	HMMER_PPHMMDB_ForEachRoundOfPPHMMMerging = True,
	):
	print "################################################################################"
	print "#Build a database of virus protein profile hidden Markov models (PPHMMs)       #"
	print "################################################################################"
	'''
	Build a database of virus protein profile hidden Markov models (PPHMMs).
	---------------------------------------------
	'''
	
	################################################################################
	print "- Define dir/file paths"
	################################################################################
	print "\tto BLASTp shelve directory"
	#-------------------------------------------------------------------------------
	BLASTMainDir		= ShelveDir+"/BLAST"
	if os.path.exists(BLASTMainDir):
		_ = subprocess.call("rm -rf %s" %BLASTMainDir, shell = True)
	
	os.makedirs(BLASTMainDir)
	
	print "\t\tto BLASTp query file"
	#-------------------------------------------------------------------------------
	BLASTQueryFile		= BLASTMainDir+"/Query.fasta"
	print "\t\tto BLASTp subject file"
	#-------------------------------------------------------------------------------
	BLASTSubjectFile	= BLASTMainDir+"/Subjects.fasta"
	print "\t\tto BLASTp output file"
	#-------------------------------------------------------------------------------
	BLASTOutputFile		= BLASTMainDir+"/BLASTOutput.txt"
	print "\t\tto BLASTp bit score matrix file"
	#-------------------------------------------------------------------------------
	BLASTBitScoreFile	= BLASTMainDir+"/BitScoreMat.txt"
	print "\t\tto protein cluster file"
	#-------------------------------------------------------------------------------
	BLASTProtClusterFile	= BLASTMainDir+"/ProtClusters.txt"
	print "\t\tto protein cluster directory"
	#-------------------------------------------------------------------------------
	ClustersDir		= BLASTMainDir+"/Clusters";os.makedirs(ClustersDir)
	
	print "\tto HMMER shelve directory"
	#-------------------------------------------------------------------------------
	HMMERDir		= ShelveDir+"/HMMER"
	if os.path.exists(HMMERDir):
		_ = subprocess.call("rm -rf %s" %HMMERDir, shell = True)
	
	os.makedirs(HMMERDir)
	
	print "\t\tto HMMER PPHMM directory"
	#-------------------------------------------------------------------------------
	HMMER_PPHMMDir		= HMMERDir+"/HMMER_PPHMMs";os.makedirs(HMMER_PPHMMDir)
	print "\t\tto HMMER PPHMM database directory"
	#-------------------------------------------------------------------------------
	HMMER_PPHMMDBDir	= HMMERDir+"/HMMER_PPHMMDB";os.makedirs(HMMER_PPHMMDBDir)
	print "\t\t\tto HMMER PPHMM database"
	#-------------------------------------------------------------------------------
	HMMER_PPHMMDB		= HMMER_PPHMMDBDir+"/HMMER_PPHMMDB"
	
	if N_AlignmentMerging != 0:
		print "\tto HHsuite shelve directory"
		#-------------------------------------------------------------------------------
		HHsuiteDir	= ShelveDir+"/HHsuite"
		if os.path.exists(HHsuiteDir):
			_ = subprocess.call("rm -rf %s" %HHsuiteDir, shell = True)
		
		os.makedirs(HHsuiteDir)
		
		print "\t\tto HHsuite PPHMM directory"
		#-------------------------------------------------------------------------------
		HHsuite_PPHMMDir = HHsuiteDir + "/HHsuite_PPHMMs";os.makedirs(HHsuite_PPHMMDir)
		print "\t\tto HHsuite PPHMM database directory"
		#-------------------------------------------------------------------------------
		HHsuite_PPHMMDBDir= HHsuiteDir +"/HHsuite_PPHMMDB";os.makedirs(HHsuite_PPHMMDBDir)
		print "\t\t\tto HHsuite PPHMM database"
		#-------------------------------------------------------------------------------
		HHsuite_PPHMMDB	= HHsuite_PPHMMDBDir+"/HHsuite_PPHMMDB"
	
	print "\tto program output shelve"
	#-------------------------------------------------------------------------------
	VariableShelveDir 	= ShelveDir+"/Shelves"
	
	################################################################################
	print "- Retrieve variables"
	################################################################################
	if IncludeIncompleteGenomes == True:
		print "\tfrom ReadGenomeDescTable.AllGenomes.shelve"
		#-------------------------------------------------------------------------------
		VariableShelveFile = VariableShelveDir+"/ReadGenomeDescTable.AllGenomes.shelve"
		Parameters = shelve.open(VariableShelveFile)
		for key in [	"BaltimoreList",
				"OrderList",
				"FamilyList",
				"SubFamList",
				"GenusList",
				"VirusNameList",
				"TaxoGroupingList",
				"SeqIDLists",
				"TranslTableList"]:
			globals()[key] = Parameters[key]
			print "\t\t"+key
		
		Parameters.close()
	elif IncludeIncompleteGenomes == False:
		print "\tfrom ReadGenomeDescTable.CompleteGenomes.shelve"
		#-------------------------------------------------------------------------------
		VariableShelveFile = VariableShelveDir+"/ReadGenomeDescTable.CompleteGenomes.shelve"
		Parameters = shelve.open(VariableShelveFile)
		for key in [	"BaltimoreList",
				"OrderList",
				"FamilyList",
				"SubFamList",
				"GenusList",
				"VirusNameList",
				"TaxoGroupingList",
				"SeqIDLists",
				"TranslTableList"]:
			globals()[key] = Parameters[key]
			print "\t\t"+key
		
		Parameters.close()
	
	if not os.path.isfile(GenomeSeqFile):
		################################################################################
		print "- Download GenBank file"
		################################################################################
		print "GenomeSeqFile doesn't exist. GRAViTy is downloading the GenBank file(s)"
		print "Here are the accession numbers to be downloaded: "
		print "\n".join(map(lambda x:"\n".join(x), SeqIDLists))
		DownloadGenBankFile (GenomeSeqFile = GenomeSeqFile, SeqIDLists = SeqIDLists)
	
	################################################################################
	print "- Read GenBank file"
	################################################################################
	GenBankDict = SeqIO.index(GenomeSeqFile, "genbank")
	GenBankDict = {k.split(".")[0]:v for k,v in GenBankDict.iteritems()}
	
	################################################################################
	print "- Extract/predict protein sequences from virus genomes, excluding proteins with lengthes <%s aa"%ProteinLength_Cutoff
	################################################################################
	ProtList	= []
	ProtIDList	= []
	N_Viruses	= len(SeqIDLists)
	Virus_i		= 1.0
	for SeqIDList, TranslTable, BaltimoreGroup, Order, Family, SubFam, Genus, VirusName, TaxoGrouping in zip(SeqIDLists, TranslTableList, BaltimoreList, OrderList, FamilyList, SubFamList, GenusList, VirusNameList, TaxoGroupingList):
		for SeqID in SeqIDList:
			GenBankRecord	= GenBankDict[SeqID]
			GenBankID	= GenBankRecord.name
			GenBankFeatures	= GenBankRecord.features
			#Extract protein sequences
			#-------------------------------------------------------------------------------
			ContainProtAnnotation = 0
			for Feature in GenBankFeatures:
				if(Feature.type == 'CDS' and Feature.qualifiers.has_key("protein_id") and Feature.qualifiers.has_key("translation")):
					ContainProtAnnotation = 1
					try:
						ProtName = Feature.qualifiers["product"][0]
					except KeyError:
						try:
							ProtName = Feature.qualifiers["gene"][0]
						except KeyError:
							try:
								ProtName = Feature.qualifiers["note"][0]
							except KeyError:
								ProtName = "Hypothetical protein"
					ProtID = Feature.qualifiers["protein_id"][0]
					ProtSeq = Feature.qualifiers["translation"][0]
					if len(ProtSeq) >= ProteinLength_Cutoff:
						ProtRecord = SeqRecord(	Seq(ProtSeq),
									id = GenBankID+"|"+ProtID,
									name = GenBankID+"|"+ProtID,
									description = ProtName,
									annotations = {'taxonomy':[BaltimoreGroup, Order, Family, SubFam, Genus, VirusName, TaxoGrouping]})
						ProtList.append(ProtRecord)
						ProtIDList.append(GenBankID+"|"+ProtID)
			if ContainProtAnnotation == 0:	#if the genome isn't annotated with any ORFs
				#Identifying ORFs
				#-------------------------------------------------------------------------------
				if TranslTable==1:
					Starts = "---M------**--*----M---------------M----------------------------"
				elif TranslTable==2:
					Starts = "----------**--------------------MMMM----------**---M------------"
				elif TranslTable==3:
					Starts = "----------**----------------------MM----------------------------"
				elif TranslTable==4:
					Starts = "--MM------**-------M------------MMMM---------------M------------"
				elif TranslTable==5:
					Starts = "---M------**--------------------MMMM---------------M------------"
				elif TranslTable==6:
					Starts = "--------------*--------------------M----------------------------"
				elif TranslTable==7:
					Starts = "--MM------**-------M------------MMMM---------------M------------"
				elif TranslTable==8:
					Starts = "---M------**--*----M---------------M----------------------------"
				elif TranslTable==9:
					Starts = "----------**-----------------------M---------------M------------"
				elif TranslTable==10:
					Starts = "----------**-----------------------M----------------------------"
				elif TranslTable==11:
					Starts = "---M------**--*----M------------MMMM---------------M------------"
				elif TranslTable==12:
					Starts = "----------**--*----M---------------M----------------------------"
				elif TranslTable==13:
					Starts = "---M------**----------------------MM---------------M------------"
				elif TranslTable==14:
					Starts = "-----------*-----------------------M----------------------------"
				elif TranslTable==15:
					Starts = "----------*---*--------------------M----------------------------"
				elif TranslTable==16:
					Starts = "----------*---*--------------------M----------------------------"
				elif TranslTable==17:
					print "Genetic code table 17 doesn't exist. Use the stardard code"
					Starts = "---M------**--*----M---------------M----------------------------"
				elif TranslTable==18:
					print "Genetic code table 18 doesn't exist. Use the stardard code"
					Starts = "---M------**--*----M---------------M----------------------------"
				elif TranslTable==19:
					print "Genetic code table 19 doesn't exist. Use the stardard code"
					Starts = "---M------**--*----M---------------M----------------------------"
				elif TranslTable==20:
					print "Genetic code table 20 doesn't exist. Use the stardard code"
					Starts = "---M------**--*----M---------------M----------------------------"
				elif TranslTable==21:
					Starts = "----------**-----------------------M---------------M------------"
				elif TranslTable==22:
					Starts = "------*---*---*--------------------M----------------------------"
				elif TranslTable==23:
					Starts = "--*-------**--*-----------------M--M---------------M------------"
				elif TranslTable==24:
					Starts = "---M------**-------M---------------M---------------M------------"
				elif TranslTable==25:
					Starts = "---M------**-----------------------M---------------M------------"
				elif TranslTable==26:
					Starts = "----------**--*----M---------------M----------------------------"
				elif TranslTable==27:
					Starts = "--------------*--------------------M----------------------------"
				elif TranslTable==28:
					Starts = "----------**--*--------------------M----------------------------"
				elif TranslTable==29:
					Starts = "--------------*--------------------M----------------------------"
				elif TranslTable==30:
					Starts = "--------------*--------------------M----------------------------"
				elif TranslTable==31:
					Starts = "----------**-----------------------M----------------------------"
				else:
					print "Genetic code table isn't specified or is out of range. Use the stardard code"
					Starts = "---M------**--*----M---------------M----------------------------"
				
				CodonList = [Base1+Base2+Base3 for Base1 in "TCAG" for Base2 in "TCAG" for Base3 in "TCAG"]
				
				StartCodonList = []
				StopCodonList = []
				for i,j in enumerate(Starts):
					if j == "M":
						StartCodonList.append(CodonList[i])
					if j == "*":
						StopCodonList.append(CodonList[i])
				
				GenBankSeq = GenBankRecord.seq
				SeqLength = len(GenBankSeq)
				ORF_i = 0
				for strand, nuc in [(+1, GenBankSeq), (-1, GenBankSeq.reverse_complement())]:
					for frame in range(3):
						length = 3 * ((SeqLength-frame) // 3)					#Multiple of three
						nuc_inframe = nuc[frame:(frame+length)]					#In-frame nucleotide sequence 
						nuc_codonList = [str(nuc_inframe[i:i+3]) for i in range(0, length, 3)]	#Split the in-frame nucleotide sequence into codons
						
						StopCodon_indices = [i for i, codon in enumerate(nuc_codonList) if codon in StopCodonList] #Find stop codons
						Coding_Start_IndexList = np.array([-1]+StopCodon_indices)+1
						Coding_End_IndexList = np.array(StopCodon_indices+[len(nuc_codonList)])
						
						ProtSeqList = []
						for i, j in zip(Coding_Start_IndexList, Coding_End_IndexList):
							for k, codon in enumerate(nuc_codonList[i:j]):
								if codon in StartCodonList:
									ProtSeqList.append(Seq("".join(nuc_codonList[i:j][k:])).translate(table = TranslTable))
									break
						
						for ProtSeq in ProtSeqList:
							if len(ProtSeq) >= ProteinLength_Cutoff:	#Exclude protein sequences with <'ProteinLength_Cutoff' aa
								ProtRecord = SeqRecord(	ProtSeq,
											id = GenBankID+"|ORF%s"%ORF_i,
											name = GenBankID+"|ORF%s"%ORF_i,
											description = "Hypothetical protein",
											annotations = {'taxonomy':[BaltimoreGroup, Order, Family, SubFam, Genus, VirusName, TaxoGrouping]})
								ProtList.append(ProtRecord)
								ProtIDList.append(GenBankID+"|ORF%s"%ORF_i)
								ORF_i = ORF_i + 1
		#Progress bar
		sys.stdout.write("\033[K" + "Extract protein sequences: [%-20s] %d/%d viruses" % ('='*int(Virus_i/N_Viruses*20), Virus_i, N_Viruses) + "\r")
		sys.stdout.flush()
		Virus_i = Virus_i + 1.0
	
	sys.stdout.write("\033[K")
	sys.stdout.flush()
	
	ProtIDList = np.array(ProtIDList)
	################################################################################
	print "- ALL-VERSUS-ALL BLASTp"
	################################################################################
	print "\tMake BLASTp database"
	#-------------------------------------------------------------------------------
	with open(BLASTSubjectFile, "w") as BLASTSubject_txt:
		SeqIO.write(ProtList, BLASTSubject_txt, "fasta")
	
	_ = subprocess.Popen("makeblastdb -in %s -dbtype prot" %BLASTSubjectFile, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
	out, err = _.communicate()
	if err != "":
		print "Something is wrong with makeblastdb:"
		print "#"*50+"out"+"#"*50
		print out
		print "#"*50+"err"+"#"*50
		print err
		print "_"*100
		while True:
			Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
			if Input == "N" or Input == "n":
				raise SystemExit("GRAViTy terminated.")
			elif Input == "Y" or Input == "y":
				print "Continue GRAViTy."
				break
			else:
				print "Input can only be 'Y' or 'N'."
	
	print "\tPerforme ALL-VERSUS-ALL BLASTp analysis"
	#-------------------------------------------------------------------------------
	BitScoreMat	= []
	SeenPair	= {}
	SeenPair_i	= 0
	N_ProtSeqs	= len(ProtList)
	#Set Blastp outfile format
	#-------------------------------------------------------------------------------
	BLASTp_outfmt	= '"6 qseqid sseqid pident qcovs qlen slen evalue bitscore"'
	for ProtSeq_i in range(N_ProtSeqs):
		#BLAST query fasta file
		#-------------------------------------------------------------------------------
		BLASTQuery = ProtList[ProtSeq_i]
		with open(BLASTQueryFile, "w") as BLASTQuery_txt:
			p = SeqIO.write(BLASTQuery, BLASTQuery_txt, "fasta")
		
		#Perform BLASTp
		#-------------------------------------------------------------------------------
		_ = subprocess.Popen('blastp -query %s -db %s -out %s -evalue %s -outfmt %s -num_alignments %s -num_threads %s' %(	BLASTQueryFile,
																	BLASTSubjectFile,
																	BLASTOutputFile,
																	BLASTp_evalue_Cutoff,
																	BLASTp_outfmt,
																	BLASTp_num_alignments,
																	BLASTp_N_CPUs), stdout = subprocess.PIPE, stderr = subprocess.PIPE,
																	shell = True)
		out, err = _.communicate()
		if err != "":
			print "Something is wrong with blastp (protein ID = %s):"%ProtList[ProtSeq_i].id
			print "#"*50+"out"+"#"*50
			print out
			print "#"*50+"err"+"#"*50
			print err
			print "_"*100
			while True:
				Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
				if Input == "N" or Input == "n":
					raise SystemExit("GRAViTy terminated.")
				elif Input == "Y" or Input == "y":
					print "Continue GRAViTy."
					break
				else:
					print "Input can only be 'Y' or 'N'."
		
		#BitScoreMat conditioned on PIden, QCovs, and SCovs
		#-------------------------------------------------------------------------------
		if os.stat(BLASTOutputFile).st_size != 0: #if BLAST returns something...
			with open(BLASTOutputFile, "r") as BLASTOutput_txt:
				for BLASTHit in BLASTOutput_txt.readlines():
					if BLASTHit == "\n": break
					Line			= BLASTHit.split("\t")
					qseqid			= Line[0]
					sseqid			= Line[1]
					pident			= float(Line[2])
					qcovs			= float(Line[3])
					qlen			= float(Line[4])
					slen			= float(Line[5])
					evalue			= float(Line[6])
					bitscore		= float(Line[7][:-1])
					[SeqID_I, SeqID_II]	= sorted([qseqid, sseqid])
					Pair			= ", ".join([SeqID_I, SeqID_II])
					if ((qseqid != sseqid) and (pident >= BLASTp_PercentageIden_Cutoff) and (qcovs >= BLASTp_QueryCoverage_Cutoff) and ((qcovs*qlen/slen) >= BLASTp_SubjectCoverage_Cutoff)):
						if Pair in SeenPair: #If the pair has already been seen...
							if bitscore > BitScoreMat[SeenPair[Pair]][2]: #and if the new bitscore is higher...
								BitScoreMat[SeenPair[Pair]][2] = bitscore
						else:
							SeenPair[Pair] = SeenPair_i
							BitScoreMat.append([SeqID_I, SeqID_II, bitscore])
							SeenPair_i = SeenPair_i+1
		
		#Progress bar
		sys.stdout.write("\033[K" + "BLASTp: [%-20s] %d/%d proteins" % ('='*int(float(ProtSeq_i+1)/N_ProtSeqs*20), ProtSeq_i+1, N_ProtSeqs) + "\r")
		sys.stdout.flush()
	
	sys.stdout.write("\033[K")
	sys.stdout.flush()
	
	BitScoreMat = np.array(BitScoreMat)
	print "\tSave protein-protein similarity scores (BLASTp bit scores)"
	#-------------------------------------------------------------------------------
	np.savetxt(	fname	= BLASTBitScoreFile,
			X	= BitScoreMat,
			fmt	= '%s',
			delimiter= "\t",
			header	= "SeqID_I\tSeqID_II\tBit score")
	
	################################################################################
	print "- Cluster protein sequences based on BLASTp bit scores, using the MCL algorithm"
	################################################################################
	_ = subprocess.Popen("mcl %s --abc -o %s -I %s" %(BLASTBitScoreFile, BLASTProtClusterFile, ProtClustering_MCLInflation), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
	err, out = _.communicate()
	if err != "":
		print "Something is wrong with mcl:"
		print "#"*50+"out"+"#"*50
		print out
		print "#"*50+"err"+"#"*50
		print err
		print "_"*100
		while True:
			Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
			if Input == "N" or Input == "n":
				raise SystemExit ("GRAViTy terminated.")
			elif Input == "Y" or Input == "y":
				print "Continue GRAViTy."
				break
			else:
				print "Input can only be 'Y' or 'N'."
	
	SeenProtIDList = []
	with open(BLASTProtClusterFile, 'r') as BLASTProtCluster_txt:
		for Cluster in BLASTProtCluster_txt.readlines():
			SeenProtIDList.extend(Cluster.split("\r\n")[0].split("\n")[0].split("\t"))
	
	with open(BLASTProtClusterFile, 'a') as BLASTProtCluster_txt:
		BLASTProtCluster_txt.write("\n".join(list(set(ProtIDList)-set(SeenProtIDList))))
	
	################################################################################
	print "- Make protein alignments"
	################################################################################
	N_Clusters		= LineCount(BLASTProtClusterFile)+1 #Count the number of clusters
	Cluster_i		= 0
	Cluster_MetaDataDict	= {}
	with open(BLASTProtClusterFile, 'r') as BLASTProtCluster_txt:
		for Cluster in BLASTProtCluster_txt.readlines():
			HitList		= []
			TaxoLists	= []
			DescList	= []
			Cluster		= Cluster.split("\n")[0].split("\t")
			for ProtID in Cluster:
				HitList.append(ProtList[np.where(ProtIDList == ProtID)[0][0]])
				TaxoLists.append(HitList[-1].annotations['taxonomy'])
				DescList.append(HitList[-1].description.replace(", "," ").replace(","," ").replace(": ","_").replace(":","_").replace("; "," ").replace(";"," ").replace(" (","/").replace("(","/").replace(")",""))
			
			#Cluster file
			#-------------------------------------------------------------------------------
			UnAlnClusterFile = ClustersDir+"/Cluster_%s.fasta" %Cluster_i
			with open(UnAlnClusterFile, "w") as UnAlnClusterTXT:
				p = SeqIO.write(HitList, UnAlnClusterTXT, "fasta")
			
			#align cluster using muscle
			#-------------------------------------------------------------------------------
			AlnClusterFile = ClustersDir+"/Cluster_%s.fasta" %Cluster_i
			_ = subprocess.Popen("muscle -in %s -out %s -gapopen %s -gapextend %s" %(	UnAlnClusterFile,
													AlnClusterFile,
													MUSCLE_GapOpenCost,
													MUSCLE_GapExtendCost),
													stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
			err, out = _.communicate()
			if err != "":
				print "Something is wrong with muscle (Cluster_%s):"%Cluster_i
				print "#"*50+"out"+"#"*50
				print out
				print "#"*50+"err"+"#"*50
				print err
				print "_"*100
				while True:
					Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
					if Input == "N" or Input == "n":
						raise SystemExit("GRAViTy terminated.")
					elif Input == "Y" or Input == "y":
						print "Continue GRAViTy."
						break
					else:
						print "Input can only be 'Y' or 'N'."
			
			#Cluster annotations
			#-------------------------------------------------------------------------------
			Cluster_MetaDataDict[Cluster_i] = {	"Cluster":Cluster,
								"DescList":DescList,
								"TaxoLists":TaxoLists,
								"AlignmentLength":AlignIO.read(AlnClusterFile, "fasta").get_alignment_length()
								}
			
			Cluster_i = Cluster_i+1
			sys.stdout.write("\033[K" + "Make protein alignments: [%-20s] %d/%d alignments" % ('='*int(float(Cluster_i)/N_Clusters*20), Cluster_i, N_Clusters) + "\r")
			sys.stdout.flush()
	
	sys.stdout.write("\033[K")
	sys.stdout.flush()
	
	if N_AlignmentMerging != 0:
		################################################################################
		if N_AlignmentMerging > 0:
			print "- Merge protein alignments, %s rounds of merging" %N_AlignmentMerging
		elif N_AlignmentMerging < 0:
			print "- Merge protein alignments until exhausted"
		################################################################################
		print "\tMake HHsuite PPHMMs from protein alignments"
		#-------------------------------------------------------------------------------
		for Cluster_i in range(len(Cluster_MetaDataDict)):
			AlnClusterFile		= ClustersDir+"/Cluster_%s.fasta" %Cluster_i
			HHsuite_PPHMMFile	= HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %Cluster_i
			_ = subprocess.Popen("hhmake -i %s -o %s -seq %s -name Cluster_%s -id 100 -M 50 -v 0" %(	AlnClusterFile,
															HHsuite_PPHMMFile,
															len(Cluster_MetaDataDict[Cluster_i]["Cluster"])+1,
															Cluster_i),
															stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
			out, err = _.communicate()
			if err != "":
				print "Something is wrong with turning Cluster_%s into a PPHMM by hhmake." %Cluster_i
				print "#"*50+"out"+"#"*50
				print out
				print "#"*50+"err"+"#"*50
				print err
				print "_"*100
				while True:
					Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
					if Input == "N" or Input == "n":
						raise SystemExit ("GRAViTy terminated.")
					elif Input == "Y" or Input == "y":
						print "Continue GRAViTy."
						break
					else:
						print "Input can only be 'Y' or 'N'."
			
			#Progress bar
			sys.stdout.write("\033[K" + "Make HHsuite PPHMMs: [%-20s] %d/%d PPHMMs" % ('='*int(float(Cluster_i+1)/len(Cluster_MetaDataDict)*20), Cluster_i+1, len(Cluster_MetaDataDict)) + "\r")
			sys.stdout.flush()
		
		sys.stdout.write("\033[K")
		sys.stdout.flush()
		
		print "\tMake a HHsuite PPHMM DB"
		#-------------------------------------------------------------------------------
		_ = subprocess.Popen("ffindex_build -s %s_hhm.ffdata %s_hhm.ffindex %s" %(HHsuite_PPHMMDB, HHsuite_PPHMMDB, HHsuite_PPHMMDir), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
		out, err = _.communicate()
		
		print "\tMerge protein alignments"
		#-------------------------------------------------------------------------------
		AlignmentMerging_i_round = 0
		while True:
			if AlignmentMerging_i_round >= N_AlignmentMerging and N_AlignmentMerging >= 0:
				print "Alignment merging complete"
				break
			
			if HMMER_PPHMMDB_ForEachRoundOfPPHMMMerging == True:
				print "\t\tHMMER_PPHMMDB_ForEachRoundOfPPHMMMerging == True. Make a HMMER PPHMM DB. (Round %s)" %AlignmentMerging_i_round
				#-------------------------------------------------------------------------------
				_ = Make_HMMER_PPHMM_DB(	HMMER_PPHMMDir = HMMER_PPHMMDir,
								HMMER_PPHMMDB = HMMER_PPHMMDBDir+"/HMMER_PPHMMDB_%s" %AlignmentMerging_i_round,
								ClustersDir = ClustersDir,
								Cluster_MetaDataDict = Cluster_MetaDataDict)
				
				_ = subprocess.Popen("find %s -type f -name '*.hmm' -delete" %HMMER_PPHMMDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
				out, err = _.communicate()
			
			print "\t\tRound %s"%(AlignmentMerging_i_round + 1)
			print "\t\t\tDetermine PPHMM-PPHMM similarity scores (ALL-VERSUS-ALL hhsearch)"
			#-------------------------------------------------------------------------------
			hhsearchDir		= HHsuiteDir+"/hhsearch_"+"".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)); os.makedirs(hhsearchDir)
			hhsearchOutFile		= hhsearchDir+"/hhsearch.stdout.hhr"
			N_PPHMMs		= LineCount("%s_hhm.ffindex"%HHsuite_PPHMMDB)
			
			SeenPair		= {}
			SeenPair_i		= 0
			PPHMMSimScoreCondensedMat= []
			for PPHMM_i in range(0, N_PPHMMs):
				HHsuite_PPHMMFile = HHsuite_PPHMMDir + "/PPHMM_%s.hhm" %PPHMM_i
				_ = subprocess.Popen("hhsearch -i %s -d %s -o %s -e %s -E %s -z 1 -b 1 -id 100 -global -v 0 -cpu %s" %(	HHsuite_PPHMMFile,
																	HHsuite_PPHMMDB+"_hhm.ffdata",
																	hhsearchOutFile,
																	HHsuite_evalue_Cutoff,
																	HHsuite_evalue_Cutoff,
																	HHsuite_N_CPUs,
																	),
																	stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
				out, err = _.communicate()
				if err != "":
					print "Something is wrong with hhsearching PPHMM %s againt the PPHMM database" %PPHMM_i
					print "#"*50+"out"+"#"*50
					print out
					print "#"*50+"err"+"#"*50
					print err
					print "_"*100
					while True:
						Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
						if Input == "N" or Input == "n":
							raise SystemExit ("GRAViTy terminated.")
						elif Input == "Y" or Input == "y":
							print "Continue GRAViTy."
							break
						else:
							print "Input can only be 'Y' or 'N'."
				
				with open(hhsearchOutFile, 'r') as hhsearchOut_txt:
					Content		= hhsearchOut_txt.readlines()
					QueryLength	= int(Content[1].split()[1])
					for Line in Content[9:]:
						if Line == "\n":
							break
						else:
							Line  		= Line.replace("("," ").replace(")"," ").split()
							PPHMM_j		= int(Line[1].split("_")[1])
							evalue		= float(Line[3])
							pvalue		= float(Line[4])
							PPHMMSimScore	= float(Line[5])
							Col		= float(Line[7])
							SubjectLength	= int(Line[10])
							qcovs		= Col/QueryLength*100
							scovs		= Col/SubjectLength*100
							if (evalue <= HHsuite_evalue_Cutoff and pvalue <= HHsuite_pvalue_Cutoff and qcovs >= HHsuite_QueryCoverage_Cutoff and scovs >= HHsuite_SubjectCoverage_Cutoff):
								Pair	= ", ".join(sorted(map(str,[PPHMM_i, PPHMM_j])))
								if Pair in SeenPair: #If the pair has already been seen...
									if PPHMMSimScore > PPHMMSimScoreCondensedMat[SeenPair[Pair]][2]: #and if the new PPHMMSimScore is higher...
										PPHMMSimScoreCondensedMat[SeenPair[Pair]][2] = PPHMMSimScore
								else:
									SeenPair[Pair] = SeenPair_i
									PPHMMSimScoreCondensedMat.append([PPHMM_i, PPHMM_j, PPHMMSimScore])
									SeenPair_i = SeenPair_i+1
				
				_ = subprocess.Popen("rm %s" %hhsearchOutFile, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
				out, err = _.communicate()
				
				#Progress bar
				sys.stdout.write("\033[K" + "hhsearch: [%-20s] %d/%d PPHMMs" % ('='*int(float(PPHMM_i+1)/N_PPHMMs*20), PPHMM_i+1, N_PPHMMs) + "\r")
				sys.stdout.flush()
			
			sys.stdout.write("\033[K")
			sys.stdout.flush()
			
			PPHMMSimScoreCondensedMat	= np.array(PPHMMSimScoreCondensedMat)
			PPHMMSimScoreMat		= np.zeros((N_PPHMMs, N_PPHMMs))
			PPHMMSimScoreMat[map(int, PPHMMSimScoreCondensedMat[:,0]), map(int, PPHMMSimScoreCondensedMat[:,1])] = map(float, PPHMMSimScoreCondensedMat[:,2])
			PPHMMSimScoreMat[map(int, PPHMMSimScoreCondensedMat[:,1]), map(int, PPHMMSimScoreCondensedMat[:,0])] = map(float, PPHMMSimScoreCondensedMat[:,2])
			PPHMMSimScoreCondensedMat	= np.array([PPHMMSimScorePair for PPHMMSimScorePair in PPHMMSimScoreCondensedMat if PPHMMSimScorePair[0] < PPHMMSimScorePair[1]])
			
			PPHMMSimScoreCondensedMatFile	= hhsearchDir+"/PPHMMSimScoreCondensedMat.txt"
			np.savetxt(	fname	= PPHMMSimScoreCondensedMatFile,
					X	= PPHMMSimScoreCondensedMat,
					fmt	= '%s',
					delimiter= "\t",
					header	= "PPHMM_i\tPPHMM_j\tPPHMMSimScore")
			
			print "\t\t\tCluster PPHMMs based on hhsearch scores, using the MCL algorithm"
			#-------------------------------------------------------------------------------
			PPHMMClustersFile	= hhsearchDir+"/PPHMMClusters.txt"
			_ = subprocess.Popen("mcl %s --abc -o %s -I %s" %(PPHMMSimScoreCondensedMatFile, PPHMMClustersFile, PPHMMClustering_MCLInflation), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
			_, out = _.communicate()
			
			SeenProtIDList = []
			with open(PPHMMClustersFile, 'r') as PPHMMClusters_txt:
				for Cluster in PPHMMClusters_txt.readlines():
					SeenProtIDList.extend(Cluster.split("\n")[0].split("\t"))
			
			with open(PPHMMClustersFile, 'a') as PPHMMClusters_txt:
				PPHMMClusters_txt.write("\n".join(list(set(map(str,map(float,range(0, N_PPHMMs))))-set(SeenProtIDList))))
			
			print "\t\t\tCheck if there are alignments to be merged"
			#-------------------------------------------------------------------------------
			with open(PPHMMClustersFile, 'r') as PPHMMClusters_txt:
				N_PPHMMs_AfterMerging = len(PPHMMClusters_txt.readlines())
			
			if N_PPHMMs_AfterMerging == N_PPHMMs:
				print "\t\t\t\tNo alignments to be merged. Stop alignment merging process"
				_ = subprocess.Popen("rm -rf %s" %hhsearchDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
				out, err = _.communicate()
				break
			else:
				print "\t\t\t\tMerge %d alignments to make %d alignments" %(N_PPHMMs, N_PPHMMs_AfterMerging)
			
			print "\t\t\tMerge protein alignments and remake HHsuite PPHMMs"
			#-------------------------------------------------------------------------------
			SelfSimScoreList				= PPHMMSimScoreMat.diagonal()
			PPHMMDissimScoreMat				= 1 - np.transpose(PPHMMSimScoreMat**2/SelfSimScoreList)/SelfSimScoreList
			PPHMMDissimScoreMat[PPHMMDissimScoreMat<0]	= 0
			AfterMergingPPHMM_IndexList			= []
			AfterMergingPPHMM_i				= 1.0
			with open(PPHMMClustersFile, 'r') as PPHMMClusters_txt:
				for PPHMMCluster in PPHMMClusters_txt.readlines():
					PPHMMCluster = map(int, map(float, PPHMMCluster.split("\n")[0].split("\t")))
					AfterMergingPPHMM_IndexList.append(min(PPHMMCluster))
					if len(PPHMMCluster) >= 2:
						PPHMMDissimScoreMat_Subset = PPHMMDissimScoreMat[PPHMMCluster][:,PPHMMCluster]
						PPHMMTreeNewick = DistMat2Tree (DistMat	= PPHMMDissimScoreMat_Subset,
										LeafList= PPHMMCluster,
										Dendrogram_LinkageMethod	= "average")
						PPHMMTreeNewick = Tree(PPHMMTreeNewick)
						_ 		= PPHMMTreeNewick.ladderize()
						PPHMMTreeNewick	= PPHMMTreeNewick.write(format = 9)
						
						while True:
							m = re.search(r"\((\d+),(\d+)\)", PPHMMTreeNewick)
							if not m:
								_ = subprocess.Popen("muscle -in %s -out %s -refine" %(	ClusterFile_i,
															ClusterFile_i),
															stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
								err, out = _.communicate()
								break
							PPHMM_i, PPHMM_j = sorted([int(m.group(1)),int(m.group(2))])
							PPHMMTreeNewick = re.sub(r"\((\d+),(\d+)\)", str(PPHMM_i), PPHMMTreeNewick, count=1)
							
							ClusterFile_i = ClustersDir+"/Cluster_%s.fasta" %PPHMM_i
							ClusterFile_j = ClustersDir+"/Cluster_%s.fasta" %PPHMM_j
							HHsuite_PPHMMFile_j = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %PPHMM_j
							_ = subprocess.Popen("muscle -profile -in1 %s -in2 %s -out %s -gapopen %s -gapextend %s" %(	ClusterFile_i,
																			ClusterFile_j,
																			ClusterFile_i,
																			MUSCLE_GapOpenCost,
																			MUSCLE_GapExtendCost),
																			stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
							err, out = _.communicate()
							_ = subprocess.Popen("rm %s %s" %(ClusterFile_j, HHsuite_PPHMMFile_j), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
							out, err = _.communicate()
							
							Cluster_MetaDataDict[PPHMM_i]["Cluster"]	= Cluster_MetaDataDict[PPHMM_i]["Cluster"] + Cluster_MetaDataDict[PPHMM_j]["Cluster"]
							Cluster_MetaDataDict[PPHMM_i]["DescList"]	= Cluster_MetaDataDict[PPHMM_i]["DescList"] + Cluster_MetaDataDict[PPHMM_j]["DescList"]
							Cluster_MetaDataDict[PPHMM_i]["TaxoLists"]	= Cluster_MetaDataDict[PPHMM_i]["TaxoLists"] + Cluster_MetaDataDict[PPHMM_j]["TaxoLists"]
							del Cluster_MetaDataDict[PPHMM_j]
						
						HHsuite_PPHMMFile_i = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %PPHMM_i
						Cluster_MetaDataDict[PPHMM_i]["AlignmentLength"]	= AlignIO.read(ClusterFile_i, "fasta").get_alignment_length()
						_ = subprocess.Popen("hhmake -i %s -o %s -v 0 -seq %s -name Cluster_%s -id 100 -M 50" %(	ClusterFile_i,
																		HHsuite_PPHMMFile_i,
																		len(Cluster_MetaDataDict[PPHMM_i]["Cluster"])+1,
																		PPHMM_i),
																		stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
						out, err = _.communicate()
						if err != "":
							print "Something is wrong with constructing a PPHMM from cluster %s" %PPHMM_i
							print "#"*50+"out"+"#"*50
							print out
							print "#"*50+"err"+"#"*50
							print err
							print "_"*100
							while True:
								Input = raw_input_with_timeout(prompt = "Would you like to continue? [Y/N]: ", timelimit = 5, default_input = "Y")
								if Input == "N" or Input == "n":
									raise SystemExit("GRAViTy terminated.")
								elif Input == "Y" or Input == "y":
									print "Continue GRAViTy."
									break
								else:
									print "Input can only be 'Y' or 'N'."
						
					elif len(PPHMMCluster) == 1:
						pass
					
					#Progress bar
					sys.stdout.write("\033[K" + "Merge alignments and make new PPHMMs: [%-20s] %d/%d PPHHMs" % ('='*int(AfterMergingPPHMM_i/N_PPHMMs_AfterMerging*20), AfterMergingPPHMM_i, N_PPHMMs_AfterMerging) + "\r")
					sys.stdout.flush()
					AfterMergingPPHMM_i = AfterMergingPPHMM_i + 1
			
			sys.stdout.write("\033[K")
			sys.stdout.flush()
			
			print "\t\t\tRename protein alignments and their associated PPHMMs"
			#-------------------------------------------------------------------------------
			AfterMergingPPHMM_IndexList	= sorted(AfterMergingPPHMM_IndexList)
			AfterMergingPPHMM_i		= 0
			for PPHMM_i in AfterMergingPPHMM_IndexList:
				Cluster_MetaDataDict[AfterMergingPPHMM_i] = Cluster_MetaDataDict.pop(PPHMM_i)
				
				ClusterFile_i = ClustersDir+"/Cluster_%s.fasta" %PPHMM_i
				ClusterFile_j = ClustersDir+"/Cluster_%s.fasta" %AfterMergingPPHMM_i
				_ = subprocess.Popen("mv %s %s" %(ClusterFile_i, ClusterFile_j), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
				out, err = _.communicate()
				
				HHsuite_PPHMMFile_i = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %PPHMM_i
				HHsuite_PPHMMFile_j = HHsuite_PPHMMDir+"/PPHMM_%s.hhm" %AfterMergingPPHMM_i
				_ = subprocess.Popen("mv %s %s" %(HHsuite_PPHMMFile_i, HHsuite_PPHMMFile_j), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
				out, err = _.communicate()
				
				with open(HHsuite_PPHMMFile_j, "r+") as HHsuite_PPHMM_txt:
					contents = HHsuite_PPHMM_txt.readlines()
					contents[1] = "NAME  Cluster_%s\n" %AfterMergingPPHMM_i
					contents = "".join(contents)
					HHsuite_PPHMM_txt.seek(0)		#Put cursor at the beginning of the file
					HHsuite_PPHMM_txt.write(contents)	#Write the contents
					HHsuite_PPHMM_txt.truncate()		#Delete everything after the cursor
				
				AfterMergingPPHMM_i = AfterMergingPPHMM_i + 1
			
			print "\t\t\tRebuild the HHsuite PPHMM database\n"
			#-------------------------------------------------------------------------------
			_ = subprocess.Popen("rm %s_hhm.ffdata %s_hhm.ffindex" %(HHsuite_PPHMMDB, HHsuite_PPHMMDB), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
			out, err = _.communicate()
			
			if list(set(map(lambda f: f.split(".")[-1], os.listdir(HHsuite_PPHMMDir))))!=["hhm"]:
				print "There are some other files/folders other than HHsuite PPHMMs in the folder %s. Remove them first." %HHsuite_PPHMMDir
			
			_ = subprocess.Popen("ffindex_build -s %s_hhm.ffdata %s_hhm.ffindex %s" %(HHsuite_PPHMMDB, HHsuite_PPHMMDB, HHsuite_PPHMMDir), stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
			out, err = _.communicate()
			
			_ = subprocess.Popen("rm -rf %s" %hhsearchDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
			out, err = _.communicate()
			
			AlignmentMerging_i_round = AlignmentMerging_i_round + 1
		
		print "\tAlignment merging is done. Delete HHsuite shelve directory"
		#-------------------------------------------------------------------------------
		_ = subprocess.Popen("rm -rf %s" %HHsuiteDir, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
		out, err = _.communicate()
	
	################################################################################
	print "- Make HMMER PPHMMDB and its summary file"
	################################################################################
	(ClusterIDList,
	ClusterDescList,
	ClusterSizeList,
	ClusterProtSeqIDList,
	ClusterSizeByTaxoGroupingList,
	ClusterSizeByProtList) = Make_HMMER_PPHMM_DB(	HMMER_PPHMMDir = HMMER_PPHMMDir,
							HMMER_PPHMMDB = HMMER_PPHMMDB,
							ClustersDir = ClustersDir,
							Cluster_MetaDataDict = Cluster_MetaDataDict)
	'''
	if IncludeIncompleteGenomes == True:
		################################################################################
		print "- Save variables to PPHMMDBConstruction.AllGenomes.shelve"
		################################################################################
		VariableShelveFile = VariableShelveDir+"/PPHMMDBConstruction.AllGenomes.shelve"
	elif IncludeIncompleteGenomes == False:
		################################################################################
		print "- Save variables to PPHMMDBConstruction.CompleteGenomes.shelve"
		################################################################################
		VariableShelveFile = VariableShelveDir+"/PPHMMDBConstruction.CompleteGenomes.shelve"
	'''
	VariableShelveFile = VariableShelveDir+"/PPHMMDBConstruction.shelve"
	Parameters = shelve.open(VariableShelveFile,"n")
	for key in [	"ClusterIDList",
			"ClusterDescList",
			"ClusterSizeList",
			"ClusterProtSeqIDList",
			"ClusterSizeByTaxoGroupingList",
			"ClusterSizeByProtList",
			]:
		try:
			Parameters[key] = locals()[key]
			print "\t"+key
		except TypeError:
			pass
	
	Parameters.close()
def write_AlignIO_dna():
    """Convert the Clustal alignment opuntia.aln to a PHYLIP file.

    Reads ``Clustalw/opuntia.aln`` as "clustal" and writes
    ``Phylip/opuntia.phy`` as "phylip"; both are relative paths.

    Raises:
        AssertionError: if ``AlignIO.convert`` does not return 1.
    """
    # Run the conversion as its own statement. When the call lived inside an
    # ``assert`` it was stripped under ``python -O``, so the output file was
    # silently never written.
    count = AlignIO.convert("Clustalw/opuntia.aln", "clustal",
                            "Phylip/opuntia.phy", "phylip")
    # Raise explicitly (same exception type as before) so the check also
    # survives optimized mode.
    if count != 1:
        raise AssertionError(
            "Expected AlignIO.convert to return 1, got %r" % (count,))
# Split the comma-separated taxon string held in args.taxa into a list.
taxa = args.taxa.split(",")
numtaxa = len(taxa)
# Count the paths matched by the glob pattern in `files`.
numfiles = len(glob.glob(files))

# Build and print the summary line (file count, taxon count, output target).
line1 = (
    '{0} fasta files and {1} taxa found, alignments will be concatenated and written to {2}\n'
).format(numfiles, numtaxa, outfile)
print(line1)

if numfiles > 0:
    cataln = MultipleSeqAlignment([])
    for taxon in taxa:
        cataln.add_sequence(taxon, "")  #	make alignment with all required taxa
    for fasta in glob.glob(files):
        fastaname = fasta.split('/')[-1]  #	get fasta name without path
        aln = AlignIO.read(fasta, "fasta")  #	extract alignment from fasta
        seqLen = aln.get_alignment_length()
        newaln = MultipleSeqAlignment([])
        seq = "X"
        for catrec in cataln:  # for each taxon

            catid = str(catrec.id)
            for rec in aln:
                if str(
                        rec.id
                ) == catid:  # find sequence in fasta alignment if it's there
                    seq = str(rec.seq)
            if seq == "X":
                seq = ("N" * seqLen
                       )  # if not make a sequence of Ns of the correct length
            catseq = str(