Example #1
0
def _ID(a):

    r"""Return the record ids of an alignment file as a list of strings.

    The parser is chosen from the file extension: ``.phylip`` is read as
    strict PHYLIP first and, if that raises ``ValueError``, as
    ``phylip-relaxed``; ``.fasta`` is read as FASTA.

    Arguments:
    -a- alignment file

    Raises:
    Exception -- if the extension is neither ``.phylip`` nor ``.fasta``.
    Any other error raised while reading the file propagates to the caller.

    Example:

    >>> with open("alignment.fasta", "w") as alignment_file:
    ...    _ = alignment_file.write(">ENSG0997"+"\n"+"TGA"+"\n"+">ENSG1233"+"\n"+"AAA")
    >>> import os
    >>> from Bio import AlignIO
    >>> ID = _ID("alignment.fasta")
    >>> ID
    ['ENSG0997', 'ENSG1233']"""

    fileExtension = os.path.splitext(a)[1]

    if fileExtension == ".phylip":
        try:
            records = AlignIO.read(a, "phylip")
        except ValueError:
            # Strict parsing failed; retry with the relaxed PHYLIP dialect.
            # Other errors are no longer swallowed: previously they left the
            # record list unbound and surfaced later as an unrelated error.
            records = AlignIO.read(a, "phylip-relaxed")
    elif fileExtension == ".fasta":
        records = AlignIO.read(a, "fasta")
    else:
        raise Exception("Wrong format. Choose accepted format.")

    return [str(record.id) for record in records]
def fas_to_nex(infile,outfile,protein=True):
	'''Read a FASTA alignment from infile and write it to outfile as NEXUS.

	With protein=True (the default) the alignment is read with a gapped
	extended-protein alphabet, otherwise with a gapped unambiguous-DNA
	alphabet. Uses BioPython.
	'''
	residues = IUPAC.extended_protein if protein else IUPAC.unambiguous_dna
	alignment = AlignIO.read(infile, 'fasta', alphabet=Gapped(residues))
	AlignIO.write(alignment, outfile, 'nexus')
Example #3
0
def _write_multiple_origins(inputFile, geneTable, stdout) :
    """Report record ids that more than one sequence of inputFile maps to.

    Reads inputFile as a FASTA alignment, groups the sequence descriptions by
    ``geneTable.geneId(description).recordId`` and writes one tab-separated
    line (file, record id, count, ';'-joined descriptions) to stdout for
    every record id that has more than one description.
    """
    aln = AlignIO.read(inputFile, "fasta")
    origins = collections.defaultdict(list)
    for seq in aln :
        origins[geneTable.geneId(seq.description).recordId].append(seq.description)
    for (recordId, descriptions) in origins.items() :
        if len(descriptions) > 1 :
            stdout.write(inputFile + "\t" + str(recordId) + "\t" +
                         str(len(descriptions)) + "\t" +
                         ";".join(descriptions) + "\n")


def main_origin(args, stdout, stderr) :
    """Report, per alignment file, record ids shared by several sequences.

    If ``args.geneTable`` is not a directory it is loaded once as the gene
    table for every file in ``args.alnFiles``. If it is a directory, each
    alignment uses its own table named ``<basename of alignment>.geneTable``
    inside that directory, and progress is written to stderr.

    Results are written to stdout; nothing is returned.
    """
    if not os.path.isdir(args.geneTable) :
        stderr.write("Loading gene table\n")
        geneTable = pygenes.GeneTable()
        geneTable.loadTable(args.geneTable)
        for inputFile in args.alnFiles :
            _write_multiple_origins(inputFile, geneTable, stdout)
    else :
        n = str(len(args.alnFiles))
        for (i, inputFile) in enumerate(args.alnFiles) :
            stderr.write("Processing file " + str(i+1) + "/" + n + "\n")
            geneTableFile = os.path.join(args.geneTable, os.path.basename(inputFile) + ".geneTable")
            geneTable = pygenes.GeneTable()
            geneTable.loadTable(geneTableFile)
            _write_multiple_origins(inputFile, geneTable, stdout)
	def get_second_seq(self):
		"""Set self.second_seq from the direct-text, file or online inputs.

		The three inputs are checked in that order and each one that is
		non-empty overwrites self.second_seq, so a later source wins over
		an earlier one:

		* direct text: the upper-cased content of the text buffer;
		* file: first record of the file, read with self.file_type;
		* online: the record is fetched from NCBI efetch, written to
		  cache/<id>.<file_type> and its first record is used.

		An error message is shown (and the method returns early) when the
		file type has no efetch return type or when the download looks
		empty (fewer than 10 characters).
		"""
		start_buf, end_buf = self.second_seq_text_buffer.get_bounds()
		seq_direct = self.second_seq_text_buffer.get_text(start_buf, end_buf, False).upper()

		seq_file = self.second_seq_file_entry.get_text()
		seq_online = self.second_seq_online_entry.get_text()

		if seq_direct:
			self.second_seq = seq_direct

		if seq_file:
			align = AlignIO.read(seq_file, self.file_type)
			self.second_seq = str(align[0].seq)

		if seq_online:
			# Map the local file type to the efetch "rettype" value. Unknown
			# types used to fall through and crash on an unbound variable.
			f_type = {'fasta': 'fasta', 'genbank': 'gb'}.get(self.file_type)
			if f_type is None:
				self.show_message("Error", "Unsupported file type for online retrieval: " + str(self.file_type))
				return

			url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id='+seq_online+'&rettype='+f_type+''
			with urllib.request.urlopen(url) as response:
				result = response.read().decode('utf-8')
			if len(result) < 10:
				self.show_message("Error", "Failed to retrieve the second sequence (please check your ID Number)")
				return

			file_name = "cache/"+seq_online+"."+self.file_type+""
			with open(file_name, "w") as cache_file:
				cache_file.write(result)
			align = AlignIO.read(file_name, self.file_type)
			self.second_seq = str(align[0].seq)
Example #5
0
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    """Concatenate, per strain, the sequences found in ten FASTA alignments.

    Every record of the first alignment defines a strain (via
    ``util.get_strain_name``). For that strain the matching record is looked
    up in each of the other nine alignments with
    ``util.get_matching_sequence``. Only strains with a match in all nine
    are kept.

    Returns a list of ``[strain_name, concatenated_seq]`` pairs, where the
    sequence is the file1..file10 sequences joined in that order.
    """
    paths = (file1, file2, file3, file4, file5,
             file6, file7, file8, file9, file10)
    alignments = [AlignIO.read(path, 'fasta') for path in paths]
    first_alignment = alignments[0]
    other_alignments = alignments[1:]

    complete_sequences = []

    for first_record in first_alignment:
        strain_name = util.get_strain_name(first_record)
        matches = [util.get_matching_sequence(alignment, strain_name=strain_name)
                   for alignment in other_alignments]

        # Skip strains that are missing from any of the other alignments.
        if not all(matches):
            continue

        combined = first_record.seq
        for match in matches:
            combined = combined + match.seq
        complete_sequences.append([strain_name, combined])

    return complete_sequences
Example #6
0
def main():
    """
    Plot the per-position stereo scores of two alignments.

    Reads the FASTA alignments named by ``args.viral`` and ``args.gta``,
    scores each with ``stereo_score`` and draws the viral scores as upward
    bars and the GTA scores as downward red bars in one figure, which is
    saved to ``args.output``.
    """
    parser = cmdline_parser()
    args = parser.parse_args()

    #read in the alignments, closing each file once it has been parsed
    with open(args.viral, 'r') as viral_handle:
        aln_viral = AlignIO.read(viral_handle, "fasta")
    with open(args.gta, 'r') as gta_handle:
        aln_gta = AlignIO.read(gta_handle, "fasta")
    aln_viral_stereo = stereo_score(aln_viral)
    aln_gta_stereo = stereo_score(aln_gta)

    #convert to (position, score) tuples
    viral_stereo_posi = list(enumerate(aln_viral_stereo))
    # the GTA series was previously built from the viral scores by mistake
    gta_stereo_posi = list(enumerate(aln_gta_stereo))

    #convert to pandas dataframe
    viral_stereo_posi_df = pd.DataFrame.from_records(viral_stereo_posi)
    gta_stereo_posi_df = pd.DataFrame.from_records(gta_stereo_posi)

    #add column headers
    viral_stereo_posi_df.columns = ['position', 'score']
    gta_stereo_posi_df.columns = ['position', 'score']

    #plot the figures; GTA scores are negated so they hang below the axis
    fig = plt.figure(figsize=(30, 10))
    plt.bar(viral_stereo_posi_df['position'], viral_stereo_posi_df['score'])
    plt.bar(gta_stereo_posi_df['position'], -gta_stereo_posi_df['score'], color='r')
    plt.axis([0, 700, -1.2, 1.2])
    # save to the path given on the command line, not the literal 'args.output'
    fig.savefig(args.output)
Example #7
0
def load_tree(seqfname):
    """Read or compute an alignment, infer a tree from it and tidy the tree.

    A ``.aln`` file is parsed as Clustal; a ``.fasta`` file is first aligned
    by running ``mafft``. The alignment is reduced with ``alnutils.blocks``,
    written to a temporary FASTA file and passed to ``fasttree``. In the
    resulting tree, clades whose confidence is below the mean confidence are
    collapsed, the tree is ladderized and the root branch length is set to
    0.0.

    Raises ValueError for any other file extension.
    """
    if seqfname.endswith('.aln'):
        alignment = AlignIO.read(seqfname, 'clustal')
    elif seqfname.endswith('.fasta'):
        # Unaligned input: let MAFFT pick its own strategy
        mafft_cmd = ['mafft', '--quiet', '--auto', seqfname]
        mafft_output = subprocess.check_output(mafft_cmd)
        alignment = AlignIO.read(StringIO(mafft_output), 'fasta')
    else:
        raise ValueError("Input sequences must be a Clustal alignment (.aln) "
                         "or unaligned FASTA (.fasta)")

    # Keep only the conserved (less-gappy) blocks for tree building
    alignment = alnutils.blocks(alignment, 0.4)
    with tempfile.NamedTemporaryFile(mode='w') as block_file:
        AlignIO.write(alignment, block_file, 'fasta')
        block_file.flush()
        fasttree_cmd = ['fasttree', '-pseudo', '-gamma', '-wag',
                        block_file.name]
        newick_text = subprocess.check_output(fasttree_cmd)
    tree = Phylo.read(StringIO(newick_text), 'newick')

    # The mean of the available confidences is the collapse threshold
    # ENH: accept min_confidence as an option
    confidences = []
    for clade in tree.find_clades():
        if clade.confidence is not None:
            confidences.append(clade.confidence)
    min_confidence = math.fsum(confidences) / len(confidences)

    def is_weak(clade):
        return clade.confidence < min_confidence

    tree.collapse_all(is_weak)
    tree.ladderize(reverse=True)
    tree.root.branch_length = 0.0
    return tree
Example #8
0
def design_primers(source_dir, target_dir, settings, logfile):
    """Copy readable alignments to target_dir and run PriFi on each of them.

    target_dir is purged first. Every ``*.fasta`` file in source_dir that can
    be read as a FASTA alignment is copied into target_dir; files that fail
    are reported to logfile and skipped. Each copied alignment is then passed
    to ``prifipy.findprimers`` and any primer pairs found are written with
    ``prifipy.writePrimersToFiles``. All progress goes to logfile.
    """
    print("\nDesigning primers using PriFi...\n", file=logfile)
    # start from an empty target directory
    utils.purge_dir(target_dir)
    candidates = glob(os.path.join(source_dir, '*.fasta'))
    print("\tChecking for empty alignments...", file=logfile)
    for source_path in candidates:
        try:
            # reading is only a validity check; the parsed result is discarded
            AlignIO.read(source_path, 'fasta')
            destination = os.path.join(target_dir, os.path.basename(source_path))
            shutil.copyfile(source_path, destination)
        except Exception:
            print("[WARNING] Empty alignment file?! (%s)" % source_path, file=logfile)

    # call PriFi for actual primer design
    for aln_path in glob(os.path.join(target_dir, '*.fasta')):
        alignment = AlignIO.read(aln_path, 'fasta')
        summary = AlignInfo.SummaryInfo(alignment)
        aln_length = alignment.get_alignment_length()
        primerpairs = prifipy.findprimers(0, list(alignment), summary, aln_length, settings, logfile)
        if primerpairs:
            print('%s: Found %d primer pair suggestions. Writing primer files:' % (aln_path, len(primerpairs)), file=logfile)
            prifipy.writePrimersToFiles(aln_path, primerpairs, 1, logfile)
        else:
            print("%s: No valid primer pair found" % aln_path, file=logfile)
Example #9
0
 def conversion(self, prank_number, prank_ext, format):
     """Get PRANK to do a conversion, and check it with AlignIO.

     Runs PRANK in convert mode on self.input, expecting the output file
     ``<self.output>.<prank_ext>``, then checks that the converted file,
     parsed as ``format``, holds the same record ids and sequences as the
     FASTA input. The output file is removed afterwards.
     """
     filename = "%s.%s" % (self.output, prank_ext)
     if os.path.isfile(filename):
         os.remove(filename)
     cmdline = PrankCommandline(prank_exe, d=self.input, convert=True, f=prank_number, o='"%s"' % self.output)
     self.assertEqual(
         str(cmdline),
         prank_exe + " -d=%s" % self.input + ' -o="%s"' % self.output + " -f=%i" % prank_number + " -convert",
     )
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     child = subprocess.Popen(
         str(cmdline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform != "win32")
     )
     # communicate() drains both pipes while waiting; wait() followed by
     # read() can deadlock once the child fills a pipe buffer.
     stdout_data, stderr_data = child.communicate()
     self.assertEqual(child.returncode, 0)
     message = stdout_data.strip()
     self.assertTrue(("PRANK: converting '%s' to '%s'" % (self.input, filename)) in message, message)
     self.assertEqual(stderr_data, "")
     self.assertTrue(os.path.isfile(filename))
     with open(self.input) as handle:
         old = AlignIO.read(handle, "fasta")
     # Hack: the ids are cut to 10 characters before comparing phylip output
     if format == "phylip":
         for record in old:
             record.id = record.id[:10]
     with open(filename) as handle:
         new = AlignIO.read(handle, format)
     self.assertEqual(len(old), len(new))
     for old_r, new_r in zip(old, new):
         self.assertEqual(old_r.id, new_r.id)
         self.assertEqual(str(old_r.seq), str(new_r.seq))
     os.remove(filename)
Example #10
0
 def conversion(self, prank_number, prank_ext, format) :
     """Get PRANK to do a conversion, and check it with AlignIO.

     Runs PRANK in convert mode on self.input through
     ``Application.generic_run``, expecting the output file
     ``<self.output>.<prank_ext>``, then checks that the converted file,
     parsed as ``format``, holds the same record ids and sequences as the
     FASTA input. The output file is removed afterwards.
     """
     filename = "%s.%s" % (self.output, prank_ext)
     if os.path.isfile(filename) :
         os.remove(filename)
     cmdline = PrankCommandline(prank_exe, d=self.input,
                                convert=True, f=prank_number,
                                o='"%s"' % self.output)
     self.assertEqual(str(cmdline), prank_exe \
                      + ' -d=%s' % self.input \
                      + ' -o="%s"' % self.output \
                      + ' -f=%i' % prank_number \
                      + ' -convert')
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     message = stdout.read().strip()
     # assertTrue replaces the assert_ alias, which newer unittest drops
     self.assertTrue(("PRANK: converting '%s' to '%s'" % (self.input, filename)) \
                     in message, message)
     self.assertEqual(stderr.read(), "")
     self.assertEqual(str(result._cl), str(cmdline))
     self.assertTrue(os.path.isfile(filename))
     with open(self.input) as handle :
         old = AlignIO.read(handle, "fasta")
     #Hack: the ids are cut to 10 characters before comparing phylip output
     if format=="phylip" :
         for record in old :
             record.id = record.id[:10]
     with open(filename) as handle :
         new = AlignIO.read(handle, format)
     # a bare assert would be stripped under -O and gives no failure message
     self.assertEqual(len(old), len(new))
     for old_r, new_r in zip(old, new) :
         self.assertEqual(old_r.id, new_r.id)
         self.assertEqual(str(old_r.seq), str(new_r.seq))
     os.remove(filename)
Example #11
0
def multiple_alignment(fasta_dict, alignment_type=SeqTypeData().TYPE_DEFAULT):
    """Align the sequences of fasta_dict with an external aligner.

    The sequences are written as FASTA to the stdin of the command selected
    by ``SeqTypeData().type2cmd[alignment_type]``, and its stdout is parsed
    as a Clustal alignment.

    Returns a deep copy of fasta_dict in which every aligned record id has
    been set (via ``.set``) to its aligned sequence string. fasta_dict itself
    is not modified.
    """
    in_handle = StringIO()
    fasta_tools.write_fasta_handle(in_handle, fasta_dict)

    muscle_cmd = SeqTypeData().type2cmd[alignment_type]
    # Popen either returns a (always truthy) process object or raises, so no
    # "process was not created" check is needed here.
    child = subprocess.Popen(str(muscle_cmd), stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"))

    # communicate() feeds stdin and drains stdout/stderr concurrently.
    # Writing all input first and reading afterwards, with stderr never read,
    # can deadlock once any pipe buffer fills up.
    fasta_text = in_handle.getvalue()
    if sys.version_info[0] == 3:
        raw_output, _ = child.communicate(bytes(fasta_text, 'utf-8'))
        clustal_text = raw_output.decode()
    else:
        clustal_text, _ = child.communicate(fasta_text)
    align = AlignIO.read(StringIO(clustal_text), "clustal")

    fd = copy.deepcopy(fasta_dict)
    for a in align:
        fd.set(a.id, str(a.seq))

    return fd
Example #12
0
 def conversion(self, prank_number, prank_ext, format):
     """Have PRANK convert self.input and verify the result with AlignIO.

     The expected output file is ``<self.output>.<prank_ext>``. After the
     run, the converted file is parsed as ``format`` and compared record by
     record (id and sequence) with the FASTA input, then removed.
     """
     filename = "%s.%s" % (self.output, prank_ext)
     if os.path.isfile(filename):
         os.remove(filename)
     cmdline = PrankCommandline(prank_exe, d=self.input,
                                convert=True, f=prank_number,
                                o='"%s"' % self.output)
     actual_cmd = str(cmdline)
     expected_cmd = (_escape_filename(prank_exe)
                     + ' -d=%s' % self.input
                     + ' -o="%s"' % self.output
                     + ' -f=%i' % prank_number
                     + ' -convert')
     self.assertEqual(actual_cmd, expected_cmd)
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))

     message, error = cmdline()
     self.assertTrue("PRANK" in message, message)
     expected_note = "converting '%s' to '%s'" % (self.input, filename)
     self.assertTrue(expected_note in message, message)
     self.assertEqual(error, "")
     self.assertTrue(os.path.isfile(filename))

     original = AlignIO.read(self.input, "fasta")
     # Hack: the ids are cut to 10 characters before comparing phylip output
     if format == "phylip":
         for rec in original:
             rec.id = rec.id[:10]
     converted = AlignIO.read(filename, format)
     self.assertEqual(len(original), len(converted))
     for before, after in zip(original, converted):
         self.assertEqual(before.id, after.id)
         self.assertEqual(str(before.seq), str(after.seq))
     os.remove(filename)
Example #13
0
 def setUp(self):
     """Build one codon alignment per test fixture and keep them in self.alns.

     Each fixture is indexed as ``fixture[0]`` (file paths) and
     ``fixture[1]`` (mode). Mode 'parse' reads the nucleotides with
     SeqIO.parse, 'index' with SeqIO.index (and max_score=20), and 'id'
     with SeqIO.parse plus an id-correspondence table read from the third
     path. Warnings raised while building are ignored.
     """
     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     built = []
     for fixture in self.aln_file:
         mode = fixture[1]
         if mode == 'parse':
             nucl = SeqIO.parse(fixture[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet)
         elif mode == 'index':
             nucl = SeqIO.index(fixture[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20)
         elif mode == 'id':
             nucl = SeqIO.parse(fixture[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             # first two whitespace-separated fields of each line form a pair
             with open(fixture[0][2]) as handle:
                 corr = dict((line.split()[0], line.split()[1]) for line in handle)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, corr_dict=corr, alphabet=codonalign.default_codon_alphabet)
         built.append(caln)
         nucl.close()  # Close the indexed FASTA file
     self.alns = built
Example #14
0
File: ks.py Project: ascendo/jcvi
def muscle_align_protein(recs, work_dir, outfmt="fasta", inputorder=True):
    """
    Align given proteins with muscle.
    recs are iterable of Biopython SeqIO objects

    The records are written to ``prot-start.fasta`` in work_dir and aligned
    into ``prot.aln`` (strict Clustal output). With inputorder=True the
    output is passed through ``muscle_inputorder`` and re-read as FASTA
    (presumably that helper rewrites the file in FASTA format -- confirm
    against its definition). If that helper raises ValueError, "" is
    returned.

    Returns the alignment formatted as outfmt for "fasta" or "clustal", and
    None for any other outfmt.
    """
    fasta_file = op.join(work_dir, "prot-start.fasta")
    align_file = op.join(work_dir, "prot.aln")
    # close (and thereby flush) the input before muscle reads it
    with open(fasta_file, "w") as fasta_handle:
        SeqIO.write(recs, fasta_handle, "fasta")

    muscle_cl = MuscleCommandline(cmd=MUSCLE_BIN("muscle"),
            input=fasta_file, out=align_file, seqtype="protein",
            clwstrict=True)
    muscle_cl()
    alignment = AlignIO.read(muscle_cl.out, "clustal")

    if inputorder:
        try:
            muscle_inputorder(muscle_cl.input, muscle_cl.out)
        except ValueError:
            return ""
        alignment = AlignIO.read(muscle_cl.out, "fasta")

    # works on both Python 2 and 3, unlike the print >>sys.stderr statement
    sys.stderr.write("\tDoing muscle alignment: %s\n" % muscle_cl)
    if outfmt in ("fasta", "clustal"):
        return alignment.format(outfmt)
Example #15
0
def _input(a):

    """Read an alignment file into a character matrix.

    The parser is chosen from the file extension: ``.phylip`` is read as
    strict PHYLIP first and, if that raises ``ValueError``, as
    ``phylip-relaxed``; ``.fasta`` is read as FASTA. The result is
    ``np.array`` applied to one list of single characters per record.

    Arguments:
    -a- alignment file

    Raises:
    Exception -- if the extension is neither ``.phylip`` nor ``.fasta``.
    Any other error raised while reading the file propagates to the caller.

    Example:
    >>> import os
    >>> import numpy as np
    >>> from Bio import AlignIO
    >>> matrix = _input("example.fasta")"""

    fileExtension = os.path.splitext(a)[1]

    if fileExtension == ".phylip":
        try:
            records = AlignIO.read(a, "phylip")
        except ValueError:
            # Strict parsing failed; retry with the relaxed PHYLIP dialect.
            # Other errors are no longer swallowed: previously they left the
            # record list unbound and surfaced later as an unrelated error.
            records = AlignIO.read(a, "phylip-relaxed")
    elif fileExtension == ".fasta":
        records = AlignIO.read(a, "fasta")
    else:
        raise Exception("Wrong format. Choose accepted format.")

    rows = [list(str(record.seq)) for record in records]
    return np.array(rows)
Example #16
0
def main(argv):
    """Convert an alignment file from one format to another.

    Options (see the usage string): -i/--infile, -x/--informat,
    -o/--outfile, -f/--outformat, -h for help. infile and outformat are
    required. A missing informat is guessed with ``guess_format``. A missing
    outfile is derived from infile by swapping or appending the extension
    for outformat.

    The names 'pipe', 'stdin', 'STDIN' and '|' select standard input, and
    'pipe', 'stdout', 'STDOUT', '|' and '>' select standard output. The
    'phylip' and 'nexus' output formats use this module's own writers;
    everything else goes through ``AlignIO.convert``.

    Exits through ``sys.exit`` on usage errors and when a phylip conversion
    yields an empty alignment.
    """
    usage = 'ConvertAln -i <infile> -x <informat> -o <outfile> -f <outformat>'
    stdin_aliases = ('pipe', 'stdin', 'STDIN', '|')
    stdout_aliases = ('pipe', 'stdout', 'STDOUT', '|', '>')
    infile = ''
    informat = ''
    outfile = ''
    outformat = ''
    try:
        opts, args = getopt.getopt(argv, "hi:x:o:f:", ["infile=", "informat=", "outfile=", "outformat="])
    except getopt.GetoptError:
        sys.exit(usage)
    for opt, arg in opts:
        if opt == '-h':
            # function-call form prints the same text on Python 2 and 3
            print(usage)
            sys.exit()
        elif opt in ("-i", "--infile"):
            infile = arg
        elif opt in ("-x", "--informat"):
            informat = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-f", "--outformat"):
            outformat = arg
    if not infile:
        sys.exit("must specify infile! %s" % usage)
    if not outformat:
        sys.exit("must specify format to convert to! %s" % usage)

    if not informat:
        informat = guess_format(infile)

    if not outfile:
        if '.' in infile:
            outfile = '.'.join((infile.split('.')[:-1] + [get_extension(outformat)]))
        else:
            outfile = '.'.join((infile, get_extension(outformat)))
    if infile in stdin_aliases:
        infile = sys.stdin
    if outformat == 'phylip':
        alignment = AlignIO.read(infile, informat, alphabet=IUPAC.ambiguous_dna)
        alignment = remove_blank(alignment)
        if len(alignment) == 0 or len(alignment[0]) == 0:
            sys.exit()
        if outfile in stdout_aliases:
            write_phylip(alignment, sys.stdout)
        else:
            # the with-block closes the file even if the writer raises
            with open(outfile, 'w') as out_fh:
                write_phylip(alignment, out_fh)

    else:
        if outfile in stdout_aliases:
            outfile = sys.stdout
        if outformat == 'nexus':
            alignment = AlignIO.read(infile, informat, alphabet=IUPAC.ambiguous_dna)
            write_nexus(alignment, outfile)
        else:
            AlignIO.convert(infile, informat, outfile, outformat, alphabet=IUPAC.ambiguous_dna)
Example #17
0
    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Works inside self.run_dir; the previous working directory is restored
        and self.run_dir is removed when the method ends, including on the
        unsupported-tool return and on errors.

        Sets self.aln, self.sequence_lookup and self.reference_aligned.
        Returns None.

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        - prune: if True, sequences whose translation has a stop codon before
          the last position are left out of the amino-acid alignment
        - verbose: if true, report each discarded sequence
        '''
        from Bio import AlignIO,SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        # Remember where we came from rather than assuming '..' leads back.
        start_dir = os.getcwd()
        os.chdir(self.run_dir)
        try:
            # translate
            aa_seqs = {}
            bad_seq = 0
            for seq in self.seqs.values():
                tempseq = seq.seq.translate()
                has_premature_stop = '*' in str(tempseq)[:-1]
                # use only sequences that translate with out trouble
                if not has_premature_stop or prune==False:
                    aa_seqs[seq.id]=SeqRecord(tempseq,id=seq.id)
                    aa_seqs[seq.id].attributes = seq.attributes
                else:
                    if verbose: print(seq.id,"has premature stops, discarding")
                bad_seq += has_premature_stop

            print('Number of sequences with stops:',bad_seq,'out of total',len(self.seqs))
            tmpfname = 'temp_in.fasta'
            SeqIO.write(aa_seqs.values(), tmpfname,'fasta')

            if alignment_tool=='muscle':
                from Bio.Align.Applications import MuscleCommandline
                cline = MuscleCommandline(input=tmpfname, out=tmpfname[:-5]+'aligned.fasta')
                cline()
                aln_aa = AlignIO.read(tmpfname[:-5]+'aligned.fasta', "fasta")
            elif alignment_tool=='mafft':
                from Bio.Align.Applications import MafftCommandline
                # the StringIO module only exists on Python 2
                try:
                    from StringIO import StringIO
                except ImportError:
                    from io import StringIO
                mafft_cline = MafftCommandline(input=tmpfname)
                stdout, stderr = mafft_cline()
                aln_aa = AlignIO.read(StringIO(stdout), "fasta")
            else:
                print('Alignment tool not supported:',alignment_tool)
                return

            #generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
            self.sequence_lookup = {seq.id:seq for seq in self.aln}
            self.reference_aligned = self.sequence_lookup[self.reference.id]
            # add attributes to alignment
            for seq in self.seqs.values():
                if seq.id in self.sequence_lookup:
                    self.sequence_lookup[seq.id].attributes = seq.attributes
        finally:
            os.chdir(start_dir)
            remove_dir(self.run_dir)
Example #18
0
 def read_alignment(self, *args, **kwargs):
     """Read one alignment and store it on the instance.

     The first positional argument is the file name; all remaining
     positional and keyword arguments are forwarded to ``AlignIO.read``.
     The file is opened with ``fileIO.freader`` and, when ISPY3 is true,
     wrapped in a text wrapper first. Sets self.infile and self._msa (the
     alignment after ``self._guess_alphabet``).
     """
     filename, reader_args = args[0], args[1:]
     with fileIO.freader(filename) as raw:
         source = io.TextIOWrapper(raw) if ISPY3 else raw
         msa = AlignIO.read(source, *reader_args, **kwargs)
     self.infile = filename
     # guess alphabet
     self._msa = self._guess_alphabet(msa)
Example #19
0
def main():
    """Convert three NEXUS alignments to relaxed PHYLIP.

    The first three ``*.nexus`` files returned by ``glob`` for
    ``args.indirectory`` are written to ``args.outputfile1``,
    ``args.outputfile2`` and ``args.outputfile3`` respectively. Raises
    IndexError if fewer than three NEXUS files are found.

    NOTE(review): glob does not guarantee any ordering, so which input ends
    up in which output depends on the file system -- confirm whether the
    inputs should be sorted.
    """
    args = get_parser()
    nexus_files = glob.glob(os.path.join(args.indirectory, "*.nexus"))
    output_paths = (args.outputfile1, args.outputfile2, args.outputfile3)
    for index, output_path in enumerate(output_paths):
        # both files are closed as soon as this conversion is done
        with open(output_path, "w") as outfile:
            with open(nexus_files[index], "r") as infile:
                aln = AlignIO.read(infile, 'nexus')
            AlignIO.write(aln, outfile, 'phylip-relaxed')
def main():
    """Summarise per-column variability across a set of NEXUS alignments.

    Every alignment is centred on the longest one, and for each column the
    number of distinct characters is recorded under its position relative
    to the centre. For every position that has data, one CSV line is
    written to ``options.output`` (or stdout when that is unset): position,
    mean and 1.96*std of the counts, the fraction of alignments with <= 1,
    > 1 and >= 4 distinct characters, and the number of alignments
    contributing.
    """
    options, args = interface()
    # iterate through all the files to determine the longest alignment
    files = get_files(options.input)
    align_lengths = [AlignIO.read(f, 'nexus').get_alignment_length() \
                            for f in files]
    max_align_length = max(align_lengths)
    # find the middle
    middle = int(round(max_align_length/2, 0))
    # create a dict to hold the results by position in longest array
    differences = dict((d,np.array([])) for d in range(-middle, middle + 1))
    # iterate through all the files again
    for f in files:
        align = AlignIO.read(f, 'nexus')
        align_length = align.get_alignment_length()
        # determine relative start of this alignment to longest
        align_diff = int((round((max_align_length - align_length)/2.,0) - middle))
        for col in range(align_length):
            bases = align.get_column(col)
            b_counts = len(set(bases))
            differences[align_diff + col] = np.append(differences[align_diff + col],b_counts)
    if options.output:
        outp = open(options.output, 'w')
    else:
        outp = sys.stdout
    try:
        outp.write('bp, mean, ci, onediff, greaterthanonediff, fourdiff, count\n')
        for p in sorted(differences):
            counts = differences[p]
            total = len(counts)
            # positions no alignment reaches have nothing to report
            if total == 0:
                continue
            one_diff = sum(counts <= 1)/float(total)
            greater_than_one_diff = sum(counts > 1)/float(total)
            four_diff = sum(counts >= 4)/float(total)
            # values are written in the same order as the header columns;
            # greaterthanonediff and fourdiff used to be swapped
            outp.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(p, np.mean(counts), 1.96 * np.std(counts), one_diff, greater_than_one_diff, four_diff, total))
    finally:
        # close the file we opened ourselves, but never stdout
        if options.output:
            outp.close()
    def standard_test_procedure(self, cline):
        """Run a Clustal Omega command line and sanity-check its output.

        Forces overwriting, registers the output (and guide tree, if any)
        for cleanup, runs the tool, and checks that the Clustal output has
        as many record ids as the FASTA input and parses identically
        through AlignIO and SeqIO.
        """

        # Overwrite existing files.
        cline.force = True

        # Mark output files for later cleanup.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
        self.assertEqual(str(eval(repr(cline))), str(cline))
        output, error = cline()
        self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

        # Test if ClustalOmega executed successfully.
        tolerated_warnings = ("WARNING: Sequence type is DNA.",
                              "WARNING: DNA alignment is still experimental.")
        self.assertTrue(error.strip() == "" or
                        error.startswith(tolerated_warnings))

        # Check the output...
        alignment = AlignIO.read(cline.outfile, "clustal")
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        input_ids = set(input_records.keys())
        output_ids = set(output_records.keys())
        self.assertEqual(len(input_ids), len(output_ids))
        for rec in alignment:
            self.assertEqual(str(rec.seq), str(output_records[rec.id].seq))

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))
Example #22
0
 def __init__(self, file_name=None, data = None, format='fasta'):
     """Create the alignment from a file, from in-memory text, or empty.

     file_name -- path of an alignment file to read (takes precedence)
     data      -- alignment text, used when no file_name is given
     format    -- AlignIO format name for either source

     With neither source the alignment is initialised from an empty list.
     """
     if file_name:
         records = AlignIO.read(file_name, format)
     elif data:
         # AlignIO.parse() yields whole alignments rather than the records
         # of one alignment, so it handed the base class a different kind of
         # object than the file branch does; read() keeps both consistent.
         # NOTE(review): assumes the base class expects the records of a
         # single alignment -- confirm against its definition.
         records = AlignIO.read(StringIO(data), format)
     else:
         records = []
     super(Alignment, self).__init__(records)
Example #23
0
 def test_needle_file(self):
     """needle with the asis trick, output to a file."""
     # Build the command line one parameter at a time
     cline = NeedleCommandline(cmd=exes["needle"])
     parameters = (
         ("-asequence", "asis:ACCCGGGCGCGGT"),
         ("-bsequence", "asis:ACCCGAGCGCGGT"),
         ("-gapopen", "10"),
         ("-gapextend", "0.5"),
         # EMBOSS would guess this, but let's be explicit:
         ("-snucleotide", "True"),
         ("-outfile", "Emboss/temp with space.needle"),
     )
     for option, value in parameters:
         cline.set_parameter(option, value)
     self.assertEqual(str(eval(repr(cline))), str(cline))
     # Run needle
     stdout, stderr = cline()
     # The banner goes to stderr and nothing to stdout
     self.assertTrue(stderr.strip().startswith("Needleman-Wunsch global alignment"), stderr)
     self.assertEqual(stdout.strip(), "")
     filename = cline.outfile
     self.assertTrue(os.path.isfile(filename),
                     "Missing output file %r from:\n%s" % (filename, cline))
     # The output must parse as a two-sequence EMBOSS alignment
     alignment = AlignIO.read(filename, "emboss")
     self.assertEqual(len(alignment), 2)
     first, second = alignment[0], alignment[1]
     self.assertEqual(str(first.seq), "ACCCGGGCGCGGT")
     self.assertEqual(str(second.seq), "ACCCGAGCGCGGT")
     # Remove the output file
     os.remove(filename)
Example #24
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 and store it in self.aln.

     The nucleotide FASTA, protein Clustal alignment and id-correspondence
     table are the first, second and third paths of the fixture.
     """
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     # first two whitespace-separated fields of each line form a pair
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             fields = line.split()
             id_corr[fields[0]] = fields[1]
     self.aln = codonalign.build(prot, nucl, corr_dict=id_corr, alphabet=codonalign.default_codon_alphabet)
    def standard_test_procedure(self, cline):
        """Run a ClustalW command line and sanity-check its output.

        Registers the alignment and guide-tree files for cleanup, runs the
        tool, and checks that the Clustal output has the same record ids
        as the FASTA input (with ':' replaced by '_'), that AlignIO and
        SeqIO agree on it, that removing gaps gives back the input
        sequences, and that the tree file exists.
        """
        self.assertTrue(str(eval(repr(cline))) == str(cline))
        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"),
                                      lambda rec : rec.id.replace(":", "_"))

        #Without an explicit tree name, Clustalw derives it from the input
        #file name
        tree_file = (cline.newtree if cline.newtree
                     else os.path.splitext(cline.infile)[0] + ".dnd")

        # Mark generated files for later removal
        for generated in (cline.outfile, tree_file):
            self.add_file_to_clean(generated)

        output, error = cline()
        self.assertTrue(output.strip().startswith("CLUSTAL"))
        self.assertTrue(error.strip() == "")

        #Check the output...
        alignment = AlignIO.read(cline.outfile, "clustal")
        #The length of the alignment will depend on the version of clustalw
        #(clustalw 2.1 and clustalw 1.83 are certainly different).
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile,"clustal"))
        self.assertTrue(set(input_records.keys()) == set(output_records.keys()))
        for rec in alignment:
            aligned_text = str(rec.seq)
            self.assertTrue(aligned_text == str(output_records[rec.id].seq))
            self.assertTrue(aligned_text.replace("-", "") ==
                   str(input_records[rec.id].seq))

        #Check the DND file was created.
        #TODO - Try and parse this with Bio.Nexus?
        self.assertTrue(os.path.isfile(tree_file))
Example #26
0
 def align(cls, seq_records, outfile=None):
     '''Align the given sequences with MAFFT.
     @param seq_records: a list of SeqRecords objects
     @param outfile: a filename for the output alignment or None
     @return: if the outfile is none, return an AlignmentExt object;
     otherwise return True on success. In both cases return None on error.
     Fewer than 10000 records use the 'auto' option, larger inputs use
     'parttree' with a partsize of 1000. Temporary files are unlinked
     before returning.'''
     # without a caller-supplied outfile the alignment goes to a temp file
     remove_out = not outfile
     if remove_out:
         outfile = mktmp_name('.aln.fasta')
     msafile = mktmp_fasta(seq_records)
     args = dict(thread=cpu_count, input=msafile)
     if len(seq_records) < 10000:
         args['auto'] = True
     else:
         args.update(parttree=True, partsize=1000)
     ali = None
     if run_cline(MafftCommandline(**args), stdout=outfile):
         ali = AlignmentExt.from_msa(AlignIO.read(outfile, 'fasta')) if remove_out else True
     if remove_out:
         safe_unlink(outfile)
     safe_unlink(msafile)
     return ali
Example #27
0
    def __init__(self, fileName = None, msaBean = None, alphabet = "ACDEFGHIKLMNPQRSTVWY-", backtrack = None, jsBeanFile = None, id=None):
        """Build the alignment matrix from exactly one of three sources.

        The sources are tried in this order and the first one given wins:

        fileName   -- alignment file; '.aln' is read as Clustal, anything
                      else as FASTA. Residues are passed through aaCoherce.
        msaBean    -- object providing .matrix, .header and .backtrack
        jsBeanFile -- JSON file with 'matrix', 'headers' and 'backtrack'

        alphabet and backtrack are stored as given (backtrack is replaced
        by the bean's value for the two bean sources); id defaults to
        "id_default_msa".

        Raises initError when no source is given.
        """
        self.alphabet = alphabet
        self.backtrack = backtrack
        self.id = id if id else "id_default_msa"
        if fileName:
            fType = "clustal" if fileName.endswith(".aln") else "fasta"
            # close the file as soon as the alignment has been parsed
            with open(fileName) as handle:
                self.alignment = AlignIO.read(handle, fType)
            self.asMatrix = [[ aaCoherce(aa) for aa in list(record.seq) ] for record in self.alignment]

            self.headers = [record.id for record in self.alignment]
        elif msaBean:
            self.asMatrix = msaBean.matrix
            self.headers = msaBean.header
            self.backtrack = msaBean.backtrack
        elif jsBeanFile:
            with open(jsBeanFile) as json_file:
                json_data = json.load(json_file)
            self.asMatrix = json_data['matrix']
            self.headers = json_data['headers']
            self.backtrack = json_data['backtrack']

        else:
            raise initError("You must specify a bean or a mfasta file")

        self.nSeq = len(self.asMatrix)
        # an empty matrix has no first row to measure
        self.length = len(self.asMatrix[0]) if self.nSeq else 0
        self._frequency = None
Example #28
0
 def test_long(self):
     """Simple muscle call using long file.

     Converts the first 40 records of NBRF/Cw_prot.pir to a temporary
     FASTA file, aligns it with muscle (stable order, fast options, strict
     Clustal output, quiet) and checks that the alignment holds the same
     records in the same order.
     """
     #Create a large input file by converting some of another example file
     temp_large_fasta_file = "temp_cw_prot.fasta"
     # both handles are closed by their with-blocks; the PIR handle used to
     # be left open
     with open("NBRF/Cw_prot.pir", "rU") as pir_handle:
         records = list(SeqIO.parse(pir_handle, "pir"))[:40]
     with open(temp_large_fasta_file, "w") as handle:
         SeqIO.write(records, handle, "fasta")
     #Prepare the command...
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("in", temp_large_fasta_file)
     #Preserve input record order
     cmdline.set_parameter("stable", True) #Default None treated as False!
     #Use fast options
     cmdline.set_parameter("maxiters", 1)
     cmdline.set_parameter("diags", True) #Default None treated as False!
     #Use clustal output
     cmdline.set_parameter("clwstrict", True) #Default None treated as False!
     #Shouldn't need this, but just to make sure it is accepted
     cmdline.set_parameter("maxhours", 0.1)
     #No progress reports to stderr
     cmdline.set_parameter("quiet", True) #Default None treated as False!
     self.assertEqual(str(cmdline).rstrip(), muscle_exe + \
                      " -in temp_cw_prot.fasta -diags -maxhours 0.1" + \
                      " -maxiters 1 -clwstrict -stable -quiet")
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, out_handle, err_handle = generic_run(cmdline)
     align = AlignIO.read(out_handle, "clustal")
     self.assertEqual(len(records), len(align))
     for old, new in zip(records, align):
         self.assertEqual(old.id, new.id)
         self.assertEqual(str(new.seq).replace("-",""), str(old.seq))
     os.remove(temp_large_fasta_file)
     #See if quiet worked:
     self.assertEqual("", err_handle.read().strip())
Example #29
0
def generate_mask(refpkg, stockholm_alignment):
    """
    Generate an AlignmentMask from a reference package and stockholm alignment

    The 'mask' resource of ``refpkg`` is read as a comma-separated list of
    integer positions to keep.  Those positions index the consensus columns
    of the alignment only, so each consensus column is numbered 0, 1, 2, ...
    in order and non-consensus columns are never kept.

    Arguments:
    refpkg -- object providing ``open_resource('mask')`` as a context manager
    stockholm_alignment -- path to the Stockholm alignment file

    Returns an AlignmentMask built from one boolean per alignment column.

    Raises ValueError if the consensus annotation and the alignment differ
    in length.
    """
    with refpkg.open_resource('mask') as fp:
        unmasked_positions = set(int(i.strip())
                                 for i in fp.read().split(','))

    # Get length of alignment (parse from the handle that was just opened)
    with open(stockholm_alignment) as fp:
        align_length = len(AlignIO.read(fp, 'stockholm')[0])

    # Load consensus columns
    with open(stockholm_alignment) as fp:
        consensus_columns = _parse_stockholm_consensus(fp)

    if align_length != len(consensus_columns.mask):
        raise ValueError("Consensus Columns and Alignment have "
                "differing lengths")

    # next(counter) works on Python 2.6+ and 3; the bound .next method
    # only exists on Python 2.
    counter = itertools.count()
    consensus_column_indexes = (next(counter) if i else None
                                for i in consensus_columns.mask)
    # None (a non-consensus column) is never in the set, so it maps to False.
    consensus_mask = AlignmentMask([i in unmasked_positions for i
                                    in consensus_column_indexes])
    return consensus_mask
Example #30
0
 def test_needle_piped(self):
     """Run needle on two "asis:" sequences and parse its piped stdout."""
     needle_exe = exes["needle"]
     cline = NeedleCommandline(
         cmd=needle_exe,
         asequence="asis:ACCCGGGCGCGGT",
         bsequence="asis:ACCCGAGCGCGGT",
         gapopen=10,
         gapextend=0.5,
         auto=True,
         filter=True,
     )
     expected_cmd = (needle_exe + " -auto -filter"
                     + " -asequence=asis:ACCCGGGCGCGGT"
                     + " -bsequence=asis:ACCCGAGCGCGGT"
                     + " -gapopen=10 -gapextend=0.5")
     self.assertEqual(str(cline), expected_cmd)

     # Launch the tool with all three standard streams piped; the command
     # goes through a shell everywhere except on Windows.
     use_shell = sys.platform != "win32"
     child = subprocess.Popen(
         str(cline),
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         universal_newlines=True,
         shell=use_shell,
     )
     child.stdin.close()

     # The alignment is parsed straight from the child's stdout.
     align = AlignIO.read(child.stdout, "emboss")
     self.assertEqual(len(align), 2)
     for position, expected_seq in enumerate(("ACCCGGGCGCGGT",
                                              "ACCCGAGCGCGGT")):
         self.assertEqual(str(align[position].seq), expected_seq)

     # Nothing on stderr, and a clean exit status.
     self.assertEqual(child.stderr.read(), "")
     self.assertEqual(0, child.wait())
     for stream in (child.stdout, child.stderr):
         stream.close()
Example #31
0
    def test_aln_to_leaves(self):
        """Load the PR.B.100 tree and alignment; load_aln must return 0."""
        tree_path = resources_dir+'PR.B.100.nwk'
        aln_path = resources_dir+'PR.B.100.fasta'
        tree_anc = treeanc.TreeAnc.from_file(tree_path, 'newick')
        alignment = AlignIO.read(aln_path, 'fasta')

        status = tree_anc.load_aln(alignment)
        # a zero status means all sequences were set up successfully
        assert status == 0
Example #32
0
        with open("%s.fa" % fa_head, "w") as handle:
            handle.write(">%s\n%s\n>%s\n%s\n" %
                         (germline, germs[germline], entry.id, entry.seq))

        clustal_cline = ClustalwCommandline(cmd=clustalw,
                                            infile="%s.fa" % fa_head)
        try:
            stdout, stderr = clustal_cline()
        except:
            print("Error in alignment of %s (will skip): %s" %
                  (entry.id, stderr))
            for f in glob.glob("%s.*" % fa_head):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln" % fa_head, "clustal")
        shift = False

        for record in alignment:
            codons = re.sub(
                "---", "", str(record.seq.strip("-"))
            )  #don't care about leading/trailing and full-codon indels are fine
            if "-" in codons:
                shift = True  #likely frameshift --discard!

        if not shift:  #made it; save the sequence
            good += 1
            sequences.append(entry)

        for f in glob.glob("%s.*" % fa_head):
            os.remove(f)
Example #33
0
    aln = drbaln
    alnindex = dict([(a.id, a) for a in aln])
    compare_tepitope_alleles(alnindex)
    #d1 = compare(ref1, ref2, alnindex)
    #x = d1.merge(d2,right_index=1,left_index=1)
    #print len(x)
    #compare_ref(hla,bola,ref,alnindex)
    plt.show()
    return


# Module-level data: these statements run once, when the module is imported.
pocket_residues = get_pocket_positions()
librarypssms = get_pssms()
# DRB MHC alignment using IPD sequences, includes BoLA-DRB3 sequences.
# Read from the FASTA file named by drb_aln_file.
drbaln = AlignIO.read(drb_aln_file, "fasta")


def main():
    """Parse the command line and run test() when -t/--test is given."""
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-t",
                      "--test",
                      dest="test",
                      action='store_true',
                      help="test")
    # Positional arguments are accepted by the parser but not used.
    opts, _ = parser.parse_args()
    # opts.test is None unless the flag was supplied, then True.
    if opts.test:
        test()

usage = "%prog [options] < fasta_file > phylip file"
description = "Convert a fasta file to a phylip file"
parser = optparse.OptionParser(description=description, usage=usage)
parser.add_option(
    "-c",
    "--convfile",
    help="File to convert new IDs back to original IDs (D = don't save file)",
    action="store",
    type="str",
    dest="convfile",
    default=None)
(options, args) = parser.parse_args()

# Read the FASTA file from stdin and convert it into a phylip file
# Use list so we actually edit in-place rather than
# just editing a copy that gets destroyed later!
aln = list(AlignIO.read(sys.stdin, "fasta"))

# The conversion table is optional; fid stays None when it was not requested.
fid = open(options.convfile, "w") if options.convfile is not None else None

try:
    # Every record gets a fixed-width synthetic ID ("S" + 9 digits); the
    # table written to fid maps each new ID back to the ID in the fasta file.
    for i, record in enumerate(aln):
        newid = "S%09d" % (i)
        if fid is not None:
            fid.write("%s\t%s\n" % (newid, record.id))
        record.id = newid
finally:
    # Close the table even if renaming fails part-way through.
    if fid is not None:
        fid.close()

SeqIO.write(aln, sys.stdout, "phylip")
Example #35
0
from Bio import AlignIO
import sys
# Convert the Stockholm alignment named by the first command-line argument
# to FASTA on stdout.  The handle is closed as soon as parsing is done, and
# print() with a single argument behaves the same on Python 2 and 3.
with open(sys.argv[1]) as handle:
    al = AlignIO.read(handle, "stockholm")
print(al.format("fasta"))
    def Calculate_BaseEdit_freq(self, lQuery_seq=[]):
        """Align each read to the reference and tally indels and base edits.

        For every sequence in ``lQuery_seq`` the method locates ``sBarcode``
        in the read, aligns the read (from the barcode onwards) against
        ``sRef`` by running EMBOSS ``needle`` through ``/bin/bash``, and
        parses the output with ``AlignIO.read(..., "emboss")``.  Insertions
        and deletions found in the alignment are checked against the window
        given by ``lIndel_check_pos``.  Reads with no insertion, deletion or
        complex event in that window are then scanned for substitutions
        inside ``lTarget_window``.

        NOTE(review): ``sBarcode``, ``sRef``, ``sOG``, ``sOE``, ``sEG``,
        ``sEE``, ``lIndel_check_pos`` and ``lTarget_window`` are not defined
        in this method; presumably they come from an enclosing scope --
        confirm against the full file.

        Args:
            lQuery_seq: iterable of read sequences (strings).  The default
                list is only iterated here, never mutated.

        Returns:
            dict with the single key ``sBarcode`` mapping to an 11-item list:
            [0] barcode-matched read count, [1] insertion count,
            [2] deletion count, [3] complex (insertion + deletion) count,
            [4]-[7] lists that this method never fills,
            [8] per-read indel info, [9] per-read base-edit count tables,
            [10] records for reads with at least one base edit.
        """

        dRef = {}
        dResult = {}

        dRef[sBarcode] = (
            sRef)  # total matched reads, insertion, deletion, complex
        dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], [], [], []]

        # lRef   : [(ref_seq, ref_seq_after_barcode, barcode, barcode end pos, indel end pos, indel from barcode),(...)]
        # dResult = [# of total, # of ins, # of del, # of com, [total FASTQ], [ins FASTQ], [del FASTQ], [com FASTQ], info]
        iCount = 0

        for sQuery_seq_raw in lQuery_seq:

            iBarcode_matched = 0
            iNeedle_matched = 0
            iInsert_count = 0
            iDelete_count = 0
            iComplex_count = 0

            try:
                # Check the barcode pos and remove it.
                # str.index raises ValueError when the barcode is absent; the
                # handler at the end of this try block then skips the read.
                sQuery_seq_raw = sQuery_seq_raw.replace('\r', '')
                iBarcode_start_pos = sQuery_seq_raw.index(sBarcode)
                iBarcode_matched += 1

                sQuery_seq_with_barcode = sQuery_seq_raw[iBarcode_start_pos:]

                # _check = 0
                # if sQuery_seq_raw == 'TCTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCCAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA':
                #     _check = 1

                # Both sequences are handed to needle as bash process
                # substitutions that echo a two-line FASTA record.
                # NOTE(review): the read text is interpolated into a command
                # line executed with shell=True -- confirm inputs are trusted.
                sRef_seq = r'<(echo -e ">{name}\n{seq}")'.format(
                    name=sBarcode + '_ref', seq=sRef)
                sQuery_seq = r'<(echo -e ">{name}\n{seq}")'.format(
                    name=sBarcode + '_query', seq=sQuery_seq_with_barcode)

                sNeedle_cmd = r"/bin/bash -c 'needle -filter {0} {1} -outfile stdout -gapopen {2} -gapextend {3} -endweight Y -endopen {4} -endextend {5}'".format(
                    sRef_seq, sQuery_seq, sOG, sOE, sEG, sEE)

                Needle_result = sp.Popen(sNeedle_cmd,
                                         stdout=sp.PIPE,
                                         stderr=sp.PIPE,
                                         universal_newlines=True,
                                         shell=True)

                # First aligned row is the reference, second is the query
                # (the order in which they were passed to needle).
                lResult = [
                    Instance.seq._data for Instance in AlignIO.read(
                        Needle_result.stdout, "emboss")
                ]
                sRef_needle_ori = lResult[0]
                sQuery_needle_ori = lResult[1]

                # if _check == 1:
                #     print(sRef_needle_ori)
                #     print(sQuery_needle_ori)
                #     set_trace()

                Needle_result.stdout.close()

                # detach forward ---, backward ---
                # e.g.    ref   ------AAAGGCTACGATCTGCG------
                #         query AAAAAAAAATCGCTCTCGCTCTCCGATCT
                # trimmed ref         AAAGGCTACGATCTGCG
                # trimmed query       AAATCGCTCTCGCTCTC
                iReal_ref_needle_start = 0
                iReal_ref_needle_end = len(sRef_needle_ori)
                iRef_needle_len = len(sRef_needle_ori)

                for i, sRef_nucle in enumerate(sRef_needle_ori):
                    if sRef_nucle in ['A', 'C', 'G', 'T']:
                        iReal_ref_needle_start = i
                        break

                for i, sRef_nucle in enumerate(sRef_needle_ori[::-1]):
                    if sRef_nucle in ['A', 'C', 'G', 'T']:
                        iReal_ref_needle_end = iRef_needle_len - (i + 1)
                        # forward 0 1 2  len : 3
                        # reverse 2 1 0,  len - (2 + 1) = 0
                        break

                sRef_needle = sRef_needle_ori[
                    iReal_ref_needle_start:iReal_ref_needle_end + 1]
                # NOTE(review): the conditional assignment below is always
                # overwritten by the statement that follows it, and the query
                # is sliced from index 0 rather than from
                # iReal_ref_needle_start -- confirm this is intended.
                if iReal_ref_needle_start:
                    sQuery_needle = sQuery_needle_ori[:iReal_ref_needle_end]
                sQuery_needle = sQuery_needle_ori[:len(sRef_needle)]
                # detaching completion

                # indel info making.
                iNeedle_match_pos_ref = 0
                iNeedle_match_pos_query = 0
                iNeedle_insertion = 0
                iNeedle_deletion = 0

                lInsertion_in_read = [
                ]  # insertion result [[100, 1], [119, 13]]
                lDeletion_in_read = []  # deletion result  [[97, 1], [102, 3]]

                # print 'sRef_needle', sRef_needle
                # print 'sQuery_needle', sQuery_needle
                # Walk the alignment column by column.  A run of '-' in the
                # reference is an insertion and a run of '-' in the query is
                # a deletion; each run is recorded as [position, length] when
                # the next A/C/G/T is reached on that row.
                for i, (sRef_nucle, sQuery_nucle) in enumerate(
                        zip(sRef_needle, sQuery_needle)):

                    if sRef_nucle == '-':
                        iNeedle_insertion += 1

                    if sQuery_nucle == '-':
                        iNeedle_deletion += 1

                    if sRef_nucle in ['A', 'C', 'G', 'T']:
                        if iNeedle_insertion:
                            lInsertion_in_read.append(
                                [iNeedle_match_pos_ref, iNeedle_insertion])
                            iNeedle_insertion = 0
                        iNeedle_match_pos_ref += 1

                    if sQuery_nucle in ['A', 'C', 'G', 'T']:
                        if iNeedle_deletion:
                            lDeletion_in_read.append(
                                [iNeedle_match_pos_query, iNeedle_deletion])
                            iNeedle_match_pos_query += iNeedle_deletion
                            iNeedle_deletion = 0
                        iNeedle_match_pos_query += 1
                        # print 'sRef_needle', sRef_needle

                # print 'sQuery_needle', sQuery_needle
                # print 'lInsertion_in_read: onebase', lInsertion_in_read
                # print 'lDeletion_in_read: onebase', lDeletion_in_read
                # print 'i5bp_front_Indel_end', i5bp_front_Indel_end
                # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos

                lTarget_indel_result = []  # ['20M2I', '23M3D' ...]
                """
                ins case
                ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNN*NNNNNAGCTT
                """

                iCleavage_window_start = int(lIndel_check_pos[0])
                iCleavage_window_end = int(lIndel_check_pos[1]) - 1

                # An insertion counts when its position lies inside the window.
                for iMatch_pos, iInsertion_pos in lInsertion_in_read:
                    if iCleavage_window_start <= iMatch_pos <= iCleavage_window_end:  # iMatch_pos is one base
                        iInsert_count = 1
                        lTarget_indel_result.append(
                            str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I')
                """
                del case 1
                ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNAGCTT
                del case 2
                ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNNNCTT
                """
                for iMatch_pos, iDeletion_pos in lDeletion_in_read:
                    """
                    Insertion: 30M3I
                           ^
                    ACGT---ACGT
                    ACGTTTTACGT -> check this seq
                    Insertion just check two position

                    Deletion: 30M3D
                         ^
                    ACGTTTTACGT
                    ACGT---ACGT -> check this seq
                    But deletion has to includes overlap deletion.
                    """

                    # A deletion counts when the span
                    # [iMatch_pos, iMatch_pos + length] overlaps the window.
                    if iMatch_pos <= iCleavage_window_end and iCleavage_window_start <= (
                            iMatch_pos + iDeletion_pos):
                        iDelete_count = 1
                        lTarget_indel_result.append(
                            str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D')

                # A read with both an insertion and a deletion in the window
                # is counted once, as "complex".
                if iInsert_count == 1 and iDelete_count == 1:
                    iComplex_count = 1
                    iInsert_count = 0
                    iDelete_count = 0

                    # """ test set
                    # print 'sBarcode', sBarcode
                    # print 'sTarget_region', sTarget_region
                    # print 'sRef_seq_after_barcode', sRef_seq_after_barcode
                    # print 'sSeq_after_barcode', sQuery_seq
                    # print 'iIndel_start_from_barcode_pos', iIndel_start_from_barcode_pos
                    # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos
                    # """
                    """
                    23M3I
                    23M is included junk_seq after barcode,

                    barcorde  junk   targetseq   others
                    *********ACCCT-------------ACACACACC
                    so should select target region.
                    If junk seq is removed by target region seq index pos.
                    """

                ## 8: indel info
                dResult[sBarcode][8].append([
                    sRef, sQuery_seq_raw, lTarget_indel_result, "",
                    sRef_needle_ori, sQuery_needle_ori
                ])  ## "" -> target seq, but this is not used this project.

            # end: try
            # A ValueError anywhere in the try block (for example the barcode
            # lookup failing) skips the read entirely: the counters below are
            # not updated for it.
            except ValueError as e:
                # print(e)
                continue

            # total matched reads, insertion, deletion, complex
            dResult[sBarcode][0] += iBarcode_matched
            dResult[sBarcode][1] += iInsert_count
            dResult[sBarcode][2] += iDelete_count
            dResult[sBarcode][3] += iComplex_count

            ## base editing frequency
            """
                   BaseEditPos : 0                                                    1                                  2
            [OrderedDict([('A',0),('C',0),('G',0),('T',0)]), OrderedDict([('A',0),('C',0),('G',0),('T',0)]), ...

            and sum the counts each position
            """

            # Base edits are only tallied for reads without any indel event
            # in the cleavage window.
            if iInsert_count == 0 and iDelete_count == 0 and iComplex_count == 0:

                lBaseEdit = []
                iTarget_len = int(lTarget_window[1]) - int(
                    lTarget_window[0]) + 1

                for i in range(iTarget_len):
                    lBaseEdit.append(
                        OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]))

                # lTarget_window is treated as 1-based and inclusive: it is
                # converted to a 0-based half-open slice here.
                iTarget_start = int(lTarget_window[0]) - 1
                iTarget_end = int(lTarget_window[1])
                """
                                       cleavage window start
                                        ^
                [barcode]ACGACGTACGACGT[cleavage]
                [barcode]ACGACGTACGACGT[cleavage]
                """

                iBase_edit_event = 0

                for i, tRef_Query_base in enumerate(
                        zip(sRef_needle[iTarget_start:iTarget_end],
                            sQuery_needle[iTarget_start:iTarget_end])):
                    sRef_base = tRef_Query_base[0]
                    sQuery_base = tRef_Query_base[1]

                    # Gap columns are skipped.
                    if sRef_base == '-' or sQuery_base == '-': continue

                    # A mismatch is counted under the query base; 'N' in the
                    # query is ignored.
                    if sRef_base != sQuery_base and sQuery_base != 'N':
                        iBase_edit_event = 1
                        lBaseEdit[i][sQuery_base] += 1
                        # print(sQuery_needle)

                dResult[sBarcode][9].append(lBaseEdit)
                if iBase_edit_event == 1:
                    dResult[sBarcode][10].append([
                        sRef, sQuery_seq_raw, lTarget_indel_result,
                        [
                            list(orderedDict.values())
                            for orderedDict in lBaseEdit
                        ], sRef_needle_ori, sQuery_needle_ori
                    ])
                # dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], [], [BaseEdit_freq_data]]

            iBarcode_matched = 0
            iInsert_count = 0
            iDelete_count = 0
            iComplex_count = 0
            # end: for sBarcode, lCol_ref
        # end: for lCol_FASTQ
        return dResult
Example #37
0
    def __init__(self, root_dir, *args, **kwargs):
        """Set up a session from the files found in ``root_dir``.

        Reads the JSON configuration, initialises the session state,
        optionally builds the tree, then loads the newick tree, the fasta
        alignment and the metadata and passes them to the parent
        constructor.  Any failure is reported through ``_session_error``
        and then re-raised.

        Extra positional and keyword arguments are forwarded to the parent
        constructor unchanged.
        """
        # default configuration
        self._build_tree = False
        self._infer_gtr = True
        self._root = None
        self._mutation_rate = None
        self._relaxed_clock = None
        self._Tc = None

        # locations of the session files inside the session directory
        self._root_dir = root_dir
        self._nwk = os.path.join(root_dir, in_tree)
        self._aln = os.path.join(root_dir, in_aln)
        self._meta = os.path.join(root_dir, in_meta)
        self._cfg = os.path.join(root_dir, in_cfg)

        # the logger may be used before the parent constructor has run,
        # so the attributes it needs are assigned explicitly here
        self.verbose = 5
        self.t_start = time.time()
        self._log_file = os.path.join(root_dir, log_filename)

        # load the JSON configuration
        with open(self._cfg) as cfg_handle:
            config_dic = json.load(cfg_handle)

        # work out which steps to perform; this also writes the session
        # state that the web browser reads (session_state.json)
        self._init_session_state(config_dic)

        # build the tree first when the session asks for it
        if self._build_tree:
            try:
                self.logger("Building phylogenetic tree...", 1)
                self._advance_session_progress()
                status = build_tree(self._root_dir)
                if status != 0:
                    raise RuntimeError(
                        "Error in tree builder FastTree. The method returned: "
                        + str(status))
            except Exception as err:
                self._session_error(
                    "Error occurred when building phylogenetic tree."
                    " Exception description: " + str(err))
                raise

        try:
            tree = Phylo.read(self._nwk, 'newick')
            aln = AlignIO.read(self._aln, 'fasta')
            # dates and the remaining metadata come from the metadata file
            dates, metadata = self._read_metadata_from_file(self._meta)
            super(TreeTime, self).__init__(dates=dates,
                                           tree=tree,
                                           aln=aln,
                                           gtr=self._gtr,
                                           *args,
                                           **kwargs)
        except Exception as err:
            self._session_error("Error in TreeTime object initialization. "
                                " Exception description: " + str(err))
            raise

        self._metadata = metadata
Example #38
0
def simulate_missing(path_to_align, spp_info, prop_n):
    """Write a copy of an alignment restricted to the requested species.

    Each requested sequence is passed through ``add_n`` together with its
    proportion before being added to the new alignment.  The result is
    written in phylip format next to the input file, with ``_SimN``
    inserted before the extension.

    Args:
        path_to_align: path to the alignment; the format is chosen from the
            file extension (nex, nexus, phy, phylip, phylip-relaxed, fa,
            fasta).
        spp_info: comma-separated species (record) ids to keep, in output
            order.
        prop_n: comma-separated floats, one per entry of ``spp_info``,
            handed to ``add_n`` with the matching sequence.

    Exits via ``sys.exit`` with an error message when the alignment cannot
    be read, when a requested species is missing from it, or when the
    resulting alignment has length zero.

    Side effect: the pruned alignment is stored in the module-level name
    ``pruned_alignment``.
    """
    global pruned_alignment

    # Set path to alignment
    alignment_path = str(path_to_align)
    full_path = os.path.abspath(alignment_path)
    base_name = os.path.basename(full_path)
    dir_name = os.path.dirname(full_path)
    extension = alignment_path.split('.')[-1]

    # Get species list
    spp_list = str(spp_info).split(",")

    # Get N proportions
    n_percents = [float(x) for x in str(prop_n).split(",")]

    formats = {
        'nex': 'nexus',
        'nexus': 'nexus',
        'phy': 'phylip',
        'phylip-relaxed': 'phylip-relaxed',
        'phylip': 'phylip',
        'fa': 'fasta',
        'fasta': 'fasta'
    }

    # Read in alignment.  An unknown extension and a parse failure are
    # reported the same way; KeyboardInterrupt is no longer swallowed.
    try:
        fformat = formats[extension]
        raw_alignment = AlignIO.read(alignment_path, fformat)
    except Exception:
        sys.exit("ERROR: Cannot process " + os.path.basename(alignment_path))

    # Get species from raw alignment
    raw_spp = [str(seq_record.id) for seq_record in raw_alignment]

    # Every *requested* species must be present in the alignment.  The
    # previous test checked the opposite inclusion, which rejected valid
    # subsets and let missing species crash in raw_spp.index() below.
    if not all(spp in raw_spp for spp in spp_list):
        sys.exit("ERROR: Requested species not found in " +
                 os.path.basename(alignment_path) + "...")

    # Create dummy (empty) alignment of the same type
    pruned_alignment = raw_alignment[0:0]

    # Populate alignment by adding taxa in the requested order
    for i, spp_to_add in enumerate(spp_list):
        spp_n_percent = n_percents[i]

        raw_record = raw_alignment[raw_spp.index(spp_to_add)]
        new_seq = add_n(raw_record.seq, spp_n_percent)
        pruned_alignment.add_sequence(str(raw_record.id), new_seq)

    # If resulting alignment is empty, raise exception
    if int(pruned_alignment.get_alignment_length()) == 0:
        sys.exit(
            "ERROR: Alignment processed, but appears to have no bases...")

    # os.path.join keeps the output path valid on every platform instead of
    # hard-coding the Windows separator.
    out_name = base_name.replace("." + extension, "_SimN." + extension)
    with open(os.path.join(dir_name, out_name), "w") as handle:
        SeqIO.write(pruned_alignment, handle, "phylip")
Example #39
0
def ali_parser(alignment_file):
    """Choose a seed sequence for the alignment and run hmmscan on it.

    Behaviour depends on the module-level ``args``:

    * ``args.mode == 'con'``: build an HMM from ``args.ali`` with hmmbuild,
      emit its consensus with hmmemit and use that as the seed.  The
      mapping is simply 1..len(seed).
    * ``args.mode == 'rep'``: read ``alignment_file`` (fasta), treat 'X' as
      a gap, score every column by its fraction of non-gap symbols, and
      pick the sequence whose non-gap positions have the highest total
      column score.  The mapping has one entry per alignment column: 0 for
      a gap in the seed, otherwise the 1-based residue number in the
      ungapped seed.

    In both modes the seed ends up in ``temp_seed_seq.fasta`` under
    ``args.output_path`` and hmmscan writes ``temp_hmmscan.out`` there.

    Returns:
        (seed_seq_record, seed_to_ali_mapping)

    Raises:
        ValueError: if ``args.mode`` is neither 'con' nor 'rep'.
        subprocess.CalledProcessError: if an external tool exits non-zero.
    """
    seed_fasta = args.output_path+'temp_seed_seq.fasta'

    # optional parameter  use score or hmmemit to choose seed sequence    #args.ali
    if args.mode == 'con':

        hmm_file = args.output_path+args.file_name+'.hmm'
        subprocess.check_output([hmmbuild_exe, '--symfrac', '0', hmm_file, args.ali])
        subprocess.check_output([hmmemit_exe, '-c', '-o', seed_fasta, hmm_file])
        consensus_aln = AlignIO.read(seed_fasta, 'fasta')
        seed_seq_record = SeqRecord(Seq(str(consensus_aln[0].seq)),
                                    id=args.file_name+'-consensus')

        seed_to_ali_mapping = list(range(1, len(seed_seq_record) + 1))

    elif args.mode == 'rep':

        full_ali_obj = AlignIO.read(alignment_file, "fasta")
        for record in full_ali_obj:
            record.seq = str(record.seq).replace('X','-')  # in case of 'X', replacing it with '-'

        # One row per sequence, one column per alignment site.
        raw_array = np.array([list(rec) for rec in full_ali_obj], order="F")
        gap_mask = raw_array == '-'

        # Column score = fraction of sequences with a residue at that site.
        # Gap-free columns score 1.0; previously they were left out of the
        # score list altogether, which shifted every later score onto the
        # wrong column and dropped the trailing columns from the sum.
        col_scores = 1.0 - gap_mask.mean(axis=0)

        # Sequence score = sum of the column scores at its non-gap sites.
        seq_score = np.where(gap_mask, 0.0, col_scores).sum(axis=1)

        #obtaining seed_seq related info
        seed_seq_array = raw_array[np.argmax(seq_score)]
        seed_seq = "".join(seed_seq_array)
        # When several records share the seed sequence the id of the last
        # one is used, as before.
        for record in full_ali_obj:
            if str(record.seq) == seed_seq:
                seed_id = str(record.id)

        # 0 for a gap, otherwise the running 1-based residue number.
        seed_to_ali_mapping = []
        residue_number = 0
        for symbol in seed_seq_array:
            if symbol == '-':
                seed_to_ali_mapping.append(0)
            else:
                residue_number += 1
                seed_to_ali_mapping.append(residue_number)

        # remove the 'gap' so it won't affect hmm_aa_freq mapping in the Main Processes
        seed_seq_record = SeqRecord(Seq(seed_seq.replace("-","")), id=seed_id)

        SeqIO.write(seed_seq_record, seed_fasta, "fasta")

    else:
        # Fail before running hmmscan instead of hitting an unbound local
        # at the return statement.
        raise ValueError("Unsupported mode %r (expected 'con' or 'rep')"
                         % (args.mode,))

    # run hmmscan
    # NOTE(review): the two names below look swapped relative to the args
    # they read, so -E receives args.domain_threshold and --domE receives
    # args.profile_threshold.  Kept as-is; confirm against the argparse
    # definitions before changing.
    domain_threshold = args.profile_threshold
    profile_threshold = args.domain_threshold
    subprocess.check_output([hmmscan_exe, '-o', args.output_path+'temp_hmmscan.out', '-E', profile_threshold ,'--domE', domain_threshold, args.database, seed_fasta])
    return (seed_seq_record, seed_to_ali_mapping)
    import Bio
    from Bio.Phylo import PhyloXML, NewickIO
    import argparse
    from Bio import Phylo
    from Bio import AlignIO
    from Bio.Phylo.Consensus import *
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("multSeqAln", help="", type=str)
    #parser.add_argument("out",help="")

    args = parser.parse_args()

    msa = AlignIO.read(args.multSeqAln, "fasta")

    #tree = Phylo.read(args.multSeqAln, "newick")
    #msas = bootstrap(msa, 100)

    calculator = DistanceCalculator('blosum62')
    constructor = DistanceTreeConstructor(calculator)
    #trees = bootstrap_trees(msa, 100, constructor)
    print "start bootstrap"
    consensus_tree = bootstrap_consensus(msa, 2, constructor,
                                         majority_consensus)
    print "bootstrap done"
    print consensus_tree
    consensus_tree.root_with_outgroup("Tamandua")
    Phylo.draw(consensus_tree)
Example #41
0
        lists[start]=1
        start+=1
    return lists
###################################################################################
###############################   MAIN PROCESSES   ################################
###################################################################################

# ali_parser runs external tools (hmmbuild/hmmemit/hmmscan), so call it once
# and unpack the (seed record, mapping) pair it returns.
seed_seq, seed_to_ali_mapping = ali_parser(args.ali)
domains = hmmscan_parser(args.output_path+'temp_hmmscan.out', seed_seq.id)
hmm_database = {}

# Accumulate one HMM profile per reported domain; each call receives the
# database built so far and its result replaces it.
for key, value in domains.items():
#hmm_database = HMM(args.database, seed_seq.id).hmm_database  # Storing target hmm profile into a HMM Class which is structured as a dictionary
    hmm_database = HMM(args.database, key,hmm_database )

alimentObj = AlignIO.read(args.ali, "fasta")
aligmentObjLen=len(alimentObj[0])

# Master lists sized to the alignment length plus 10%.  Each slot gets its
# own empty list: `[[]] * n` would make every slot refer to one shared list.
padded_len = aligmentObjLen+int(0.1*aligmentObjLen)
local_seed_seq_master = [[] for _ in range(padded_len)]
max_seed_seq_master = [[] for _ in range(aligmentObjLen)]

seed_seq_master = [[] for _ in range(padded_len)] #The master list storing aa_freqs for the entire seed_seq
domain_position_check=[0]*aligmentObjLen

##### LET HMM_database order as score
'''
domains_order = {}
dom_key=(domains.keys())


for i in dom_key[::-1]:
    domains_order[i]=domains[i]
Example #42
0
    aln_gen, aln_nat, score, _, _ = alignments[0]

    match_count = 0
    for i in range(len(aln_gen)):
        if aln_gen[i] != '-':
            if aln_gen[i] == aln_nat[i]:
                match_count += 1

    print(aln_gen)
    print(aln_nat)
    percent_identity = match_count / actual_length
    print(match_count, actual_length,
          f"{percent_identity*100:0.2f} % identity")

    # Export alignments
    # aln = AlignIO.read('alns/sample_aln.fasta','fasta')
    aln = AlignIO.read(aln_filename, 'fasta')
    p = view_alignment(aln, plot_width=800)
    export_svgs(p, filename="figs/" + sample_sp + ".svg")
    #     export_png(p, filename="figs/" + sample_sp + ".png")
    # pn.pane.Bokeh(p)
    # export_svgs(p, filename="figs/" + sample_sp + ".svg")

    closest_matches.append(aln_nat.replace('-', ''))
    closest_identities.append(percent_identity)

# Write the final file: add the collected closest matches and their
# identities to df_func as two new columns, then save the table as CSV.
df_func['closest_match'] = closest_matches
df_func['percent_identity'] = closest_identities
df_func.to_csv('alns/func_sps_matches.csv')
Example #43
0
def remove_low_cov_and_consensus_columns(alignment_file_in, minimal_cov,
                                         min_consensus, alignment_file_out):
    """Drop gappy and poorly conserved columns from a FASTA alignment.

    The alignment is read with ``AlignIO.read``, filtered in two passes (gap
    content first, then consensus on the surviving columns) and written out
    as FASTA with each sequence on a single line.

    Args:
        alignment_file_in: Path of the FASTA alignment to read.
        minimal_cov: Threshold on a 0-100 percentage scale.  Despite its
            name it is compared against the *gap* percentage of a column:
            a column is removed when more than ``minimal_cov`` percent of
            its characters are ``-``.
        min_consensus: Threshold on a 0-100 percentage scale.  A column is
            removed when its most frequent character (``-`` counts like any
            other character) makes up less than ``min_consensus`` percent of
            the sequences.
        alignment_file_out: Path of the FASTA file to write.
    """

    def iter_columns(alignment):
        """Yield ``(0-based index, column string)`` for every column."""
        for index in range(alignment.get_alignment_length()):
            yield index, alignment[:, index]

    def drop_columns(alignment, columns_to_drop):
        """Return the alignment without the given 0-based column indexes."""
        if not columns_to_drop:
            return alignment

        # Stitch together the runs of columns that lie between the dropped
        # ones; this touches every column once instead of re-slicing the
        # whole alignment for each removed column.
        kept = alignment[:, :0]
        run_start = 0
        for column in sorted(columns_to_drop):
            kept = kept + alignment[:, run_start:column]
            run_start = column + 1
        return kept + alignment[:, run_start:]

    def remove_low_cov_columns(alignment, min_cov):
        """Remove columns whose gap percentage is greater than min_cov."""
        sequence_number = len(alignment)
        gappy_columns = [
            index for index, column in iter_columns(alignment)
            if (column.count('-') / sequence_number) * 100 > min_cov
        ]
        return drop_columns(alignment, gappy_columns)

    def remove_low_consensus_columns(alignment, min_consensus):
        """Remove columns whose dominant character is below min_consensus."""
        sequence_number = len(alignment)
        low_css_columns = []
        for index, column in iter_columns(alignment):
            # Percentage of the most abundant character in this column.
            most_abundant_percent = max(
                ((column.count(char) / sequence_number) * 100
                 for char in set(column)),
                default=0,
            )
            if most_abundant_percent < min_consensus:
                low_css_columns.append(index)
        return drop_columns(alignment, low_css_columns)

    # read in alignment
    alignment = AlignIO.read(alignment_file_in, "fasta")

    # gap filter first, consensus filter on what is left
    alignment_cov = remove_low_cov_columns(alignment, minimal_cov)
    alignment_cov_css = remove_low_consensus_columns(alignment_cov,
                                                     min_consensus)

    # write filtered alignment; the context manager closes the file even if
    # a write fails part-way through
    with open(alignment_file_out, 'w') as alignment_file_out_handle:
        for each_seq in alignment_cov_css:
            alignment_file_out_handle.write('>%s\n' % str(each_seq.id))
            alignment_file_out_handle.write('%s\n' % str(each_seq.seq))