Exemple #1
1
def validateInputs(msa, tree=None):
    """Validate the input files and the network connection.

    Checks, in order, that ``msa`` can be opened and that its first
    alignment parses as FASTA, that ``tree`` (when given) can be opened and
    parses as Newick, and that ``internet_connected()`` reports a
    connection. On the first failed check an error message is printed and
    the process exits with status 1.

    Args:
        msa: Path to the multiple sequence alignment file.
        tree: Optional path to a tree file; the tree checks are skipped
            when it is falsy.

    Returns:
        0 when every check passes.
    """
    # Check for existence and proper FASTA formatting of input MSA
    try:
        msaHandle = open(msa, "rU")
    except (IOError, OSError):
        print('** HYPNO input error: Given MSA file location does not exist or is not accessible: ' + msa)
        sys.exit(1)
    try:
        # Parsing the first alignment is enough to reject a malformed or
        # empty file (an empty file surfaces as StopIteration).
        next(AlignIO.parse(msaHandle, "fasta"))
    except Exception:
        print('** HYPNO input error: improper MSA file format, must be aligned FASTA or a2m format: ' + msa)
        sys.exit(1)
    finally:
        msaHandle.close()

    if tree:
        try:
            treeHandle = open(tree, "rU")
        except (IOError, OSError):
            print('** HYPNO input error: Given tree file location does not exist or is not accessible: ' + tree)
            sys.exit(1)
        try:
            Phylo.read(treeHandle, "newick")
        except Exception:
            # Report the tree path here; the original printed the MSA path.
            print('** HYPNO input error: improper tree file format, must be Newick format: ' + tree)
            sys.exit(1)
        finally:
            treeHandle.close()

    if not internet_connected():
        print('** HYPNO connection error: Please connect to the internet to enable HYPNO remote database queries.')
        sys.exit(1)

    return 0
def main():
    """Rewrite nexus alignments using abbreviated sequence names.

    Reads the input files returned by ``get_files(args.nexus)``, builds one
    name map from every sequence name seen in any of them via
    ``abbreviator()``, and writes each file again, under the same base name
    in ``args.output``, with every sequence renamed through that map. The
    index of each processed file is printed after it is written.
    """
    args = get_args()
    files = get_files(args.nexus)
    # First pass: collect every sequence name used in any input file so the
    # same abbreviation map can be applied to all of them.
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                old_names.add(seq.name)
    name_map = abbreviator(old_names)
    # Second pass: rebuild each alignment under the new names.
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # The context manager closes the output file even if writing
            # fails part-way through.
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # Kept from the original: drop into the debugger on a write error.
            pdb.set_trace()
        print(count)
Exemple #3
0
def GetExec():
    """Index the alignment files found in the current working directory.

    Files ending in ``.txt``, ``.fas`` or ``.fasta`` are parsed as FASTA and
    files ending in ``.aln`` as Clustal; every other entry is skipped. For
    each alignment found, ``[alignment, base_name]`` is appended to
    ``newList`` and ``listdata`` gets an entry, keyed by a running index,
    holding ``(file name, 1-based alignment number within the file,
    number of sequences, alignment length, format name)``.

    An ``IOError`` raised while reading a file is printed and the scan
    moves on to the next entry.

    NOTE(review): the visible code builds ``newList`` and ``listdata`` but
    never returns or stores them; confirm whether the function was
    truncated before changing the return value.
    """
    Recs = os.listdir(os.getcwd())

    newList = []
    listdata = dict()
    j = 0

    formats_by_ext = {
        ".txt": "fasta",
        ".fas": "fasta",
        ".fasta": "fasta",
        ".aln": "clustal",
    }

    for rec_name in Recs:
        name, ext = os.path.splitext(rec_name)
        typo = formats_by_ext.get(ext)
        if typo is None:
            # Not an alignment file. The original fell through here with an
            # unbound (or stale) parser and crashed on the first such entry.
            continue
        try:
            for aNum, align in enumerate(AlignIO.parse(rec_name, typo), 1):
                newList.append([align, name])
                NumSeqs = sum(1 for _ in align)
                listdata[j] = (str(rec_name), aNum, NumSeqs,
                               align.get_alignment_length(), str(typo))
                j += 1
        except IOError as e:
            print(e)
Exemple #4
0
    def check_AlignIO_to_EMBOSS(self, in_filename, in_format, skip_formats=(),
                                alphabet=None):
        """Check that seqret can read back what Bio.AlignIO writes.

        The input is parsed once, then written in each candidate format and
        piped through ``emboss_piped_AlignIO_convert`` to PHYLIP; the result
        is compared with the original alignments. Formats listed in
        ``skip_formats`` are not tried, and a format whose conversion raises
        ``ValueError`` is silently skipped.
        """
        parse_args = (in_filename, in_format)
        if alphabet:
            parse_args += (alphabet,)
        originals = list(AlignIO.parse(*parse_args))

        # FASTA and NEXUS are only attempted for a single alignment.
        candidates = ["clustal", "phylip"]
        if len(originals) == 1:
            candidates += ["fasta", "nexus"]

        for out_format in candidates:
            if out_format in skip_formats:
                continue
            # PHYLIP is a simple format which explicitly supports
            # multiple alignments (unlike FASTA).
            try:
                round_tripped = list(
                    emboss_piped_AlignIO_convert(originals, out_format, "phylip"))
            except ValueError:
                # e.g. ValueError: Need a DNA, RNA or Protein alphabet
                # from writing Nexus files...
                continue
            try:
                matches = compare_alignments(originals, round_tripped)
                self.assertTrue(matches)
            except ValueError as err:
                details = (in_format, in_filename, out_format, err)
                raise ValueError(
                    "Disagree on file %s %s in %s format: %s" % details)
def _read_back_alignments(handle, fmt, alignments, **parse_kwargs):
    """Rewind ``handle`` and parse it as ``fmt``, dumping the data on failure.

    Raises:
        ValueError: if parsing fails; the message carries the parser error,
            the raw text that was written and the alignments it came from.
    """
    handle.flush()
    handle.seek(0)
    try:
        return list(AlignIO.parse(handle=handle, format=fmt, **parse_kwargs))
    except ValueError as e:
        # This is BAD.  We can't read our own output.
        # I want to see the output when called from the test harness,
        # run_tests.py (which can be funny about new lines on Windows)
        handle.seek(0)
        # Report the alignments that were written: the parsed list does not
        # exist when parsing is what failed.
        raise ValueError("%s\n\n%s\n\n%s"
                         % (str(e), repr(handle.read()), repr(alignments)))


def check_simple_write_read(alignments, indent=" "):
    """Write ``alignments`` in each test format and check they read back.

    For every format in ``test_write_read_align_with_seq_count`` the
    alignments are written to an in-memory handle. They are then parsed
    again, with ``seq_count`` when all alignments have the same non-zero
    number of records, and without it when the format is in
    ``test_write_read_alignment_formats``. Each parsed result is checked
    with ``simple_alignment_comparison``. A format that cannot write the
    data (``ValueError``) is reported and skipped.

    Args:
        alignments: List of alignment objects to round-trip.
        indent: Prefix for the progress messages.

    Raises:
        ValueError: if output written by AlignIO cannot be parsed back.
    """
    # Shared record count, or None when the alignments disagree (or there
    # are none). It does not depend on the format, so compute it once.
    record_counts = set(len(a) for a in alignments)
    if len(record_counts) == 1:
        records_per_alignment = record_counts.pop()
    else:
        records_per_alignment = None

    for fmt in test_write_read_align_with_seq_count:
        # Can we expect this format to work?
        if not records_per_alignment and fmt not in test_write_read_alignment_formats:
            continue

        print(indent + "Checking can write/read as '%s' format" % fmt)

        # Going to write to a handle...
        handle = StringIO()

        try:
            c = AlignIO.write(alignments, handle=handle, format=fmt)
            assert c == len(alignments)
        except ValueError as e:
            # This is often expected to happen, for example when we try and
            # write sequences of different lengths to an alignment file.
            print(indent + "Failed: %s" % str(e))
            # Carry on to the next format:
            continue

        # First, try with the seq_count
        if records_per_alignment:
            alignments2 = _read_back_alignments(
                handle, fmt, alignments, seq_count=records_per_alignment)
            simple_alignment_comparison(alignments, alignments2, fmt)

        if fmt in test_write_read_alignment_formats:
            # Don't need the seq_count
            alignments2 = _read_back_alignments(handle, fmt, alignments)
            simple_alignment_comparison(alignments, alignments2, fmt)

        if len(alignments) > 1:
            # Try writing just one Alignment (not a list)
            handle = StringIO()
            SeqIO.write(alignments[0], handle, fmt)
            assert handle.getvalue() == alignments[0].format(fmt)
 def initialize_data(self):
     """Create ``self.motif1`` and ``self.motif2`` and load their promoters.

     Each motif is built from its file path (``self.m1_file`` and
     ``self.m2_file``); the same file is then parsed as FASTA alignments
     and every alignment is passed to the motif's ``add_promoter``.
     """
     def add_promoters(motif, path):
         # The context manager closes the file even if parsing or
         # add_promoter raises.
         with open(path, 'r') as handle:
             for each in AlignIO.parse(handle, "fasta"):
                 motif.add_promoter(each)

     self.motif1 = Motif(self.m1_file)
     add_promoters(self.motif1, self.m1_file)

     self.motif2 = Motif(self.m2_file)
     add_promoters(self.motif2, self.m2_file)
def getStuff(workAlignment, geneName):
    """Count pairwise matches and mismatches between species in a MAF file.

    The alignment is parsed twice. The first pass collects the species
    names (the record id up to the first dot) in order of first appearance.
    The second pass walks every column of every block; a column containing
    ``-``, ``n`` or ``N`` is skipped. For each remaining column and each
    ordered pair of species in the block, ``dictOfCounts[g][h]`` is
    incremented when the two letters differ and ``dictOfSames[g][h]`` when
    they are equal (this includes ``g == h``).

    Three pickle files are written: ``data/<geneName>Diffs``,
    ``data/<geneName>Sames`` and ``data/emptyMatrix`` (an all-zero matrix
    for the same species list).

    Args:
        workAlignment: MAF alignment source passed to ``AlignIO.parse``.
            It is parsed twice, so a path is expected rather than an open
            handle (assumption; confirm against callers).
        geneName: Name used to build the two output file names.
    """
    # Both names were module-level globals in the original (declared after
    # their first assignment); keep them global so code that reads them
    # after the call still works.
    global speciesList, columnDict
    speciesList = []
    columnDict = {}

    def rmDot(string):
        # Keep only the part of the id before the first dot.
        return string.split(".")[0]

    def makeMatrix(species):
        # Square dict-of-dicts keyed by species, initialised to zero.
        return dict((i, dict((j, 0) for j in species)) for i in species)

    def fileWriter(path, data):
        # Close the file deterministically instead of leaking the handle.
        with open(path, "wb") as handle:
            pickle.dump(data, handle)

    # First pass: species names in order of first appearance.
    for block in AlignIO.parse(workAlignment, "maf"):
        for record in block:
            species = rmDot(record.id)
            if species not in speciesList:
                speciesList.append(species)

    dictOfCounts = makeMatrix(speciesList)
    dictOfSames = makeMatrix(speciesList)

    # Second pass: compare letters column by column.
    for block in AlignIO.parse(workAlignment, "maf"):
        names = [rmDot(record.id) for record in block]
        for j in range(block.get_alignment_length()):
            column = block[:, j]
            # Skip columns with gaps or unknown bases. The original test
            # ('-' or "n" or "N" not in column) was always true, so those
            # columns were counted as well.
            if '-' in column or 'n' in column or 'N' in column:
                continue
            columnDict = dict(zip(names, column))
            for g in columnDict:
                for h in columnDict:
                    if columnDict[h] != columnDict[g]:
                        dictOfCounts[g][h] += 1
                    else:
                        dictOfSames[g][h] += 1

    fileWriter("data/" + geneName + "Diffs", dictOfCounts)
    fileWriter("data/" + geneName + "Sames", dictOfSames)
    fileWriter("data/emptyMatrix", makeMatrix(speciesList))
def convert_alignments(in_file, out_file, new_type):
    """Append every nexus alignment in ``in_file`` to ``out_file``.

    Args:
        in_file: Path of the nexus file to read.
        out_file: Path of the file to append to. It is opened in append
            mode, so existing content is kept.
        new_type: AlignIO format name used for writing (e.g. "phylip").
    """
    # Context managers close both files even when parsing or writing fails.
    with open(in_file, 'r') as in_handle:
        for alignment in AlignIO.parse(in_handle, "nexus"):
            with open(out_file, "a") as out_handle:
                AlignIO.write(alignment, out_handle, new_type)
Exemple #9
0
 def test_seqtmatchall_piped(self):
     """Run seqmatchall in pair format and parse what it pipes to stdout."""
     executable = exes["seqmatchall"]
     cline = SeqmatchallCommandline(cmd=executable,
                                    sequence="Fasta/f002",
                                    aformat="pair", wordsize=9,
                                    auto=True, stdout=True)
     expected_command = (executable
                         + " -auto -stdout"
                         + " -sequence=Fasta/f002"
                         + " -wordsize=9 -aformat=pair")
     self.assertEqual(str(cline), expected_command)
     # Launch the tool with all three standard streams piped.
     pipe = subprocess.PIPE
     child = subprocess.Popen(str(cline),
                              stdin=pipe,
                              stdout=pipe,
                              stderr=pipe,
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     child.stdin.close()
     # Every alignment in the output is a pair of nine columns.
     for pair in AlignIO.parse(child.stdout, "emboss"):
         self.assertEqual(len(pair), 2)
         self.assertEqual(pair.get_alignment_length(), 9)
     # Nothing on stderr, and a clean exit status.
     error_output = child.stderr.read()
     self.assertEqual(error_output, "")
     self.assertEqual(0, child.wait())
     child.stdout.close()
     child.stderr.close()
Exemple #10
0
def convert(infile, type, outtype, outfile):
    """Parse ``infile`` as ``type`` and write the alignments as ``outtype``.

    The alignments are streamed from ``AlignIO.parse`` straight into
    ``AlignIO.write``, which writes them to ``outfile``.
    """
    from Bio import AlignIO

    AlignIO.write(AlignIO.parse(infile, type), outfile, outtype)
Exemple #11
0
 def test_water_file3(self):
     """Run water on an asis query against a GenBank file, output to a file."""
     query = "TGTTGTAATGTTTTAATGTTTCTTCTCCCTTTAGATGTACTACGTTTGGA"
     in_file = "GenBank/cor6_6.gb"
     out_file = "Emboss/temp_test3.water"
     self.assertTrue(os.path.isfile(in_file))
     # Remove output left behind by an earlier run.
     if os.path.isfile(out_file):
         os.remove(out_file)
     cline = WaterCommandline(cmd=exes["water"])
     # TODO - Tell water this is a GenBank file!
     settings = (
         ("asequence", "asis:%s" % query),
         ("bsequence", in_file),
         ("gapopen", "1"),
         ("gapextend", "0.5"),
         ("outfile", out_file),
     )
     for option, value in settings:
         cline.set_parameter(option, value)
     # The repr of the command line must survive an eval round trip.
     self.assertEqual(str(eval(repr(cline))), str(cline))
     self.run_water(cline)
     # Check we can parse the output and it is sensible...
     targets = SeqIO.parse(in_file, "genbank")
     alignments = AlignIO.parse(out_file, "emboss")
     self.pairwise_alignment_check(query, targets, alignments, local=True)
     # Clean up,
     os.remove(out_file)
def parsexmfa(xmfa, r1, r2, ref):
    """Build a consensus entry for each Mauve alignment block containing r1.

    Every block with more than one record is scanned; for each record id
    that contains ``r1``, the block is turned into a character array and
    passed to ``makesense`` together with the record ids, ``r1``, ``r2``
    and ``ref``. The result is stored under the position taken from the
    record id (the text after the first ``/``, with ``-`` replaced by
    ``:``).

    Args:
        xmfa: Path to the XMFA file.
        r1: Substring identifying the records of interest in the ids.
        r2: Passed through to ``makesense``.
        ref: Passed through to ``makesense``.

    Returns:
        dict mapping position strings to the first value returned by
        ``makesense`` for that block.
    """
    print("parsing...")
    consensusdict = {}
    gapfill = 0
    totalgaps = 0
    # The context manager closes the input file once parsing is done.
    with open(xmfa) as handle:
        for aln in AlignIO.parse(handle, "mauve"):  # each alignment block
            if len(aln) <= 1:
                continue
            header = [record.id for record in aln]
            for rec in header:
                if r1 not in rec:
                    continue
                pos = rec.split("/")[1].replace("-", ":")
                alignarr = np.array([list(r) for r in aln], np.character)
                sense, gap, fill = makesense(alignarr, header, r1, r2, ref)
                gapfill += fill
                totalgaps += gap
                consensusdict[pos] = sense
    print("total gaps: {}\n total gaps filled: {}".format(totalgaps, gapfill))
    return consensusdict
Exemple #13
0
def align(hom):
    '''Takes in a homologue from getHomologues() and is meant to
    align all of the sequences that it contains.

    Reads hom['species'][<species>][0] (a dict with 'uid' and 'seq') for
    every species, and returns the same hom object after checking that the
    first sequence of every species has the same length; otherwise raises
    Exception("ALIGNMENT ERROR").

    NOTE(review): in the visible code the ClustalW command line is built
    but never invoked, so no alignment output is produced here. See the
    inline notes before relying on this function.'''
    with tempfile.NamedTemporaryFile() as temp_file:
        # Write the first sequence of each species as a FASTA-style record
        # named after its uid, and remember which species each uid is from.
        uid_map = {}
        for species in hom['species']:
            temp = hom['species'][species][0]
            temp_file.write('>' + str(temp['uid']) + '\n')
            temp_file.write(temp['seq'] + '\n')
            temp_file.write('\n')
            uid_map[temp['uid']] = species
        # NOTE(review): temp_file is not flushed before its name is handed
        # to the command line below.
        align_io_temp_file = StringIO.StringIO()
        cline = ClustalwCommandline("clustalw", infile=temp_file.name, align='true',output='PHYLIP')
        # NOTE(review): this writes the command-line object itself into the
        # buffer; the command is never executed in this block, and the
        # buffer is not rewound before it is parsed.
        align_io_temp_file.write(cline)
        alignments = AlignIO.parse(align_io_temp_file, 'phylip')
    # The parser is consumed only after the temporary file has been closed.
    for alignment in alignments:
        # NOTE(review): this loop reads .id and .seq from each parsed item
        # and indexes hom[<species>]['species'], whereas the rest of the
        # function indexes hom['species'][<species>] - confirm which layout
        # is intended.
        #TODO: Don't throw out data here
        #get the first protein for the species
        temp = hom[uid_map[alignment.id]]['species'][0]
        #clear out all others since we only currently want one
        hom[uid_map[alignment.id]]['species'] = []
        #stick in the aligned sequence
        temp['seq'] = str(alignment.seq)
        #re-append the protein
        hom[uid_map[alignment.id]]['species'].append(temp)
    # Sanity check: every species' first sequence must have the same length.
    # seq_len starts as False and takes the first length seen; a length of
    # 0 is also falsy, so it would be replaced by the next length.
    seq_len = False
    for species in hom['species']:
        if not seq_len:
            seq_len = len(hom['species'][species][0]['seq'])
        if len(hom['species'][species][0]['seq']) != seq_len:
            raise Exception("ALIGNMENT ERROR")
    return hom
Exemple #14
0
def main():
    """Print a marker line for every alignment block of a fixed MAF file."""
    global args
    print("doop")
    maf_path = "/home/nabil/IGORwork/safe.ALL.MAF"
    blocks = AlignIO.parse(maf_path, "maf")
    for _block in blocks:
        print("new m")
Exemple #15
0
 def test_water_file2(self):
     """Run water on an asis query against a nucleotide FASTA file, output to a file."""
     query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGCAGAGGGACGTTTGAGTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTTATG"
     in_file = "Fasta/f002"
     out_file = "Emboss/temp_test2.water"
     self.assertTrue(os.path.isfile(in_file))
     # Remove output left behind by an earlier run.
     if os.path.isfile(out_file):
         os.remove(out_file)
     cline = WaterCommandline(cmd=exes["water"])
     settings = (
         ("-asequence", "asis:%s" % query),
         ("-bsequence", in_file),
         ("-gapopen", "10"),
         ("-gapextend", "0.5"),
         ("-outfile", out_file),
     )
     for option, value in settings:
         cline.set_parameter(option, value)
     # The repr of the command line must survive an eval round trip.
     self.assertEqual(str(eval(repr(cline))), str(cline))
     self.run_water(cline)
     # Check we can parse the output and it is sensible...
     targets = SeqIO.parse(in_file, "fasta")
     alignments = AlignIO.parse(out_file, "emboss")
     self.pairwise_alignment_check(query, targets, alignments, local=True)
     # Clean up,
     os.remove(out_file)
def main():
    """Strip the file-name prefix from the taxon names of nexus alignments.

    For every file returned by ``get_files(args.input)`` the sequences of
    all its alignments are collected into one new alignment. Each sequence
    name has the leading ``<file base name>`` plus any following
    underscores removed. The result is written under the same base name in
    ``args.output`` and the file index is printed. After each file the
    number of distinct taxon names seen so far must equal ``args.taxa``.

    Raises:
        AssertionError: if the number of distinct taxon names differs from
            ``args.taxa``.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set()
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        # The prefix depends only on the file, so build the pattern once.
        # The name is escaped so regex metacharacters in a file name are
        # matched literally.
        fname = os.path.splitext(os.path.basename(f))[0]
        prefix = re.compile("^{}_*".format(re.escape(fname)))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_seq_name = prefix.sub("", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        # Raised explicitly so the check still runs under "python -O".
        if len(all_taxa) != args.taxa:
            raise AssertionError("Taxon names are not identical")
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # The context manager closes the output file even if writing
            # fails part-way through.
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # Kept from the original: drop into the debugger on a write error.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(all_taxa)))
  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
      """Write the input alignment without the unwanted taxa.

      Taxa reported by ``taxa_missing_too_much_data()`` are always dropped;
      those reported by ``taxa_of_duplicate_sequences()`` are dropped as
      well when ``remove_identical_sequences`` is at least 1. The remaining
      records of ``self.input_filename`` are written to ``output_filename``
      as one FASTA alignment. The process exits when fewer than two records
      remain.

      Returns the list of taxa that were removed.
      """
      if remove_identical_sequences >= 1:
          taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
      else:
          taxa_to_remove = self.taxa_missing_too_much_data()

      with open(self.input_filename) as input_handle:
          with open(output_filename, "w+") as output_handle:
              kept_records = [
                  record
                  for alignment in AlignIO.parse(input_handle, "fasta")
                  for record in alignment
                  if record.id not in taxa_to_remove
              ]

              if len(kept_records) <= 1:
                  sys.exit("Not enough sequences are left after removing duplicates.Please check you input data.")

              AlignIO.write(MultipleSeqAlignment(kept_records), output_handle, "fasta")
      return taxa_to_remove
Exemple #18
0
 def test_needle_piped2(self):
     """Run needle on an asis query against a nucleotide FASTA file, piped to stdout."""
     # TODO - Support needle in Bio.Emboss.Applications
     # (ideally with the -auto and -filter arguments)
     query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAA"
     cline = (exes["needle"]
              + " -asequence asis:" + query
              + " -bsequence Fasta/f002"
              + " -auto"  # no prompting
              + " -filter")  # use stdout
     # Launch the tool with all three standard streams piped.
     pipe = subprocess.PIPE
     child = subprocess.Popen(str(cline),
                              stdin=pipe,
                              stdout=pipe,
                              stderr=pipe,
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     child.stdin.close()
     # Check we can parse the output and it is sensible...
     targets = SeqIO.parse("Fasta/f002", "fasta")
     alignments = AlignIO.parse(child.stdout, "emboss")
     self.pairwise_alignment_check(query, targets, alignments, local=False)
     # Nothing on stderr, and a clean exit status.
     error_output = child.stderr.read()
     self.assertEqual(error_output, "")
     self.assertEqual(0, child.wait())
     child.stdout.close()
     child.stderr.close()
 def check_bootstrap(self, filename, format, align_type="d"):
     """Check we can use fseqboot to pseudosample an alignment.

     The align_type argument is passed to the commandline object to
     set the output format to use (from [D]na, [p]rotein and [r]na).

     The tool is asked for two replicates written to "test_file"; the
     test then checks that this file holds two alignments and that each
     has the same record names, in the same order, as the original
     (spaces in names are compared as underscores).

     Raises:
         ValueError: if the command returns a non-zero exit code.
     """
     self.assertTrue(os.path.isfile(filename), "Missing %s" % filename)
     cline = FSeqBootCommandline(exes["fseqboot"],
                                 sequence=filename,
                                 outfile="test_file",
                                 seqtype=align_type,
                                 reps=2,
                                 auto=True, filter=True)
     return_code = run_command(cline)
     if return_code != 0:
         raise ValueError("Return code %s from:\n%s"
                          % (return_code, str(cline)))
     # the resultant file should have 2 alignments...
     with open("test_file", "r") as handle:
         bs = list(AlignIO.parse(handle, format))
     self.assertEqual(len(bs), 2)
     # ..and each name in the original alignment...
     with open(filename, "r") as handle:
         a_names = [s.name.replace(" ", "_")
                    for s in AlignIO.read(handle, format)]
     # ...should be in each alignment in the bootstrapped file
     for a in bs:
         self.assertEqual(a_names, [s.name.replace(" ", "_") for s in a])
Exemple #20
0
  def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
      """Write only the sequences whose gap percentage is within the limit.

      For every record of the FASTA input, the characters ``n``, ``N`` and
      ``-`` are counted as gaps. A record is kept when its whole-number gap
      percentage (rounded down) is at most ``filter_percentage``.
      Zero-length records and records over the limit are reported on
      stdout and left out. The kept records are written to
      ``output_filename`` as one FASTA alignment. The process exits when
      fewer than two records are kept.

      Args:
          input_filename: Path of the FASTA alignment to read.
          output_filename: Path of the FASTA file to write.
          filter_percentage: Highest allowed gap percentage.
          verbose: Not used by this function.

      Returns:
          List of ids of the records that were excluded.
      """
      output_alignments = []
      taxa_removed = []
      # Context managers close both files on every path, including the
      # sys.exit below.
      with open(input_filename, "rU") as input_handle:
          with open(output_filename, "w+") as output_handle:
              for alignment in AlignIO.parse(input_handle, "fasta"):
                  for record in alignment:
                      sequence_length = len(record.seq)
                      if sequence_length == 0:
                          taxa_removed.append(record.id)
                          print("Excluded sequence " + record.id + " because there werent enough bases in it")
                          continue

                      number_of_gaps = sum(record.seq.count(symbol)
                                           for symbol in ('n', 'N', '-'))
                      # Floor division keeps the whole-number percentage the
                      # original computed with integer operands.
                      gap_percentage = number_of_gaps * 100 // sequence_length
                      if gap_percentage <= filter_percentage:
                          output_alignments.append(record)
                      else:
                          taxa_removed.append(record.id)
                          print("Excluded sequence " + record.id + " because it had " + str(gap_percentage) + " percentage gaps while a maximum of " + str(filter_percentage) + " is allowed")

              if len(output_alignments) <= 1:
                  sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")

              AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
      return taxa_removed
Exemple #21
0
 def load(cls, filename, schema=None):
     """Load every alignment in ``filename`` as an ``AlignmentExt``.

     The format name comes from ``cls.schema(filename, schema)``. On any
     error a message is printed and ``None`` is returned.
     """
     try:
         format_name = cls.schema(filename, schema)
         parsed = AlignIO.parse(filename, format_name)
         return [AlignmentExt.from_msa(msa) for msa in parsed]
     except Exception as e:
         print('Unable to load alignments from: %s\n%s' % (filename, str(e)))
         return None
def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Check that AlignIO.convert agrees with a parse-then-write round trip.

    The reference output comes from ``AlignIO.parse`` followed by
    ``AlignIO.write``. ``AlignIO.convert`` is then run twice, once given
    the file name and once given an open handle, and both the returned
    count and the text written must equal the reference. A ``ValueError``
    from writing or converting counts as zero alignments.
    """
    # Reference: parse everything, then write it out.
    reference = StringIO()
    parsed = list(AlignIO.parse(in_filename, in_format, None, alphabet))
    try:
        expected_count = AlignIO.write(parsed, reference, out_format)
    except ValueError:
        expected_count = 0

    # Convert, passing a file name in and a handle out.
    from_filename = StringIO()
    try:
        converted = AlignIO.convert(in_filename, in_format,
                                    from_filename, out_format, alphabet)
    except ValueError:
        converted = 0
    assert expected_count == converted
    assert reference.getvalue() == from_filename.getvalue()

    # Convert, passing handles both ways.
    from_handle = StringIO()
    try:
        with open(in_filename) as source:
            converted = AlignIO.convert(source, in_format,
                                        from_handle, out_format, alphabet)
    except ValueError:
        converted = 0
    assert expected_count == converted
    assert reference.getvalue() == from_handle.getvalue()
Exemple #23
0
def split_family_seqs():
    """Split the Rfam seed file into one FASTA and one metadata file per family.

    For each alignment in ``rfam/Rfam.seed`` the leading ``#`` lines are
    read by hand and grouped by the two characters at columns 5-6 of the
    line, with the text from column 8 onwards appended per key. The file is
    then rewound to the start of the entry so the Stockholm parser can read
    the same alignment. The alignment is written as
    ``<alis_dir>/<AC>.fa`` and the collected lines are pickled to
    ``<meta_dir>/<AC>.pickle``, where ``AC`` is the stripped value stored
    under the ``'AC'`` key.
    """
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')

    # The context manager closes the seed file when the loop ends or fails.
    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen, 'stockholm')
        while True:
            infos = {}
            start = fopen.tell()
            # Collect the '#' lines that precede the sequence lines.
            while True:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                elif l.strip() != '':
                    break

            # Rewind so the parser sees the entry from its first line.
            fopen.seek(start)
            # next() with a default returns None at end of input; the
            # original .next() raised StopIteration instead of reaching
            # the break below.
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()

            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            # Pickle data is bytes, so the file is opened in binary mode.
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'wb') as metafile:
                pickle.dump(infos, metafile)
def _iterate_via_AlignIO(handle, format, alphabet):
    """Iterate over all records in several alignments (PRIVATE)."""
    from Bio import AlignIO

    for align in AlignIO.parse(handle, format, alphabet=alphabet):
        for record in align:
            yield record
Exemple #25
0
 def test_water_file3(self):
     """water with the asis trick and GenBank file, output to a file.

     Runs water through ``generic_run``, checks its streams, return code
     and reported output file, then parses the output and hands it to
     ``pairwise_alignment_check``.
     """
     # Setup,
     query = "TGTTGTAATGTTTTAATGTTTCTTCTCCCTTTAGATGTACTACGTTTGGA"
     out_file = "Emboss/temp_test3.water"
     in_file = "GenBank/cor6_6.gb"
     self.assertTrue(os.path.isfile(in_file))
     if os.path.isfile(out_file):
         os.remove(out_file)
     cline = WaterCommandline(cmd=exes["water"])
     cline.set_parameter("asequence", "asis:%s" % query)
     cline.set_parameter("bsequence", in_file)
     # TODO - Tell water this is a GenBank file!
     cline.set_parameter("gapopen", "1")
     cline.set_parameter("gapextend", "0.5")
     cline.set_parameter("outfile", out_file)
     self.assertEqual(str(eval(repr(cline))), str(cline))
     # Run the tool,
     result, out, err = generic_run(cline)
     # Check it worked,
     errors = err.read().strip()
     self.assertTrue(errors.startswith("Smith-Waterman local alignment"), errors)
     self.assertEqual(out.read().strip(), "")
     if result.return_code != 0:
         # Show the failing command line before the assertion fires.
         sys.stderr.write("\n%s\n" % cline)
     self.assertEqual(result.return_code, 0)
     self.assertEqual(result.get_result("outfile"), out_file)
     self.assertTrue(os.path.isfile(out_file))
     # Check we can parse the output and it is sensible...
     # Both files are closed before the output file is removed below.
     with open(in_file) as seq_handle:
         with open(out_file) as align_handle:
             self.pairwise_alignment_check(query,
                                           SeqIO.parse(seq_handle, "genbank"),
                                           AlignIO.parse(align_handle, "emboss"),
                                           local=True)
     # Clean up,
     os.remove(out_file)
Exemple #26
0
 def test_water_file2(self):
     """water with the asis trick and nucleotide FASTA file, output to a file.

     Runs water through ``generic_run``, checks its streams, return code
     and reported output file, then parses the output and hands it to
     ``pairwise_alignment_check``.
     """
     # Setup,
     query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGCAGAGGGACGTTTGAGTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTTATG"
     out_file = "Emboss/temp_test2.water"
     in_file = "Fasta/f002"
     self.assertTrue(os.path.isfile(in_file))
     if os.path.isfile(out_file):
         os.remove(out_file)
     cline = WaterCommandline(cmd=exes["water"])
     cline.set_parameter("-asequence", "asis:%s" % query)
     cline.set_parameter("-bsequence", in_file)
     cline.set_parameter("-gapopen", "10")
     cline.set_parameter("-gapextend", "0.5")
     cline.set_parameter("-outfile", out_file)
     self.assertEqual(str(eval(repr(cline))), str(cline))
     # Run the tool,
     result, out, err = generic_run(cline)
     # Check it worked,
     errors = err.read().strip()
     self.assertTrue(errors.startswith("Smith-Waterman local alignment"), errors)
     self.assertEqual(out.read().strip(), "")
     if result.return_code != 0:
         # Show the failing command line before the assertion fires.
         sys.stderr.write("\n%s\n" % cline)
     self.assertEqual(result.return_code, 0)
     self.assertEqual(result.get_result("outfile"), out_file)
     self.assertTrue(os.path.isfile(out_file))
     # Check we can parse the output and it is sensible...
     # Both files are closed before the output file is removed below.
     with open(in_file) as seq_handle:
         with open(out_file) as align_handle:
             self.pairwise_alignment_check(query,
                                           SeqIO.parse(seq_handle, "fasta"),
                                           AlignIO.parse(align_handle, "emboss"),
                                           local=True)
     # Clean up,
     os.remove(out_file)
Exemple #27
0
 def __init__(self, file_name=None, data = None, format='fasta'):
     """Build the alignment from a file, from text, or empty.

     Args:
         file_name: Source passed to ``AlignIO.read``; used when truthy.
         data: Alignment text, used when ``file_name`` is falsy; it is
             wrapped in ``StringIO`` and read the same way.
         format: AlignIO format name for either source.

     With neither source the alignment starts with no records.
     """
     if file_name:
         records = AlignIO.read(file_name, format)
     elif data:
         # Read a single alignment, as the file branch does. The original
         # passed AlignIO.parse(...), an iterator of whole alignments
         # rather than of records.
         records = AlignIO.read(StringIO(data), format)
     else:
         records = []
     super(Alignment, self).__init__(records)
Exemple #28
0
 def test_water_file4(self):
     """Run water on an asis query against a SwissProt file, output to a file."""
     query = "DVCTGKALCDPVTQNIKTYPVKIENLRVMI"
     in_file = "SwissProt/sp004"
     out_file = "Emboss/temp_test4.water"
     self.assertTrue(os.path.isfile(in_file))
     # Remove output left behind by an earlier run.
     if os.path.isfile(out_file):
         os.remove(out_file)
     cline = WaterCommandline(cmd=exes["water"])
     # TODO - Tell water this is a SwissProt file!
     settings = (
         ("-asequence", "asis:%s" % query),
         ("-bsequence", in_file),
         # EMBOSS should work this out, but let's be explicit:
         ("-sprotein", True),
         ("-gapopen", "20"),
         ("-gapextend", "5"),
         ("-outfile", out_file),
     )
     for option, value in settings:
         cline.set_parameter(option, value)
     # The repr of the command line must survive an eval round trip.
     self.assertEqual(str(eval(repr(cline))), str(cline))
     self.run_water(cline)
     # Check we can parse the output and it is sensible...
     targets = SeqIO.parse(in_file, "swiss")
     alignments = AlignIO.parse(out_file, "emboss")
     self.pairwise_alignment_check(query, targets, alignments, local=True)
     # Clean up,
     os.remove(out_file)
def taxit_create(taxit_executable_loc,
                aln_fasta,
                hmm_file,
                tree_file,
                tree_stats,
                pfam_acc,
                output_location,
                aln_stockholm):
    '''
    Calls "taxit create" and then converts the FASTA alignment to Stockholm.

    The command is built from the arguments and run through the shell;
    a non-zero exit status raises subprocess.CalledProcessError. After it
    succeeds, the alignments in aln_fasta are written to aln_stockholm in
    Stockholm format.
    '''
    #taxit create --clobber --aln-fasta ./PF14424.dedup.fasta --profile ./PF14424.wholefam.hmm --tree-file ./PF14424.dedup.nh  --locus PF14424 --package-name PF14424.pplacer
    # NOTE(review): the arguments are spliced into a shell command without
    # quoting, so paths with spaces or shell metacharacters will break it.
    # Kept as in the original; confirm callers only pass trusted values.
    cmd = " ".join([
        taxit_executable_loc,
        "create --clobber",
        "--aln-fasta", aln_fasta,
        "--profile", hmm_file,
        "--tree-file", tree_file,
        "--tree-stats", tree_stats,
        "--locus", pfam_acc,
        "--package-name", output_location,
    ])
    subprocess.check_call(cmd, shell=True)
    # Context managers close both files even if the conversion fails.
    with open(aln_fasta, "rU") as input_handle:
        with open(aln_stockholm, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "fasta")
            AlignIO.write(alignments, output_handle, "stockholm")
Exemple #30
0
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Convert alignments between formats by piping them through seqret.

    The alignments are written in ``old_format`` to seqret's stdin and
    seqret's stdout is parsed as ``new_format``.  This assumes that, for all
    the format names used, Biopython and EMBOSS names are consistent.

    Returns a list of alignments; the output is read completely so that the
    pipe can be closed before returning.  Any exception raised while writing
    or parsing propagates after the pipes have been closed.
    """
    cline = SeqretCommandline(exes["seqret"],
                              sformat=old_format,
                              osformat=new_format,
                              auto=True,  # no prompting
                              filter=True)
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        try:
            AlignIO.write(alignments, child.stdin, old_format)
        finally:
            # stdin and stderr are closed whether or not the write worked.
            child.stdin.close()
            child.stderr.close()
        return list(AlignIO.parse(child.stdout, new_format))
    finally:
        child.stdout.close()
        # Reap the child so it is not left behind as a zombie process.
        child.wait()
Exemple #31
0
    def combine_fastas(self, leaf_node_filename, internl_node_filename,
                       output_file):
        """Write leaf-node and internal-node FASTA records into one file.

        The alignments in ``leaf_node_filename`` are copied to
        ``output_file`` unchanged.  Every record in
        ``internl_node_filename`` then has ``self.internal_node_prefix``
        prepended to its id and its description cleared; those records are
        appended to the same output file as a single alignment.
        """
        with open(output_file, 'w') as output_handle:
            # print out leafnodes as is
            with open(leaf_node_filename, 'r') as input_handle:
                alignments = AlignIO.parse(input_handle, "fasta")
                AlignIO.write(alignments, output_handle, "fasta")

            with open(internl_node_filename, 'r') as input_handle:
                renamed_records = []
                for alignment in AlignIO.parse(input_handle, "fasta"):
                    for record in alignment:
                        record.id = self.internal_node_prefix + str(record.id)
                        record.description = ''
                        renamed_records.append(record)

            # The records are fully in memory, so the input file can already
            # be closed when they are written out.
            AlignIO.write(MultipleSeqAlignment(renamed_records),
                          output_handle, "fasta")
Exemple #32
0
def trim_sequences(filepath: str) -> List:
    """Collect the truthy ``percent_id_calc`` results for an MSF file.

    All alignments are parsed from ``filepath`` as "msf".  The first
    alignment is skipped; for each remaining alignment, row 0 is treated as
    the canonical sequence and compared with row 1 through
    ``percent_id_calc``.  The label passed along is the first seven
    characters of the canonical id plus ``_<n>``, where ``n`` counts the
    processed alignments from 0.

    Returns the list of results that evaluated as true.
    """
    # NOTE(review): the reason the first alignment is skipped is not visible
    # here - confirm against the producer of the file.
    alignments = list(AlignIO.parse(filepath, "msf"))
    result_record = []
    for num, alignment in enumerate(alignments[1:]):
        canonical_ = alignment[0]
        canonical_len = len(canonical_.seq)

        result = percent_id_calc(canonical_, alignment[1],
                                 f"{canonical_.id[:7]}_{num}",
                                 canonical_len)

        if result:
            result_record.append(result)
    return result_record
Exemple #33
0
def get_alignment_sequences_amount(input_path):
    """Count the items parsed from the file at ``input_path``.

    The format comes from ``get_format``.  FASTA input is read with
    ``SeqIO.parse`` (one item per record); every other format is read with
    ``AlignIO.parse`` (one item per alignment).

    Returns the number of items as an int.  Parser errors now propagate to
    the caller; previously a bare ``except`` turned any error into "end of
    input" and returned the count reached so far.
    """
    # NOTE(review): for non-FASTA formats this counts alignments, not the
    # sequences inside them, despite the function name - confirm which one
    # callers expect.
    input_format = get_format(input_path)
    parse = SeqIO.parse if input_format == "fasta" else AlignIO.parse
    with open(input_path) as handle:
        return sum(1 for _ in parse(handle, input_format))
Exemple #34
0
    def seq_trimmer(self):
        """Fill ``self.result_record`` from the alignments in water.fasta.

        ``<out_dir>/water.fasta`` is parsed as "msf".  The first alignment is
        skipped; for every other alignment an ``IdenticalSequencesParser`` is
        built from row 0 (the reference), row 1 and ``self.id_score``, and
        its ``highly_identical_seqs()`` result is kept when it is truthy.
        """
        alignment_path = self.out_dir / "water.fasta"
        parsed = list(AlignIO.parse(alignment_path, "msf"))
        self.result_record = []

        for pair in parsed[1:]:
            checker = IdenticalSequencesParser(pair[0], pair[1],
                                               self.id_score)
            outcome = checker.highly_identical_seqs()
            if not outcome:
                continue
            self.result_record.append(outcome)
Exemple #35
0
    def are_sequence_names_unique(self):
        """Return True if no record name occurs twice in the input FASTA.

        Reads every alignment in ``self.input_filename`` (FASTA) and compares
        the number of record names with the number of distinct names.
        """
        with open(self.input_filename) as input_handle:
            sequence_names = [
                record.name
                for alignment in AlignIO.parse(input_handle, "fasta")
                for record in alignment
            ]
        # A duplicate makes the set smaller than the list.
        return len(set(sequence_names)) == len(sequence_names)
Exemple #36
0
def main():
    """Write a copy of each nexus file restricted to the taxa to keep.

    For every input file, the sequences whose name is in the set returned by
    ``get_samples_to_run`` are added to a new alignment, which is written in
    nexus format to ``args.output`` under the input file's base name.  The
    index of each finished file is printed.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    taxa = get_all_taxon_names(nexus_files)
    taxa_to_keep = get_samples_to_run(args, taxa)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name in taxa_to_keep:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Close each output file before moving on to the next input.
        with open(outf, 'w') as out_handle:
            AlignIO.write(new_align, out_handle, 'nexus')
        # Parenthesised form prints the same under Python 2 and Python 3.
        print(count)
Exemple #37
0
def maf(in_maf, out_paf):
    """Write one PAF-style line to ``out_paf`` per alignment block of ``in_maf``.

    Only the first two rows of each block are used: row 0 supplies the
    target fields and row 1 the query fields.  The columns written are, in
    order: query name, length, start, end, relative strand, target name,
    length, start, end, number of identical columns, the target "size"
    annotation, and the constant 255.

    Returns True on success.  On any exception the traceback is printed and
    False is returned.  KeyboardInterrupt and SystemExit are no longer
    swallowed.
    """
    maf_f = None
    try:
        with open(out_paf, "w") as paf:
            maf_f = AlignIO.parse(in_maf, "maf")
            for grp in maf_f:
                seqs = list(grp)
                target, query = seqs[0], seqs[1]
                # Columns where both rows carry the same character.
                matches = sum(
                    1 for t_char, q_char in zip(target.seq, query.seq)
                    if t_char == q_char)
                tannots = target.annotations
                qannots = query.annotations
                tlen = tannots["srcSize"]
                tstart = tannots["start"]
                tend = tstart + tannots["size"]
                if tannots["strand"] == -1:
                    # Mirror the coordinates around the source length.
                    tstart = tlen - tstart
                    tend = tlen - tend
                qlen = qannots["srcSize"]
                qstart = qannots["start"]
                qend = qannots["start"] + qannots["size"]
                if qannots["strand"] == -1:
                    qstart = qlen - qstart
                    qend = qlen - qend
                strand = "+" if tannots["strand"] == qannots["strand"] else "-"
                paf.write(
                    "{qname}\t{qlen}\t{qstart}\t{qend}\t{strand}\t{tname}\t{tlen}\t{tstart}\t{tend}\t{matches}\t"
                    "{block_len}\t255\n".format(
                        tname=target.id,
                        tlen=tlen,
                        tstart=tstart,
                        tend=tend,
                        qname=query.id,
                        qlen=qlen,
                        # Query start/end are swapped when the strands differ.
                        qstart=qstart if strand == "+" else qend,
                        qend=qend if strand == "+" else qstart,
                        strand=strand,
                        matches=matches,
                        block_len=tannots["size"]))
    except Exception:
        traceback.print_exc()
        return False
    else:
        return True
    finally:
        if maf_f is not None:
            maf_f.close()
Exemple #38
0
def convert_nexus_to_format(dataset_as_nexus, dataset_format):
    """
    Re-serialise a Nexus alignment string in another AlignIO format.

    The text is parsed from an in-memory handle, written to a temporary file
    in the requested format, and the result of ``read_and_delete_tmp_file``
    for that file is returned.

    :param dataset_as_nexus: alignment text in Nexus format.
    :param dataset_format: AlignIO name of the output format.
    :return: value returned by ``read_and_delete_tmp_file`` for the
        temporary file (presumably the converted text).
    """
    parsed = AlignIO.parse(StringIO(dataset_as_nexus), 'nexus')
    scratch_path = make_random_filename()
    AlignIO.write(parsed, scratch_path, dataset_format)
    return read_and_delete_tmp_file(scratch_path)
Exemple #39
0
    def parse_alignment(self, f):
        """Load the first Clustal alignment from ``f`` and derive its profiles.

        Sets ``length``, ``hydropathies``, ``amphipathicities``,
        ``similarities`` and ``tmcenters`` on the instance from the first
        alignment only; later alignments in the file are ignored.

        Returns that alignment, or None when the file contains none.
        """
        msa = next(iter(AlignIO.parse(f, 'clustal')), None)
        if msa is None:
            return None

        self.length = msa.get_alignment_length()
        self.hydropathies = get_average_hydropathies(msa,
                                                     window=self.window,
                                                     kernel=self.kernel)
        self.amphipathicities = get_average_amphipathicities(
            msa, window=self.window)
        self.similarities = get_similarities(msa)
        self.tmcenters = get_tmcenters(msa)
        return msa
Exemple #40
0
def load_alignments(alignmentfiles, format):
    """Parse every alignment from every file and return them as one list.

    Each file in ``alignmentfiles`` is read with ``AlignIO.parse`` using
    ``format``.  A ValueError raised while parsing is logged and re-raised.
    """
    loaded = []
    for path in alignmentfiles:
        try:
            for aln in AlignIO.parse(path, format=format):
                message = "loaded alignment of length {} from {}".format(
                    len(aln), path)
                logger.debug(message)
                loaded.append(aln)
        except ValueError as err:
            logger.error("Cannot parse input file {}: {}".format(path, err))
            raise
    summary = "Successfully loaded {} alignments from {} input files".format(
        len(loaded), len(alignmentfiles))
    logger.info(summary)
    return loaded
Exemple #41
0
def usePhyMLForBranchLengths(alignmentFasta, newicktree):
    """Drive interactive ``phyml`` on an alignment and a Newick tree.

    The alignment is copied to a temporary file, converted from FASTA to
    PHYLIP, and phyml is driven through its interactive prompts with pexpect
    using that PHYLIP file and a temporary copy of the tree.

    Returns ``[True, text]`` with the contents of ``<phylip>_phyml_tree.txt``
    if phyml exits within about 60 seconds, otherwise ``[False, None]``.
    On timeout the phyml process is now terminated instead of being left
    running.
    """
    # converts the alignment file in FASTA to PHYLIP format
    alnFastaInFH = getInputTempFile(alignmentFasta)
    alnPhylipOutFH = getOutputTempFile()

    # "r" instead of "rU": the "U" flag is rejected by Python 3.11+.
    with open(alnFastaInFH.name, "r") as input_handle, \
            open(alnPhylipOutFH.name, "w") as output_handle:
        alignments = AlignIO.parse(input_handle, "fasta")
        AlignIO.write(alignments, output_handle, "phylip")
    treeInFH = getInputTempFile(newicktree)

    # removes the output files to be created if they already exist
    stats_path = alnPhylipOutFH.name + "_phyml_stats.txt"
    tree_path = alnPhylipOutFH.name + "_phyml_tree.txt"
    for stale_path in (stats_path, tree_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # spawns a process and executes all the correct input keys when prompted
    # NOTE(review): the meaning of the individual menu keys below is not
    # documented here - check them against the installed PhyML version.
    child = pexpect.spawn("phyml")
    child.expect(". Enter the sequence file name >")
    child.sendline(alnPhylipOutFH.name)
    child.sendline("D")
    child.sendline("+")
    child.sendline("+")
    child.sendline("O")
    child.sendline("U")
    child.sendline("Y")
    child.expect(". Enter the name of the input tree file >")
    child.sendline(treeInFH.name)

    # waits for phyml to exit; if it has not finished after 60 seconds the
    # calculation is considered stalled and the process is halted
    startTime = time.time()
    B = True
    while child.isalive():
        if time.time() - startTime > 60.0:
            B = False
            child.terminate(force=True)
            break
        # Poll at intervals instead of spinning on isalive().
        time.sleep(0.1)

    Ret = None
    if B:
        with open(tree_path, "r") as tree_handle:
            Ret = tree_handle.read()
    # returns the branch lengthed tree if the process was successful
    return [B, Ret]
Exemple #42
0
def convert_fasta_to_phylip(input_path, output_path, blank=False):
    """Convert a FASTA alignment to relaxed PHYLIP format.

    When ``blank`` is true, ``convert_sequences_names`` is first run on
    ``input_path`` (with ``src="FastML"``) to produce an intermediate FASTA
    and a names-map file next to it, and the intermediate file is converted
    instead of the original.

    Always returns 0.
    """
    source_path = input_path
    if blank:
        # NOTE(review): str.replace substitutes every ".fas" occurrence in
        # the path, not only the extension - confirm this is intended.
        intermediata_path = input_path.replace(".fas", ".translated_fasta")
        names_translator_path = input_path.replace(".fas", ".names_map")
        convert_sequences_names(input_path, intermediata_path, names_translator_path, src="FastML")
        source_path = intermediata_path

    # "r" instead of "rU": the "U" flag is rejected by Python 3.11+.
    with open(source_path, "r") as input_handle, \
            open(output_path, "w") as output_handle:
        alignments = AlignIO.parse(input_handle, "fasta")
        AlignIO.write(alignments, output_handle, "phylip-relaxed")
    return 0
Exemple #43
0
def runPhyML(aln, phymlOpt, geneDir):
    """
    Function converting fasta file to phylip and running PhyML.

    The work is done with geneDir as the current directory; the previous
    working directory is restored afterwards, also when an error occurs.

    @param1 aln: Path to the fasta alignment
    @param2 phymlOpt: PhyML option string; the text after "ALN " is appended
                      to the phyml command. An empty string selects the
                      default options.
    @param3 geneDir: Gene directory
    @return outPhy: geneDir concatenated with the phylip file name
    """
    origin = os.getcwd()
    os.chdir(geneDir)
    try:
        base = aln.split("/")[-1].split(".")[0]
        outPhy = base + ".phylip"
        tmp = base + ".tmp"

        logger = logging.getLogger("main.tree")

        # convert to Phylip format and replace eventual "!" symbols
        # (relic from using MACSE)
        # "r" instead of "rU": the "U" flag is rejected by Python 3.11+.
        with open(aln, "r") as aln2:
            laln = aln2.read().replace("!", "N")
        with open(tmp, "w") as temp:
            temp.write(laln)

        try:
            with open(tmp, "r") as input_handle, \
                    open(outPhy, "w") as output_handle:
                alignments = AlignIO.parse(input_handle, "fasta")
                AlignIO.write(alignments, output_handle, "phylip-relaxed")
        finally:
            # The temporary copy is removed even if the conversion fails.
            os.remove(tmp)

        # PhyML
        default_cmd = "phyml -i {:s} -v e -b -2".format(outPhy)
        if phymlOpt != "":
            try:
                opt = phymlOpt.split("ALN ")[1]
                logger.debug("phyml -i {:s} {}".format(outPhy, opt))
                cmd("phyml -i {:s} {}".format(outPhy, opt), False)
            except Exception:
                # Covers both an option string without "ALN " and a failing
                # phyml call; fall back to the default options.
                logger.info("PhyML couldn't run with the provided info {}, running with default options.".format(phymlOpt))
                cmd(default_cmd, False)
        else:
            logger.debug(default_cmd)
            cmd(default_cmd, False)
    finally:
        os.chdir(origin)

    # NOTE(review): plain concatenation - geneDir must end with a path
    # separator for this to be a valid path.
    return(geneDir+outPhy)
Exemple #44
0
def hssp3_file_to_phylip(hssp3_file_name, phylip_file_name, chain_id,
                         master_sequence):
    """Write the alignment block of one chain as phylip-sequential.

    The HSSP file is read in stockholm format.  A block is selected when
    the name of its first row has '/' at index 4 and the upper-cased
    character at index 5 equals ``chain_id``.  The first row of a selected
    block is renamed to 'MASTER' (id, name and description) and the block is
    written to ``phylip_file_name``.  Each selected block is written to the
    same file name.

    ``master_sequence`` is not used by this function.
    """
    blocks = list(AlignIO.parse(hssp3_file_name, format='stockholm'))
    for block in blocks:
        master = block[0]
        if master.name[4] != '/':
            continue
        if master.name[5].upper() != chain_id:
            continue
        master.id = master.name = master.description = 'MASTER'
        #master.seq = master.seq.ungap('-')
        AlignIO.write(block,
                      phylip_file_name,
                      format='phylip-sequential')
Exemple #45
0
 def check_EMBOSS_to_AlignIO(self, filename, old_format, skip_formats=()):
     """Check AlignIO can read seqret's conversion of the file.

     The file is parsed directly as ``old_format`` and, for each target
     format not listed in ``skip_formats``, converted with
     ``emboss_convert`` and parsed again; both results must compare equal.
     "fasta" and "nexus" are only tried when the file holds exactly one
     alignment.
     """
     self.assertTrue(os.path.isfile(filename), filename)
     reference = list(AlignIO.parse(filename, old_format))
     candidates = ["clustal", "phylip", "ig", "msf"]
     if len(reference) == 1:
         candidates.extend(["fasta", "nexus"])
     for new_format in candidates:
         if new_format in skip_formats:
             continue
         handle = emboss_convert(filename, old_format, new_format)
         try:
             converted = list(AlignIO.parse(handle, new_format))
         except Exception:  # TODO - Which exceptions?
             handle.close()
             raise ValueError("Can't parse %s file %s in %s format." %
                              (old_format, filename, new_format))
         else:
             handle.close()
         try:
             self.assertTrue(compare_alignments(reference, converted))
         except ValueError as err:
             raise ValueError("Disagree on %s file %s in %s format: %s" %
                              (old_format, filename, new_format, err))
Exemple #46
0
def parse_output(file, output):
    """Parses seq-gen output (phylip) into fasta file(s).

    ``output`` is parsed as phylip-sequential text.  A single alignment is
    written to ``file``; several alignments are written to ``<i>.<file>``,
    one file per alignment, with ``i`` starting at 0.
    """
    assert file
    assert output

    alignments = list(
        AlignIO.parse(StringIO.StringIO(output), 'phylip-sequential'))

    if len(alignments) <= 1:
        AlignIO.write(alignments[0], file, 'fasta')
        return
    for index, msa in enumerate(alignments):
        AlignIO.write(msa, '%i.%s' % (index, file), 'fasta')
Exemple #47
0
def pairwisealign(seq1, seq2, **kwargs):
    """
    Globally align two sequences.

    Builds a ``stretcher`` command line, runs it through ``getstatusoutput``
    and parses the FASTA-formatted output file with AlignIO.

    :param seq1: Sequence 1
    :type seq1: str
    :param seq2: Sequence 2
    :type seq2: str

    :param AA: True if protein sequences, false otherwise.
    :param gapopen: The cost for opening a gap.
    :param gapextend: The cost for extending a gap.

    ``AA``, ``gapopen`` and ``gapextend`` are read with ``kwargs[...]``, so
    all three have to be passed as keyword arguments.

    :returns: Sequence 1 aligned, sequence 2 aligned
    :rtype: tuple
    :raises SystemError: if the stretcher command returns a non-zero status.
    """

    # Choose the sequence-type flags and the scoring matrix passed to
    # stretcher.
    if kwargs['AA']:
        flag1 = '-sprotein1'
        flag2 = '-sprotein2'
        matrix = 'EBLOSUM62'
    else:
        flag1 = '-snucleotide1'
        flag2 = '-snucleotide2'
        matrix = 'EDNAFULL'

    # Temporary file whose name is given to stretcher as its output file.
    outfile = tempfile.NamedTemporaryFile()

    # The backslash-newlines sit inside the string literal, so the leading
    # spaces of the continuation lines are part of the command string.
    # NOTE(review): seq1 and seq2 are interpolated into a shell command
    # without escaping - confirm they never come from untrusted input.
    callstr = 'stretcher -outfile=%s -asequence=asis:%s \
        -bsequence=asis:%s -gapopen=%s -gapextend=%s -aformat fasta \
        -datafile %s %s %s' % (outfile.name, seq1, seq2, kwargs['gapopen'],
                               kwargs['gapextend'], matrix, flag1, flag2)

    status, output = getstatusoutput(callstr)

    if status == 0:
        # Only the first alignment in the output file is used.
        result = AlignIO.parse(outfile, 'fasta')
        alignmentobj = result.next()
        outfile.close()
        # NOTE(review): NamedTemporaryFile removes its file on close() by
        # default, so this rm presumably finds nothing to delete - confirm.
        getoutput('rm %s' % outfile.name)
    else:
        # Report the tool output, the output file name and the command
        # before failing (Python 2 print statements).
        print output
        print "There was an error in pairwisealign.\
                Is stretcher installed? See above output and check out"                                                                       , \
            outfile.name
        print callstr
        outfile.close()
        raise SystemError

    # Row 0 and row 1 of the alignment, returned as their underlying data.
    return alignmentobj[0].seq._data, alignmentobj[1].seq._data
def remove_similars(ident):
    """Remove sequences with desired % identity.

    Reads ``../input.fasta``, aligns sequence pairs with EMBOSS needle and
    marks the second sequence of a pair for removal when the percentage of
    identical, non-gap columns (relative to the aligned length) is at least
    ``ident``.  Sequences of 10 residues or fewer are never aligned.  The
    remaining records are written to ``../output.fasta``.
    """

    # Input multi-seq FASTA format

    validated_sequence, validated_id = [], []
    for record in SeqIO.parse("../input.fasta", "fasta"):
        validated_sequence.append(record.seq)
        validated_id.append(record.id.split(" ")[0])

    # A set gives constant-time membership tests and makes the repeated
    # de-duplication of the old list unnecessary.
    remove_id = set()

    # NOTE(review): the outer loop stops before the last sequence, so a last
    # sequence of 10 residues or fewer is never marked for removal - confirm
    # whether that is intended.
    for i in range(len(validated_sequence) - 1):

        if len(validated_sequence[i]) <= 10:
            remove_id.add(validated_id[i])
            continue
        if validated_id[i] in remove_id:
            continue

        for j in range(i + 1, len(validated_sequence)):
            if len(validated_sequence[j]) <= 10:
                continue
            # Already marked: aligning again could only mark it again.
            if validated_id[j] in remove_id:
                continue
            needle_cline = NeedleCommandline(
                asequence="asis:" + validated_sequence[i],
                bsequence="asis:" + validated_sequence[j],
                gapopen=10,
                gapextend=0.5,
                outfile='stdout')
            stdout, stderr = needle_cline()
            for needle_records in AlignIO.parse(StringIO(stdout), "emboss"):
                query = list(needle_records[0].seq)
                subject = list(needle_records[1].seq)
                # Identical columns, not counting gap-against-gap.
                matches = sum(1 for h, k in zip(query, subject)
                              if h == k and h != '-')
                similarity = (float(matches) / len(query)) * 100

                if similarity >= ident:
                    remove_id.add(validated_id[j])

    records = (r for r in SeqIO.parse("../input.fasta", "fasta")
               if r.id.split(" ")[0] not in remove_id)
    SeqIO.write(records, "input2.fasta", "fasta")
    os.rename("input2.fasta", "../output.fasta")
Exemple #49
0
def main():
    """Convert an alignment file between two AlignIO formats.

    Expects exactly four command-line arguments: input path, output path,
    input format and output format.  Exits with a usage message otherwise.
    """
    import sys
    from Bio import AlignIO

    if len(sys.argv) != 5:
        sys.exit('python3 %s <in.mfa> <out.mfa> <inFormat> <outFormat>' %
                 (sys.argv[0]))

    source_path, target_path, source_format, target_format = sys.argv[1:5]

    AlignIO.write(AlignIO.parse(source_path, source_format),
                  target_path, target_format)
Exemple #50
0
def align(genes):
    """Align ``genes`` with MAFFT and return the first alignment as a dict.

    The genes are written to a temporary FASTA file, MAFFT is run on it and
    its standard output is parsed as FASTA.

    Returns ``utils.records_to_dict`` of the first alignment, or an empty
    dict when MAFFT produced no alignment.  Raises
    ``exceptions.BinaryNotFound`` when no MAFFT binary path is configured.
    """
    mafft_path = config.get_binary_path('mafft')
    if not mafft_path:
        raise exceptions.BinaryNotFound('MAFFT binary was not found')
    with tempfile.NamedTemporaryFile(mode='w+t', suffix='.fasta') as scratch:
        utils.write_genome(genes, scratch)
        # Make sure MAFFT sees the full content when it opens the file.
        scratch.flush()
        run_mafft = MafftCommandline(mafft_path, input=scratch.name,
                                     quiet=True)
        stdout, _ = run_mafft()
        first = next(AlignIO.parse(StringIO(stdout), "fasta"), None)
        if first is None:
            return {}
        return utils.records_to_dict(first)
Exemple #51
0
def parse_many_alignments(infiles, fmt='fasta'):
    """
    Iterate either over multiple files, or over one single file split by '//'

    When ``infiles`` is empty, or is exactly ``['-']`` or
    ``['/dev/stdin']``, standard input is read and split into chunks at
    lines starting with '//'; each chunk is parsed with ``AlignIO.read``.
    Otherwise every alignment of every file is yielded in turn.

    A final chunk that is not followed by a '//' line is now yielded as
    well (it used to be dropped); trailing whitespace alone is ignored.
    """
    if not infiles or (len(infiles) == 1
                       and infiles[0] in ('-', '/dev/stdin')):
        # Collect lines in a list; joining once per chunk avoids the
        # quadratic cost of repeated string concatenation.
        chunk = []
        for line in stdin:
            if line.startswith('//'):
                yield AlignIO.read(StringIO(''.join(chunk)), fmt)
                chunk = []
            else:
                chunk.append(line)
        trailing = ''.join(chunk)
        if trailing.strip():
            yield AlignIO.read(StringIO(trailing), fmt)
    else:
        for infile in infiles:
            yield from AlignIO.parse(infile, fmt)
Exemple #52
0
def main(args=None):
    """Convert a stockholm file to FASTA on standard output.

    ``args`` is the argument list to parse; the process arguments are used
    when it is None.  The input file is closed afterwards unless it is
    standard input.  Returns 0.
    """
    cli_args = argv[1:] if args is None else args

    parser = ArgumentParser(description='convert stockholm file to FASTA')
    parser.add_argument('STOCKHOLMFILE', type=FileType('r'))
    source = parser.parse_args(cli_args).STOCKHOLMFILE

    AlignIO.write(AlignIO.parse(source, 'stockholm'), stdout, 'fasta')

    if source != stdin:
        source.close()

    return 0
Exemple #53
0
def convertFasta2Phylip(instring, outstring):
    """
    Convert a FASTA alignment file to basic PHYLIP format.

    :param instring: path of the input FASTA file
    :param outstring: path of the PHYLIP file to write
    :return: None
    """
    # "r" instead of "rU": the "U" flag is rejected by Python 3.11+.
    # The with-statement closes both files even if the conversion fails.
    with open(instring, "r") as input_handle, \
            open(outstring, "w") as output_handle:
        alignments = AlignIO.parse(input_handle, "fasta")
        AlignIO.write(alignments, output_handle, "phylip")
Exemple #54
0
 def _createObjectBasedOnFile(self, filePath):
     """Load ``filePath`` into Biopython objects when its format is known.

     Returns ``filePath`` unchanged when Biopython is not available or the
     detected format is neither a sequence-record nor an alignment format.
     Otherwise returns the list of parsed records or alignments and stores
     the same list in ``self.createdFiles[filePath]``.
     """
     if not _BioAvailable:
         return filePath
     # 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+.
     # The with-statement closes the file even if parsing raises.
     with open(filePath, 'r') as handle:
         fileFormatName = u2py_internals.detectFormat(filePath)
         if fileFormatName in Serializer._seqRecordsFileFormats:
             conversionResult = list(SeqIO.parse(handle, fileFormatName))
             self.createdFiles[filePath] = conversionResult
         elif fileFormatName in Serializer._msaFileFormats:
             conversionResult = list(AlignIO.parse(handle, fileFormatName))
             self.createdFiles[filePath] = conversionResult
         else:
             conversionResult = filePath
     return conversionResult
Exemple #55
0
    def hash_sequences(self):
        """Group record ids of the input FASTA by the MD5 of their sequence.

        Returns a ``defaultdict(list)`` mapping the MD5 digest (bytes) of
        each sequence string to the ids of all records with that sequence.
        When ``self.verbose`` is set, every id and its digest are printed.
        """
        sequence_hash_to_taxa = defaultdict(list)
        with open(self.input_filename) as input_handle:
            for alignment in AlignIO.parse(input_handle, "fasta"):
                for record in alignment:
                    hash_of_sequence = hashlib.md5(
                        str(record.seq).encode('utf-8')).digest()
                    sequence_hash_to_taxa[hash_of_sequence].append(record.id)

                    if self.verbose:
                        print("Sample " + str(record.id) + " has a hash of " +
                              str(hash_of_sequence))
        # The with-statement has already closed the file at this point.
        return sequence_hash_to_taxa
Exemple #56
0
def main():
    """Compute and store alignment conservation for one bpg family.

    Takes a single command-line argument, a bpg accession such as
    bpg0123456.  The first alignment of the family's ``.a2m`` file is used
    to compute per-column conservation, which is written to
    ``<accession>.alignmentconservation.csv`` in the family directory and
    passed to ``update_tree_node_alignment_conservation`` for the family's
    canonical root node.  Invalid input is reported through
    ``opt_parser.error``.
    """
    # parse command line options
    usage = "%prog [options] bpg_accession"
    opt_parser = OptionParser(usage=usage)
    (options, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    if len(args[0]) < 10 or args[0][0:3] != 'bpg':
        opt_parser.error('Argument must be a bpg accession like bpg0123456')
    bpg_accession = args[0]
    try:
        family_id = int(bpg_accession[3:])
    except ValueError:
        opt_parser.error('Argument must be a bpg accession like bpg0123456')
    family_dir = '/clusterfs/ohana/bpg/pfacts/%s/%s/%s' % (
        bpg_accession[0:4], bpg_accession[0:7], bpg_accession)
    if not os.path.exists(family_dir):
        opt_parser.error('Family %s not found on the filesystem.' %
                         bpg_accession)
    family = Family.objects.get(id=family_id)
    if family.status == 'bad':
        opt_parser.error('Family %s is marked as bad in the database.'  \
                          % bpg_accession)

    alignment_file = os.path.join(family_dir, '%s.a2m' % bpg_accession)
    # Only the first alignment in the file is used.
    with open(alignment_file) as f:
        alignment = next(AlignIO.parse(f, "fasta"), None)
    if alignment is None:
        # Previously an empty file ended in a NameError further down.
        opt_parser.error('No alignment found in %s' % alignment_file)

    alignment_seqs, aligned_column_indices \
        = get_alignment_seqs_and_aligned_column_indices(alignment)
    column_conserved_residue, column_score \
        = get_conservation_info(alignment_seqs, aligned_column_indices)

    outfname = os.path.join(family_dir,
                            bpg_accession + '.alignmentconservation.csv')
    with open(outfname, 'w') as outf:
        outf.write('ColumnIndex,ConservedResidue,Blosum62ConservationScore\n')
        for j in aligned_column_indices:
            outf.write('%d,%s,%f\n' %
                       (j, column_conserved_residue[j], column_score[j]))

    root = family.canonical_root_node()
    update_tree_node_alignment_conservation(root, aligned_column_indices,
                                            column_conserved_residue,
                                            column_score)
Exemple #57
0
def pseudofam_convert_stockholm_to_fasta(input_file_name,
                                         output_file_name=None):
    """Convert a Stockholm alignment file to FASTA.

    When ``output_file_name`` is None, the output name is the input name
    with a trailing ".sto" removed (if present) and ".fasta" appended.
    """
    from Bio import AlignIO

    if output_file_name is None:
        # str.rstrip(".sto") removes any run of the characters '.', 's',
        # 't', 'o' from the end (e.g. "hosts.sto" -> "h"), so the suffix is
        # removed explicitly instead.
        stem = input_file_name
        if stem.endswith(".sto"):
            stem = stem[:-len(".sto")]
        output_file_name = stem + ".fasta"

    # "r" instead of "rU": the "U" flag had no effect on Python 3 and is
    # rejected by Python 3.11+.
    with open(input_file_name, "r") as input_handle, \
            open(output_file_name, "w") as output_handle:
        alignments = AlignIO.parse(input_handle, "stockholm")
        AlignIO.write(alignments, output_handle, "fasta")
Exemple #58
0
def get_fraction(position, msa_path, chars=True):
    """Return the fraction of non-gap (or gap) characters in one column.

    Counts the records of the first alignment in ``msa_path`` whose
    character at index ``position - 1`` is not "-", and divides that count
    by the value returned by ``get_alignment_sequences_amount``.

    Returns that fraction when ``chars`` is true, otherwise 1 minus it.
    """
    # print("get_fraction called with chars = " + str(chars)) # debug
    # NOTE(review): this prepends the same hard-coded directory to sys.path
    # on every call.
    sys.path.insert(0, '/groups/itay_mayrose/halabikeren/python_scripts/')
    from utils.alterAlignments import get_format, get_alignment_sequences_amount
    msa_format = get_format(msa_path)
    seq_num = get_alignment_sequences_amount(msa_path)
    with open(msa_path) as msa_handle:
        alignment = next(AlignIO.parse(msa_handle, msa_format))
        chars_counter = sum(
            1 for record in alignment
            if str(record.seq)[position - 1] != "-")
    # Convert before dividing so the result is not truncated to an integer
    # under Python 2.
    chars_fraction = chars_counter / float(seq_num)
    if not chars:
        return float(1 - chars_fraction)
    return chars_fraction
def main():
    """Write a FASTA copy of each input alignment stripped of some characters.

    For every record of every alignment in each input file, the lowercase
    characters a, c, g, t, n and all "-" characters are removed from the
    sequence.  The resulting records are written as FASTA to ``args.output``
    under the input file's name, and the index of each finished file is
    printed.
    """
    args = get_args()
    files = get_files(args.input, args.input_format)
    for count, f in enumerate(files):
        new_records = []
        for align in AlignIO.parse(f, args.input_format):
            for oldseq in align:
                seqstr = str(oldseq.seq)
                new_seq = re.sub("[acgtn]", "", seqstr)
                new_seq = re.sub("-", "", new_seq)
                new_seq_record = SeqRecord(Seq(new_seq, generic_dna), id=oldseq.id, name=oldseq.name, description=oldseq.description)
                new_records.append(new_seq_record)
        outf = os.path.join(args.output, os.path.split(f)[1])
        # Close each output file before moving on to the next input.
        with open(outf, 'w') as out_handle:
            SeqIO.write(new_records, out_handle, 'fasta')
        # Parenthesised form prints the same under Python 2 and Python 3.
        print(count)
def main():
    """Convert each nexus input file to ``options.format``.

    The output file keeps the input's base name with ``options.format`` as
    its extension and is placed in ``options.output``.  When
    ``options.shorten_name`` is set, the alignments are first passed through
    ``rename`` together with the two positions and the split character.
    The index of each processed file is printed.
    """
    options, args = interface()
    # collect the input files to convert
    files = get_files(options.input)
    # NOTE(review): eval() runs whatever text is given in the positions
    # option; if only integers are expected, int() would be the safer choice.
    pos1, pos2 = [eval(i) for i in options.positions.strip().split(',')]
    for count, f in enumerate(files):
        align = AlignIO.parse(f, "nexus")
        new_name = os.path.splitext(
            os.path.split(f)[1])[0] + '.' + options.format
        outf = os.path.join(options.output, new_name)
        if options.shorten_name:
            # NOTE(review): outf is reopened in 'w' mode for every item, so
            # each write replaces the previous one; the handles are never
            # closed explicitly.
            for item in rename(align, pos1, pos2, options.splitchar):
                AlignIO.write(item, open(outf, 'w'), options.format)
        else:
            AlignIO.write(align, open(outf, 'w'), options.format)

        # Python 2 print statement.
        print count