def add_gaps_to_align(organisms, missing, align, verbatim=False, genera=False, min_taxa=3):
    local_organisms = copy.deepcopy(organisms)
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        elif len(a) >= min_taxa:
            #pdb.set_trace()
            new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
            overall_length = len(a[0])
            for seq in a:
                if genera and any(sp for sp in genera if sp in seq.name):
                    new_seq_name = '_'.join(seq.name.split('_')[-1:])
                elif not verbatim:
                    new_seq_name = '_'.join(seq.name.split('_')[-2:])
                else:
                    new_seq_name = seq.name.lower()
                new_align.add_sequence(new_seq_name, str(seq.seq))
                local_organisms.remove(new_seq_name)
            for org in local_organisms:
                if genera and any(sp for sp in genera if sp in seq.name):
                    loc = '_'.join(seq.name.split('_')[:-1])
                elif not verbatim:
                    loc = '_'.join(seq.name.split('_')[:-2])
                else:
                    loc = seq.name
                if missing:
                    try:
                        assert loc in missing[org], "Locus missing"
                    except:
                        assert loc in missing['{}*'.format(org)], "Locus missing"
                new_align.add_sequence(org, '?' * overall_length)
    return new_align
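
A minimal sketch of how add_gaps_to_align() might be driven, assuming `align` is the list of alignments read from a single nexus file and that the entries of `organisms` match the trimmed sequence names; the file name and taxon names below are illustrative only:

from Bio import AlignIO

# hypothetical inputs -- not part of the original example
organisms = ['genus_species1', 'genus_species2', 'genus_species3']
alignments = list(AlignIO.parse(open('locus1.nex'), 'nexus'))
padded = add_gaps_to_align(organisms, missing=None, align=alignments)
if padded is not None:
    print padded.format('nexus')
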
Example #2
def main():
    options, args = interface()
    # iterate through all the files to determine the longest alignment
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        #filename = os.path.basename(f)
        #chromo_name = filename.split('.')[0]
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                if '.copy' in seq.name:
                    pass
                else:
                    #pdb.set_trace()
                    #new_seq_name = seq.name.split('|')[0]
                    new_seq_name = '_'.join(
                        seq.name.split('_')[options.position:])
                    new_align.add_sequence(new_seq_name, str(seq.seq))
        #pdb.set_trace()
        outf = os.path.join(options.output, os.path.split(f)[1])
        try:
            AlignIO.write(new_align, open(outf, 'w'), 'nexus')
        except ValueError:
            pdb.set_trace()
        print count
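
This main() assumes interface() and get_files() helpers that are not shown in the example. A hypothetical, minimal stand-in for them (optparse plus glob) could look like this:

import glob
import optparse
import os

def interface():
    p = optparse.OptionParser()
    p.add_option('--input', dest='input', help='directory of input nexus files')
    p.add_option('--output', dest='output', help='directory for renamed output')
    p.add_option('--position', dest='position', type='int', default=0,
                 help='index of the first name component to keep')
    return p.parse_args()

def get_files(directory):
    # hypothetical: collect every nexus file in the input directory
    return glob.glob(os.path.join(directory, '*.nex'))
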
Example #3
 def _domain_alignment(self,alignment,domain_region, alignment_index):
     # Now we need to subselect the portion of the alignment 
     # that contains the domain.
     protein_record = alignment[alignment_index]
     protein_seq = str(protein_record.seq)
     # Figure out which columns encapsulate the domain.
     aa_count = 0
     column_start = None
     column_stop = None
     #print protein_seq
     for column,aa in enumerate(protein_seq):
         #print column,aa
         if aa!='-':
             aa_count=aa_count+1
         if aa_count==domain_region.start and column_start==None:
             column_start = column
         if aa_count==domain_region.stop and column_stop==None:
             column_stop = column
             break
     #print column_start,column_stop
     assert column_start != None, str(column_start)
     assert column_stop != None, str(column_stop)
     domain_alignment = Alignment(alphabet = alignment._alphabet)
     # Grab the portion of each sequence that corresponds to the columns
     # for the domain.
     for record in alignment:
         domain_alignment.add_sequence(record.id,
                                       str(record.seq)[column_start:column_stop])
     return (domain_alignment, column_start, column_stop)
def main():
    args = get_args()
    # iterate through all the files to determine the longest alignment
    files = get_files(args.nexus)
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                old_names.update([seq.name])
    #pdb.set_trace()
    name_map = abbreviator(old_names)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        #filename = os.path.basename(f)
        #chromo_name = filename.split('.')[0]
        for align in AlignIO.parse(f, 'nexus'):
            for seq in list(align):
                new_seq_name = name_map[seq.name]
                new_align.add_sequence(new_seq_name, str(seq.seq))
        #pdb.set_trace()
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            AlignIO.write(new_align, open(outf, 'w'), 'nexus')
        except ValueError:
            pdb.set_trace()
        print count
Example #5
 def _domain_alignment(self, alignment, domain_region, alignment_index):
     # Now we need to subselect the portion of the alignment
     # that contains the domain.
     protein_record = alignment[alignment_index]
     protein_seq = str(protein_record.seq)
     # Figure out which columns encapsulate the domain.
     aa_count = 0
     column_start = None
     column_stop = None
     #print protein_seq
     for column, aa in enumerate(protein_seq):
         #print column,aa
         if aa != '-':
             aa_count = aa_count + 1
         if aa_count == domain_region.start and column_start == None:
             column_start = column
         if aa_count == domain_region.stop and column_stop == None:
             column_stop = column
             break
     #print column_start,column_stop
     assert column_start != None, str(column_start)
     assert column_stop != None, str(column_stop)
     domain_alignment = Alignment(alphabet=alignment._alphabet)
     # Grab the portion of each sequence that corresponds to the columns
     # for the domain.
     for record in alignment:
         domain_alignment.add_sequence(
             record.id,
             str(record.seq)[column_start:column_stop])
     return (domain_alignment, column_start, column_stop)
Example #6
    def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
        Alignment.__init__(self, alphabet)

        # represent all of those stars in the aln output format
        self._star_info = ''
        
        self._version = ''
Example #7
def ace2fasta(in_file, out_file):
    ace_gen = Ace.parse(open(in_file, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "All contigs treated"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            
            # Now that we have started our alignment we can add sequences to it
            # Add consensus sequence to alignment
            align.add_sequence(contig.name, contig.sequence.replace("*",""))
            
            """for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)"""
            
            output_file.write(align.format("fasta"))
Example #8
def createAlignment(sequences, alphabet):
    """Create an Alignment object from a list of sequences"""
    align = Alignment(alphabet)
    counter = 0
    for sequence in sequences:
        name = "sequence" + str(counter)
        align.add_sequence(name, sequence)
        counter += 1
    return align
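
A small illustrative call to createAlignment(); the sequences and the gapped DNA alphabet below are assumptions made for the sake of the example:

from Bio.Alphabet import IUPAC, Gapped

aln = createAlignment(["ACGT-A", "ACGTTA", "AC-TTA"],
                      Gapped(IUPAC.unambiguous_dna, "-"))
print aln.get_alignment_length()   # 6
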
Example #9
File: oid.py Project: bsmithers/hpf
def phylip(handle):
    seqs,columns = handle.readline().split()
    from Bio.Align.Generic import Alignment
    from Bio.Alphabet import IUPAC, Gapped
    alignment = Alignment(Gapped(IUPAC.protein, "-"))
    for line in handle:
        name,seq = line.split()
        alignment.add_sequence(name, seq)
    return alignment
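
Hypothetical usage of the phylip() reader above, assuming a sequential PHYLIP-style file with one name/sequence pair per line after the header (the file name is illustrative):

handle = open("example.phy")
aln = phylip(handle)
handle.close()
for record in aln:
    print record.description, str(record.seq)
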
Example #10
def createAlignment(sequences, alphabet):
    """Create an Alignment object from a list of sequences"""
    align = Alignment(alphabet)
    counter = 0
    for sequence in sequences:
        name = "sequence" + str(counter)
        align.add_sequence(name, sequence)
        counter+=1
    return align
 def build_align( self, seq ):
     align = Alignment( Gapped( DNAAlphabet() ) )
     len_seq = len( seq )
     step = self.segment_size
     for j in range( 0, len_seq, step ):
         segment = seq[j : j + step]
         # name each segment by its starting offset; the original snippet
         # referenced an undefined `name` variable here
         align.add_sequence( "segment_%d" % j, segment )
     self.friendly = align
Example #12
 def testCulledColumnMapper(self):
     align = Alignment(Gapped(IUPAC.protein, "-"))
     original = "ABCDEFGHI"
     align.add_sequence("test",original)
     culled = [0,1,4,8]
     # should yield
     result = "CDFGH"
     mapper = CulledColumnMapper(align,culled)
     for i,aa in enumerate(result):
         assert original[mapper[i]]==aa
def rename(align, first, second):
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            #pdb.set_trace()
            if first and second:
                new_seq_name = '_'.join([split_name[first][0:3], split_name[second][0:3]])
            elif not second:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
Example #14
def main():
    args = get_args()
    nexus_files = get_files(args.input)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in list(align):
                if taxon.name not in args.taxa:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        AlignIO.write(new_align, open(outf, 'w'), 'nexus')
        print count
def rename(align, first, second):
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            #pdb.set_trace()
            if first and second:
                new_seq_name = '_'.join(
                    [split_name[first][0:3], split_name[second][0:3]])
            elif not second:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def main():
    args = get_args()
    nexus_files = get_files(args.input)
    taxa = get_all_taxon_names(nexus_files)
    taxa_to_keep = get_samples_to_run(args, taxa)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in list(align):
                if taxon.name in taxa_to_keep:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        AlignIO.write(new_align, open(outf, 'w'), 'nexus')
        print count
Example #17
def proteins_alignment_to_biopython(al, seq1, seq2, name1, name2):
    "Convert our internal alignment format into BioPython Alignment"
    s1 = ""
    s2 = ""
    align = Alignment(Gapped(IUPAC.protein, "-"))
    for a, b in al:
        if a != -1:
            s1 += seq1[a].upper()
        else:
            s1 += "-"
        if b != -1:
            s2 += seq2[b].upper()
        else:
            # keep both aligned strings the same length by adding a gap
            s2 += "-"
    align.add_sequence(name1, s1)
    align.add_sequence(name2, s2)
    return align
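
An illustrative call; `al` is assumed to be a list of (i, j) index pairs into seq1/seq2 with -1 marking a gap, which is what the loop above expects:

pairs = [(0, 0), (1, -1), (2, 1)]
aln = proteins_alignment_to_biopython(pairs, "mkl", "ml", "seqA", "seqB")
print aln.format("fasta")   # seqA = MKL, seqB = M-L
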
Example #18
def NexusIterator(handle, seq_count=None):
    """Returns SeqRecord objects from a Nexus file.

    This uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one Alignment."""
    n = Nexus.Nexus(handle)
    if not n.matrix:
        #No alignment found
        raise StopIteration
    alignment = Alignment(n.alphabet)

    #Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    #The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i" \
               % (len(n.unaltered_taxlabels), seq_count))

    for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels):
        assert new_name.startswith(old_name)
        seq = n.matrix[new_name]  #already a Seq object with the alphabet set
        #ToDo - Can we extract any annotation too?
        #ToDo - Avoid abusing the private _records list
        alignment._records.append(
            SeqRecord(seq, id=new_name, name=old_name, description=""))
    #All done
    yield alignment
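
As the docstring notes, this iterator is normally reached through Bio.AlignIO rather than called directly; for example (the file name is illustrative):

from Bio import AlignIO

handle = open("example.nex")
alignment = AlignIO.read(handle, "nexus")
handle.close()
print alignment
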
Example #19
 def __str__(self):
     """
     """
     outstr = _Alignment.__str__(self)
     if self._secStruct:
         outstr+='\n'+str(self._secStruct)
     return outstr
Example #20
 def get_alignment(self):
     """Construct an alignment from the aligned sequences in this tree."""
     def seq_is_aligned(node):
         if isinstance(node, Sequence) and node.mol_seq.is_aligned:
             return True
         return False
     seqs = self.depth_first_search(self, seq_is_aligned)
     try:
         first_seq = seqs.next()
     except StopIteration:
         warnings.warn("No aligned sequences were found in this tree.",
                 Warning, stacklevel=2)
         # avoid a NameError below when no aligned sequence exists
         return
     aln = Alignment(first_seq.get_alphabet())
     aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
     for seq in seqs:
         aln.add_sequence(str(seq), seq.mol_seq.value)
     return aln
Example #21
    def get_column(self, col):
        """Returns a string containing a given column (OBSOLETE).

        This is a method provided for backwards compatibility with the old
        Bio.Align.Generic.Alignment object. You are encouraged to use the
        slice notation instead.
        """
        return _Alignment.get_column(self, col)
Example #22
 def trim_alignment(self, method = 'edges', remove_probe = None, bases = None, consensus = True, window_size = 20, threshold = 0.5):
     """Trim the alignment"""
     if method == 'edges':
         # find edges of the alignment
         start   = self._find_ends(forward=True)
         end     = self._find_ends(forward=False)
     elif method == 'running':
         start, end = self.running_average(window_size, threshold)
     # create a new alignment object to hold our alignment
     self.trimmed_alignment = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
     for sequence in self.alignment:
         # ignore the probe sequence we added
         if (method == 'edges' or method == 'running') and not remove_probe:
             # biopython's Alignment only lets us pass a name and
             # str(sequence), so we fudge and use its private _records list
             self.trimmed_alignment._records.append(sequence[start:end])  
         elif method == 'static' and not remove_probe and bases:
             # get middle of alignment and trim out from that - there's a
             # weakness here in that we are not actually locating the probe
             # region, we're just locating the middle of the alignment
             mid_point = len(sequence)/2
             if self._base_checker(bases, sequence, mid_point):
                 self.trimmed_alignment._records.append(
                     sequence[mid_point-bases:mid_point+bases]
                     )
             else:
                 self.trimmed_alignment = None
         elif method == 'static' and not remove_probe and bases and self.ploc:
             # get middle of alignment and trim out from that - there's a
             # weakness here in that we are not actually locating the probe
             # region, we're just locating the middle of the alignment
             if self._base_checker(bases, sequence, self.ploc):
                 self.trimmed_alignment._records.append(
                     sequence[self.ploc[0]-bases:self.ploc[1]+bases]
                     )
             else:
                 self.trimmed_alignment = None
         elif remove_probe and self.ploc:
             # we have to drop to sequence level to add sequence slices
             # where we basically slice around the probes location
             temp = sequence.seq[:self.ploc[0]] + sequence.seq[self.ploc[1]:]
             self.trimmed_alignment._records.append( \
                 self._record_formatter(temp)
                 )
         elif method == 'static' and remove_probe and bases and self.ploc:
             if self._base_checker(bases, sequence, self.ploc):
                 temp = sequence.seq[self.ploc[0]-bases:self.ploc[0]] + \
                     sequence.seq[self.ploc[1]:self.ploc[1]+bases]
                 self.trimmed_alignment._records.append( \
                     self._record_formatter(temp)
                     )
             else:
                 self.trimmed_alignment = None
     # build a dumb consensus
     if consensus:
         self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
             self._alignment_summary(self.trimmed_alignment)
Example #23
def ace2fasta(in_file, out_file):
    ace_gen = Ace.parse(open(in_file, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "All contigs treated"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))

            # Now that we have started our alignment we can add sequences to it
            # Add consensus sequence to alignment
            align.add_sequence(contig.name, contig.sequence)

            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)

            output_file.write(align.format("fasta"))
Example #24
 def get_alignment(self):
     """Construct an alignment from the aligned sequences in this tree."""
     def is_aligned_seq(node):
         if isinstance(node, Sequence) and node.mol_seq.is_aligned:
             return True
         return False
     seqs = self._filter_search(is_aligned_seq, 'preorder', True)
     try:
         first_seq = seqs.next()
     except StopIteration:
         # No aligned sequences were found
         # Can't construct an Alignment without an alphabet, so... nothin'
         return
     aln = Alignment(first_seq.get_alphabet())
     aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
     for seq in seqs:
         aln.add_sequence(str(seq), seq.mol_seq.value)
     return aln
def strarray2biopy(align):
    """ take a 2d character array with an associated ID list 
    and convert it into a biopython DNA alignment."""
        
    seqs = align[0]
    ids = align[1]

    alphabet = Gapped(IUPAC.unambiguous_dna) 
    alignment = Alignment(alphabet) 
       
    for count, array_seq in enumerate(seqs):
        bases = ''
            
        for base in array_seq:
            bases += base
                
        alignment.add_sequence(ids[count],bases)
        
    return alignment
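
An illustrative input for strarray2biopy(): a (sequences, ids) pair where each sequence is an iterable of single characters, e.g. a list of lists or a row of a 2D character array:

seqs = [list("ACGT-A"), list("ACGTTA")]
ids = ["sample1", "sample2"]
aln = strarray2biopy((seqs, ids))
print aln.format("fasta")
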
Example #26
    def get_column(self, col):
        """Returns a string containing a given column (DEPRECATED).

        This is a method provided for backwards compatibility with the old
        Bio.Align.Generic.Alignment object. Please use the slice notation
        instead, since get_column is likely to be removed in a future release
        of Biopython.
        """
        import warnings
        import Bio
        warnings.warn("This method is deprecated and is provided for backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the slice notation instead, as get_column is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
        return _Alignment.get_column(self, col)
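
For reference, the slice notation that the deprecation message recommends works on the newer MultipleSeqAlignment class; a minimal sketch (the sequences and ids are illustrative):

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = MultipleSeqAlignment([SeqRecord(Seq("ACGT"), id="a"),
                            SeqRecord(Seq("A-GT"), id="b")])
print aln[:, 1]     # one column as a string, here "C-"
print aln[:, 1:3]   # a new alignment containing columns 2-3
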
Example #27
    def get_column(self, col):
        """Returns a string containing a given column (DEPRECATED).

        This is a method provided for backwards compatibility with the old
        Bio.Align.Generic.Alignment object. Please use the slice notation
        instead, since get_column is likely to be removed in a future release
        of Biopython.
        """
        import warnings
        import Bio
        warnings.warn("This method is deprecated and is provided for backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the slice notation instead, as get_column is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
        return _Alignment.get_column(self, col)
Example #28
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tags in all contigs.
    
    """
    print
    print "USING MATRIX OUTPUT FORMAT"
    print
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("gene_name\tgene_length")
        for tag in tags:
            output_file.write("\t" + tag)
        output_file.write("\tXX_noTag")
        output_file.write("\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta_2list(align.format("fasta"))
            if len(sequences) < min_seq:
                continue
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            contig_seq = sequences[0][1].replace("*", "")
            contig_length = str(len(contig_seq))
            output_file.write(contig_name + "\t" + contig_length)
            print "Treating", contig_name
            d = defaultdict(int)
            for tag in tags:
                d[tag] = 0
            d["XX_noTag"] = 0
            fasta_counter = 0
            for fasta in sequences:
                fasta_counter += 1
                found_tag = 0
                for tag in tags:
                    if fasta[0].find(tag) > -1:
                        d[tag] += 1
                        found_tag = 1
                if found_tag == 0 and fasta[0].find("Consensus") < 0:
                    d["XX_noTag"] += 1
            for tag in sorted(d):
                output_file.write("\t" + str(d[tag]))
            output_file.write("\n")
Example #29
def formatData(AlignData, Score):
    LIMIT1 = 450
    LIMIT2 = 2000
    ScorePoints = [0] * 6196
    i = 0
    for record in AlignData.Alignment:
        j = 0
        for c in record.seq.tostring():
            if Score[j] <= LIMIT1:
                if c != '-':
                    ScorePoints[i] -= 2
            if Score[j] >= LIMIT2:
                if c != '-':
                    ScorePoints[i] += 2
                else:
                    ScorePoints[i] -= 1
            j += 1
        i += 1
    i = 0
    DataList = list()
    for record in AlignData.Alignment:
        if ScorePoints[i] >= -250:
            NewAlignData = Alignment(Gapped(IUPAC.protein, "-"))
            NewAlignData.add_sequence(record.id, record.seq.tostring())
            DataList.append(NewAlignData)
        i += 1
    return DataList
def main():
    options, args = interface()
    # iterate through all the files to determine the longest alignment
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        # filename = os.path.basename(f)
        # chromo_name = filename.split('.')[0]
        for align in AlignIO.parse(f, "nexus"):
            for seq in list(align):
                if ".copy" in seq.name:
                    pass
                else:
                    # pdb.set_trace()
                    # new_seq_name = seq.name.split('|')[0]
                    new_seq_name = "_".join(seq.name.split("_")[options.position :])
                    new_align.add_sequence(new_seq_name, str(seq.seq))
        # pdb.set_trace()
        outf = os.path.join(options.output, os.path.split(f)[1])
        try:
            AlignIO.write(new_align, open(outf, "w"), "nexus")
        except ValueError:
            pdb.set_trace()
        print count
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tags in all contigs.
    
    """
    print
    print "USING MATRIX OUTPUT FORMAT"
    print
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("gene_name\tgene_length")
        for tag in tags:
            output_file.write("\t" + tag)
        output_file.write("\tXX_noTag")
        output_file.write("\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta_2list(align.format("fasta"))
            if len(sequences) < min_seq:
                continue
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            contig_seq = sequences[0][1].replace("*", "")
            contig_length = str(len(contig_seq))
            output_file.write(contig_name + "\t" + contig_length)
            print "Treating", contig_name
            d = defaultdict(int)
            for tag in tags:
                d[tag] = 0
            d["XX_noTag"] = 0
            fasta_counter = 0
            for fasta in sequences:
                fasta_counter += 1
                found_tag = 0
                for tag in tags:
                    if fasta[0].find(tag) > -1:
                        d[tag] += 1
                        found_tag = 1
                if found_tag == 0 and fasta[0].find("Consensus") < 0:
                    d["XX_noTag"] += 1
            for tag in sorted(d):
                output_file.write("\t" + str(d[tag]))
            output_file.write("\n")
Example #32
def getHaplotypes(aln, n=10, fmin=0.0):
    """Get the haplotypes of the aligment aln.
    """
    count = {}
    from Bio.Align.Generic import Alignment
    haplotypes = Alignment(alphabet)
    for record in aln:
        count[record.seq.tostring()] = count.get(record.seq.tostring(), 0) + 1
    for i, seq in enumerate(
            sorted(count.keys(), key=lambda x: count[x], reverse=True)[:n]):
        f = count[seq] / float(len(aln))
        if f > fmin:
            haplotypes._records.append(
                Bio.SeqIO.SeqRecord(Bio.Seq.Seq(seq, alphabet),
                                    id="Hap%04i" % (i + 1),
                                    name="%f" % (f)))
    return haplotypes
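
getHaplotypes() depends on a module-level `alphabet` that is not shown; a hypothetical setup and call (the file name and thresholds are illustrative) might be:

import Bio.Seq
import Bio.SeqIO
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped

# the function above expects `alphabet` to exist at module level
alphabet = Gapped(IUPAC.ambiguous_dna)

aln = AlignIO.read(open("contig_alignment.fasta"), "fasta")
haps = getHaplotypes(aln, n=5, fmin=0.01)
for record in haps:
    print record.id, record.name   # name stores the haplotype frequency
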
 def trim_ambiguous_bases(self):
     """snip ambiguous bases from a trimmed_alignment"""
     ambiguous_bases = []
     # do this by finding all ambiguous bases and then snipping the largest
     # chunk with no ambiguous bases from the entire alignment
     if not self.trimmed_alignment:
         self.perfect_trimmed_alignment = self.trimmed_alignment
     else:
         for column in xrange(
                 0, self.trimmed_alignment.get_alignment_length()):
             if 'N' in self.trimmed_alignment.get_column(column):
                 ambiguous_bases.append(column)
         maximum = 0
         maximum_pos = None
         #pdb.set_trace()
         if not ambiguous_bases:
             self.perfect_trimmed_alignment = self.trimmed_alignment
         if ambiguous_bases:
             # prepend and append the start and end of the sequence so that we
             # also consider the chunks outside the first and last runs of
             # ambiguous bases.
             ambiguous_bases.insert(0, 0)
             ambiguous_bases.append(
                 self.trimmed_alignment.get_alignment_length() - 1)
             # create a new alignment object to hold our alignment
             self.perfect_trimmed_alignment = \
                 Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
             for pos in xrange(len(ambiguous_bases)):
                 if pos + 1 < len(ambiguous_bases):
                     difference = ambiguous_bases[pos + 1] - \
                         ambiguous_bases[pos]
                     if difference > maximum:
                         maximum = difference
                         maximum_pos = (pos, pos + 1)
                 else:
                     pass
             # make sure we catch cases where there is no best block
             if maximum_pos:
                 for sequence in self.trimmed_alignment:
                     self.perfect_trimmed_alignment._records.append(
                         sequence[ambiguous_bases[maximum_pos[0]] +
                                  1:ambiguous_bases[maximum_pos[1]]])
             else:
                 self.perfect_trimmed_alignment = None
Example #34
def bam2Alignment(samfile,
                  chrom=None,
                  start=None,
                  stop=None,
                  minlen=1,
                  out=sys.stdout):
    """Read alignment from samfile and return Alignment object.
    """
    iter = samfile.fetch(chrom, start, stop)
    from Bio.Align.Generic import Alignment
    from Bio.Alphabet import IUPAC, Gapped
    alphabet = Gapped(IUPAC.ambiguous_dna)
    aln = Alignment(alphabet)
    for read in iter:
        soft_clipped = sum([op[1] for op in read.cigar if op[0] in (4, 1)])
        #print soft_clipped, read_cigar
        if read.rlen - start + read.pos + 1 > minlen + soft_clipped and stop - read.pos + 1 >= minlen + soft_clipped:
            aln._records.append(getSeqRecord(read, start=start, stop=stop))
    return aln
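
A hypothetical call to bam2Alignment(); it needs an indexed BAM file opened with pysam and a getSeqRecord() helper that is not part of this example:

import pysam

samfile = pysam.Samfile("reads.bam", "rb")
aln = bam2Alignment(samfile, chrom="chr1", start=1000, stop=1200, minlen=50)
for record in aln:
    print record.id
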
Example #35
class Record:
    """Hold Saf information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at Saf data.

    Attributes:
    alignment

    """
    def __init__(self):
        self.alignment = Alignment( Bio.Alphabet.generic_alphabet )

    def __str__( self ):
        output = ''
        sequences = self.alignment.get_all_seqs()
        for sequence_record in sequences:
            output = output + '%s\n' % sequence_record.description
            output = output + out_sequence( sequence_record.seq.data )
        return output
Example #36
class Record:
    """Hold Saf information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at Saf data.

    Attributes:
    alignment

    """
    def __init__(self):
        self.alignment = Alignment(Bio.Alphabet.generic_alphabet)

    def __str__(self):
        output = ''
        sequences = self.alignment.get_all_seqs()
        for sequence_record in sequences:
            output = output + '%s\n' % sequence_record.description
            output = output + out_sequence(sequence_record.seq.data)
        return output
def parse_ace(ace_file):
	ace_gen = Ace.parse(open(ace_file, 'r'))
	contig = ace_gen.next()
	align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
	align.add_sequence(contig.name, contig.sequence)
 
	for readn in range(len(contig.reads)):
		clipst = contig.reads[readn].qa.qual_clipping_start
		clipe = contig.reads[readn].qa.qual_clipping_end
		start = contig.af[readn].padded_start
		seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)

		seq = pad_read(seq, start, len(contig.sequence))
		align.add_sequence(contig.reads[readn].rd.name + "_" + contig.af[readn].coru, seq)

	return contig, align
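
Hypothetical usage of parse_ace(); it only reads the first contig in the .ace file and relies on the cut_ends() and pad_read() helpers used in the other ACE examples above:

contig, align = parse_ace("assembly.ace")
print contig.name, "with", len(contig.reads), "reads"
print align.format("fasta")
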
def add_gaps_to_align(organisms,
                      missing,
                      align,
                      verbatim=False,
                      genera=False,
                      min_taxa=3):
    local_organisms = copy.deepcopy(organisms)
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        elif len(a) >= min_taxa:
            #pdb.set_trace()
            new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
            overall_length = len(a[0])
            for seq in a:
                if genera and any(sp for sp in genera if sp in seq.name):
                    new_seq_name = '_'.join(seq.name.split('_')[-1:])
                elif not verbatim:
                    new_seq_name = '_'.join(seq.name.split('_')[-2:])
                else:
                    new_seq_name = seq.name.lower()
                new_align.add_sequence(new_seq_name, str(seq.seq))
                local_organisms.remove(new_seq_name)
            for org in local_organisms:
                if genera and any(sp for sp in genera if sp in seq.name):
                    loc = '_'.join(seq.name.split('_')[:-1])
                elif not verbatim:
                    loc = '_'.join(seq.name.split('_')[:-2])
                else:
                    loc = seq.name
                if missing:
                    try:
                        assert loc in missing[org], "Locus missing"
                    except:
                        assert loc in missing['{}*'.format(
                            org)], "Locus missing"
                new_align.add_sequence(org, '?' * overall_length)
    return new_align
Example #39
# standard library
import os 

# biopython
from Bio import Alphabet
from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio import Clustalw
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

#Very simple tests on an empty alignment
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert len(alignment) == 0
del alignment

#Basic tests on simple three string alignment
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.add_sequence("mixed", letters)
alignment.add_sequence("lower", letters.lower())
alignment.add_sequence("upper", letters.upper())
assert alignment.get_alignment_length() == 26
assert len(alignment) == 3
assert alignment.get_seq_by_num(0).tostring() == letters
assert alignment.get_seq_by_num(1).tostring() == letters.lower()
assert alignment.get_seq_by_num(2).tostring() == letters.upper()
Example #40
    consensus = summary.gap_consensus(ambiguous="N")
    print consensus
    print
    print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus)
    print
    #Have a generic alphabet, without a declared gap char, so we must
    #provide the frequencies and chars to ignore explicitly.
    print summary.information_content(e_freq_table=expected,
                                      chars_to_ignore=['-'])
    print
    print "Trying a protein sequence with gaps and stops"

    alpha = Alphabet.HasStopCodon(
        Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
    a = Alignment(alpha)
    a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
    a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
    a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
    print a
    print "=" * a.get_alignment_length()

    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    print c
    c = s.gap_consensus(ambiguous="X")
    print c
    print
    print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)

    print s.information_content(chars_to_ignore=['-', '*'])
 def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
     Alignment.__init__(self, alphabet)
Example #42
    def next(self):
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line: return
        line = line.strip()
        parts = filter(None, line.split())
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        #Expects STRICT truncation/padding to 10 characters
        #Does not require any white space between name and seq.
        for i in range(0, number_of_seqs):
            line = handle.readline().rstrip()
            ids.append(line[:10].strip())  #first ten characters
            seqs.append([line[10:].strip().replace(" ", "")])

        #Look for further blocks
        line = ""
        while True:
            #Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line: break  #end of file
            if not line: break  #end of file

            if self._is_header(line):
                #Looks like the start of a concatenated alignment
                self._header = line
                break

            #print "New block..."
            for i in range(0, number_of_seqs):
                seqs[i].append(line.strip().replace(" ", ""))
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line: break  #end of file

        alignment = Alignment(self.alphabet)
        for i in range(0, number_of_seqs):
            seq = "".join(seqs[i])
            if len(seq) != length_of_seqs:
                raise ValueError("Sequence %i length %i, expected length %i" \
                                  % (i+1, len(seq), length_of_seqs))
            alignment.add_sequence(ids[i], seq)

            record = alignment.get_all_seqs()[-1]
            assert ids[i] == record.id or ids[i] == record.description
            record.id = ids[i]
            record.name = ids[i]
            record.description = ids[i]
        return alignment
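
This next() method belongs to a PHYLIP-style iterator class; in normal use it is reached through Bio.AlignIO rather than instantiated by hand, for example (the file name is illustrative):

from Bio import AlignIO

handle = open("example.phy")
for alignment in AlignIO.parse(handle, "phylip"):
    print alignment.get_alignment_length(), "columns"
handle.close()
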
Example #43
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file
    
    """
    marker_number = 0
    min_freq = 0.05
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        with open(out_bamova, "w") as bamova_file:
            output_file.write("Contig_nb\tWindow\tHaplotype\n")
            contig_counter = 0
            ntreated = 0
            for contig in ace_gen:
                pass_haplo = False
                contig_counter += 1
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                align.add_sequence(contig.name, contig.sequence)
                if len(contig.reads) - 1 < coverage:
                    continue
                ntreated += 1
                for readn in xrange(len(contig.reads)):
                    clipst = contig.reads[readn].qa.qual_clipping_start
                    clipe = contig.reads[readn].qa.qual_clipping_end
                    clipst2 = contig.reads[readn].qa.align_clipping_start
                    clipe2 = contig.reads[readn].qa.align_clipping_end
                    if clipst2 > clipst:
                        clipst = clipst2
                    if clipe2 < clipe:
                        clipe = clipe2
                    start = contig.af[readn].padded_start
                    seq = cut_ends(contig.reads[readn].rd.sequence, clipst,
                                   clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in contig.reads[readn].rd.name:
                        align.add_sequence(contig.reads[readn].rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
                contig_name = sequences[0][0]
                concensus = sequences[0][1]
                error_positions = multi_find("*", concensus)[::-1]
                for p in error_positions:
                    sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                 for s in sequences]
                concensus = sequences[0][1]
                sequences = [[s[0], correct_sequence(concensus, s[1])]
                             for s in sequences[1:]]
                sequences, snp_pos = snp_positions(sequences)
                haplotypes = best_snps(sequences, snp_pos, coverage)
                if haplotypes != "Empty":
                    bamova = []
                    variants = list(
                        sorted(list(set([h[-1] for h in haplotypes[-1]]))))
                    groups = list(
                        sorted(set([h[0][:3] for h in haplotypes[-1]])))
                    if len(groups) >= ngroups:
                        pass_haplo = True
                        for g in groups:
                            if len([
                                    h[0] for h in haplotypes[-1]
                                    if h[0].startswith(g)
                            ]) < nhaplo:
                                pass_haplo = False
                    if pass_haplo:
                        print contig.name
                        bamova_file.write("Marker" + str(marker_number) + "\n")
                        group_number = 0
                        for g in groups:
                            bamova_file.write("Population\t" +
                                              str(group_number))
                            group_number += 1
                            for v in variants:
                                bamova_file.write("\t" + str(
                                    len([
                                        h for h in haplotypes[-1]
                                        if h[-1] == v and h[0].startswith(g)
                                    ])))
                            bamova_file.write("\n")
                        with open("fasta_output/" + contig.name + ".fasta",
                                  "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in haplotypes[-1]:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    "Marker" + str(marker_number) + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1
                output_file.flush()
                bamova_file.flush()
                cutoff = 100000
                if contig_counter > cutoff:
                    break
        print "\n", str(ntreated), "contigs out of", str(
            contig_counter), "were treated"
Example #44
    def next(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            # Empty file - just give up.
            return
        if not line.strip() == "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")
            # import sys
            # print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0'

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while 1:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == "# STOCKHOLM 1.0":
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, "")
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == "#=GC ":
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == "#=GS ":
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids :
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids :
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None and self.records_per_alignment != len(ids):
                raise ValueError(
                    "Found %i records in this alignment, told to expect %i" % (len(ids), self.records_per_alignment)
                )

            alignment = Alignment(self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            alignment_length = len(seqs.values()[0])
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(id)
                alignment.add_sequence(id, seq, start=start, end=end)

                record = alignment.get_all_seqs()[-1]

                assert record.id == id or record.description == id

                record.id = id
                record.name = name
                record.description = id

                # will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                self._populate_meta_data(id, record)
            return alignment
        else:
            return None
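
The Stockholm parser above is also normally driven through Bio.AlignIO; a minimal, illustrative read looks like this:

from Bio import AlignIO

handle = open("example.sth")
alignment = AlignIO.read(handle, "stockholm")
handle.close()
for record in alignment:
    print record.id, record.annotations.get("accession")
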
Example #45
# standard library
import os 

# biopython
from Bio import Alphabet
from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio import Clustalw
from Bio.Align.FormatConvert import FormatConverter
from Bio.Align import AlignInfo
from Bio.Fasta import FastaAlign
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

#Very simple tests on an empty alignment
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert alignment.get_all_seqs() == []
del alignment

#Basic tests on simple three string alignment
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.add_sequence("mixed", letters)
alignment.add_sequence("lower", letters.lower())
alignment.add_sequence("upper", letters.upper())
assert alignment.get_alignment_length() == 26
assert len(alignment.get_all_seqs()) == 3
assert alignment.get_seq_by_num(0).tostring() == letters
assert alignment.get_seq_by_num(1).tostring() == letters.lower()
assert alignment.get_seq_by_num(2).tostring() == letters.upper()
Example #46
    def next(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            #Empty file - just give up.
            return
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")
            #import sys
            #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0'

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while 1:
            line = self.handle.readline()
            if not line: break  #end of file
            line = line.strip()  #remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                #The "//" line indicates the end of the alignment.
                #There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                #blank line, ignore
                pass
            elif line[0] != "#":
                #Sequence
                #Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    #This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " \
                                      + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                #Comment line or meta-data
                if line[:5] == "#=GF ":
                    #Generic per-File annotation, free text
                    #Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    #Each feature key could be used more than once,
                    #so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    #Generic per-Column annotation, exactly 1 char per column
                    #Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    #Generic per-Sequence annotation, free text
                    #Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids :
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    #Generic per-Sequence AND per-Column markup
                    #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids :
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip(
                    )  # append to any previous entry
                    #TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            #Next line...

        assert len(seqs) <= len(ids)
        #assert len(gs)   <= len(ids)
        #assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids) :
                raise ValueError("Found %i records in this alignment, told to expect %i" \
                                 % (len(ids), self.records_per_alignment))

            alignment = Alignment(self.alphabet)

            #TODO - Introduce an annotated alignment class?
            #For now, store the annotation a new private property:
            alignment._annotations = gr

            alignment_length = len(seqs.values()[0])
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )
                name, start, end = self._identifier_split(id)
                alignment.add_sequence(id, seq, start=start, end=end)

                record = alignment.get_all_seqs()[-1]

                assert record.id == id or record.description == id

                record.id = id
                record.name = name
                record.description = id

                #will be overridden by _populate_meta_data if an explicit
                #accession is provided:
                record.annotations["accession"] = name

                self._populate_meta_data(id, record)
            return alignment
        else:
            return None
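# Usage sketch (not part of the original snippet): assuming an older Biopython
# (pre-1.78) whose Bio.AlignIO "stockholm" reader behaves like the parser
# above, a minimal Stockholm record can be parsed from an in-memory handle;
# note that "." gap characters are reported as "-".
from StringIO import StringIO
from Bio import AlignIO

_sto = StringIO(
    "# STOCKHOLM 1.0\n"
    "#=GF ID   toy_family\n"
    "seq_one        ACDE..FGH\n"
    "seq_two        ACDEEFFGH\n"
    "//\n")
for _aln in AlignIO.parse(_sto, "stockholm"):
    for _rec in _aln:
        print _rec.id, str(_rec.seq)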
Example #47
def to_alignment(sequences, alphabet=None, strict=True):
    """Returns a multiple sequence alignment (OBSOLETE).

     - sequences - An iterator that returns SeqRecord objects,
                   or simply a list of SeqRecord objects.  All
                   the record sequences must be the same length.
     - alphabet - Optional alphabet.  Strongly recommended.
     - strict   - Optional, defaults to True.  Should error checking
                  be done?

    Using this function is now discouraged.  Rather than doing this:

    >>> from Bio import SeqIO
    >>> handle = open("Clustalw/protein.aln")
    >>> alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
    >>> handle.close()

    You are now encouraged to use Bio.AlignIO instead, e.g.

    >>> from Bio import AlignIO
    >>> handle = open("Clustalw/protein.aln")
    >>> alignment = AlignIO.read(handle, "clustal")
    >>> handle.close()
    """
    #TODO - Move this functionality into the Alignment class instead?
    #The alphabet classes used in the checks below also need importing:
    from Bio.Alphabet import Alphabet, AlphabetEncoder, Gapped
    from Bio.Alphabet import generic_alphabet
    from Bio.Alphabet import _consensus_alphabet
    if alphabet is None:
        sequences = list(sequences)
        alphabet = _consensus_alphabet([rec.seq.alphabet for rec in sequences \
                                        if rec.seq is not None])

    if not (isinstance(alphabet, Alphabet)
            or isinstance(alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet")

    alignment_length = None
    alignment = Alignment(alphabet)
    for record in sequences:
        if strict:
            if alignment_length is None:
                alignment_length = len(record.seq)
            elif alignment_length != len(record.seq):
                raise ValueError("Sequences must all be the same length")

            assert isinstance(record.seq.alphabet, Alphabet) \
            or isinstance(record.seq.alphabet, AlphabetEncoder), \
                "Sequence does not have a valid alphabet"

            #TODO - Move this alphabet comparison code into the Alphabet module/class?
            #TODO - Is a normal alphabet "ungapped" by default, or does it just mean
            #undecided?
            if isinstance(record.seq.alphabet, Alphabet) \
            and isinstance(alphabet, Alphabet):
                #Comparing two non-gapped alphabets
                if not isinstance(record.seq.alphabet, alphabet.__class__):
                    raise ValueError("Incompatible sequence alphabet " \
                                     + "%s for %s alignment" \
                                     % (record.seq.alphabet, alphabet))
            elif isinstance(record.seq.alphabet, AlphabetEncoder) \
            and isinstance(alphabet, Alphabet):
                raise ValueError(
                    "Sequence has a gapped alphabet, alignment does not")
            elif isinstance(record.seq.alphabet, Alphabet) \
            and isinstance(alphabet, Gapped):
                #Sequence isn't gapped, alignment is.
                if not isinstance(record.seq.alphabet,
                                  alphabet.alphabet.__class__):
                    raise ValueError("Incompatible sequence alphabet " \
                                     + "%s for %s alignment" \
                                     % (record.seq.alphabet, alphabet))
            else:
                #Comparing two gapped alphabets
                if not isinstance(record.seq.alphabet, alphabet.__class__):
                    raise ValueError("Incompatible sequence alphabet " \
                                     + "%s for %s alignment" \
                                     % (record.seq.alphabet, alphabet))
                if record.seq.alphabet.gap_char != alphabet.gap_char:
                    raise ValueError(
                        "Sequence gap characters != alignment gap char")
            #ToDo, additional checks on the specified alignment...
            #Should we look at the alphabet.contains() method?
        if record.seq is None:
            raise TypeError("SeqRecord (id=%s) has None for its sequence." %
                            record.id)

        #This is abusing the "private" records list,
        #we should really have a method like add_sequence
        #but which takes SeqRecord objects.  See also Bug 1944
        alignment._records.append(record)
    return alignment
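# Usage sketch for the (obsolete) helper above.  The records are hypothetical
# and the legacy Bio.Alphabet machinery is assumed (it was removed in
# Biopython 1.78).
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC, Gapped

_gapped_dna = Gapped(IUPAC.unambiguous_dna, "-")
_records = [SeqRecord(Seq("ACGT-ACGT", _gapped_dna), id="seq1"),
            SeqRecord(Seq("ACGTTACGT", _gapped_dna), id="seq2")]
_aln = to_alignment(_records, alphabet=_gapped_dna)
print _aln.get_alignment_length()   # 9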
Example #48
    def next(self):

        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None
        if line[:7] <> 'CLUSTAL':
            raise ValueError("Did not find CLUSTAL header")

        #There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            line = handle.readline()

        #If the alignment contains entries with the same sequence
        #identifier (not a good idea - but seems possible), then this
        #dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []

        #Use the first block to get the sequence identifiers
        while line.strip() <> "":
            if line[0] <> " ":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) <> letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            else:
                #Sequence consensus line...
                pass
            line = handle.readline()
            if not line: break  #end of file

        assert line.strip() == ""

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "" or line[0] == " ":
                line = handle.readline()
                if not line: break  # end of file
            if not line: break  # end of file

            for i in range(len(ids)):
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    if line[:7] == 'CLUSTAL':
                        #Found concatenated alignment.
                        done = True
                        self._header = line
                        break
                    else:
                        raise ValueError("Could not parse line:\n%s" % line)

                if fields[0] <> ids[i]:
                    raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
                                      % (fields[0], ids[i]))

                #Append the sequence
                seqs[i] += fields[1]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(seqs[i].replace("-", "")) <> letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            return None

        if self.records_per_alignment is not None \
        and self.records_per_alignment <> len(ids) :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        alignment_length = len(seqs[0])
        for i in range(len(ids)):
            if len(seqs[i]) <> alignment_length:
                raise ValueError(
                    "Error parsing alignment - sequences of different length?")
            alignment.add_sequence(ids[i], seqs[i])
        return alignment
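# Illustration (not from the original code): the block structure this parser
# expects, fed through Bio.AlignIO's "clustal" reader from an older Biopython.
# Each block line holds an identifier, a chunk of aligned sequence, and an
# optional cumulative residue count.
from StringIO import StringIO
from Bio import AlignIO

_clw = StringIO(
    "CLUSTAL W (1.81) multiple sequence alignment\n"
    "\n"
    "\n"
    "alpha           ACGT-ACGT\n"
    "beta            ACGTTACGT\n"
    "\n")
_aln = AlignIO.read(_clw, "clustal")
print _aln.get_alignment_length()   # 9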
Example #49
    def __init__(self):
        Alignment.__init__(self, Gapped(IUPAC.unambiguous_dna, '-'))
Example #50
    def next(self):

        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        while line.rstrip() <> "#=======================================":
            line = handle.readline()
            if not line:
                return None

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        seqs = []

        while line[0] == "#":
            #Read in the rest of this alignment header,
            #try and discover the number of records expected
            #and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            #And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
        and self.records_per_alignment <> number_of_seqs :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        seqs = ["" for id in ids]
        index = 0

        #Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    #identifier, seq start position, seq, seq end position
                    #(an aligned seq is broken up into multiple lines)
                    id, start = id_start
                    seq, end = seq_end

                    #The identifier is truncated...
                    assert 0 <= index and index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert id == ids[index] or id == ids[index][:len(id)]

                    #Check the start...
                    assert int(start) - 1 == len(seqs[index].replace("-","")), \
                        "Found %i chars so far for %s, file says start %i:\n%s" \
                            % (len(seqs[index]), id, int(start), seqs[index])

                    seqs[index] += seq

                    #Check the end ...
                    assert int(end) == len(seqs[index].replace("-","")), \
                        "Found %i chars so far for %s, file says end %i:\n%s" \
                            % (len(seqs[index]), id, int(end), seqs[index])

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    #just a start value, this is just alignment annotation (?)
                    #print "Skipping: " + line.rstrip()
                    pass
            elif line.strip() == "":
                #Just a spacer?
                pass
            else:
                print line
                assert False

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
            or line.rstrip() == "#=======================================" :
                #End of alignment
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
        and self.records_per_alignment <> len(ids) :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        for id, seq in zip(ids, seqs):
            if len(seq) <> length_of_seqs:
                raise ValueError(
                    "Error parsing alignment - sequences of different length?")
            alignment.add_sequence(id, seq)
        return alignment
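# Small illustration (not from the original snippet) of the two parsing rules
# used above on representative EMBOSS-style lines: header lines are split once
# on ":", and alignment body lines are split at column 21.
_hdr = "# Aligned_sequences: 2"
_parts = _hdr[1:].split(":", 1)
print _parts[0].lower().strip(), int(_parts[1].strip())   # aligned_sequences 2

_body = "seqA".ljust(19) + "1 " + "ACGT-ACGT      8"
print _body[:21].strip().split(None, 1)   # ['seqA', '1']
print _body[21:].strip().split(None, 1)   # ['ACGT-ACGT', '8']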
Example #51
    def next(self) :
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows."""

        handle = self.handle
        try :
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            print self._header.strip(), '--> self_header'
            del self._header
        except AttributeError:      
            line = handle.readline()
        if not line:
            return None

        if line.startswith('#-') :
            #Reached the end of the alignments, no need to read the footer...
            return None
        if line.startswith("##") :
            #Skip the file header before the alignments.  e.g.
#            print line.strip()
            line = self._skip_file_header(line)
#        print 'Back from file header skip'
        assert line.startswith('#'), line

        while not line.startswith('#=') :
            line = self.handle.readline()

        if line.startswith('#='):
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
        if not line :
            #End of file
            return None

        
        assert line.startswith(">>") and not line.startswith(">>>"), line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        match_descr = ""
        alignment_annotation = {}

        #This should be followed by the target match numbering line, then more tags.
        #e.g.
        """
        >>#2
        ; sw_score: 41.0
        ; sw_ident: 0.846
        ; sw_overlap: 13
        """
        
        if not line.startswith(">>") and not line.startswith(">>>") :
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #print match_descr, 'match'
        #Handle the following "alignment hit" tagged data, e.g.
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line.startswith("; ")

        #Then we have the alignment numbers and sequence for the query
        """
        >gi|10955265| ..
        ; sq_len: 346
        ; sq_offset: 1
        ; sq_type: p
        ; al_start: 197
        ; al_stop: 238
        ; al_display_start: 167
        DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        GEYFTENKPKYIIREIHQET
        """
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split()[0])
        
        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line.startswith("; ")

        #Now should have the aligned query sequence (with leading flanking region)
        while not line.startswith(">") :
            query_seq_parts.append(line.strip())
            line = handle.readline()
#            print 'queryseq', line.strip()
        #Handle the following "match alignment" data
        """
        >gi|152973545|ref|YP_001338596.1| ..
        ; sq_len: 242
        ; sq_type: p
        ; al_start: 52
        ; al_stop: 94
        ; al_display_start: 22
        IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        QDFAFTRKMRREARQVEQSW
        """
        #Match identifier
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        #print '----->', line.strip(), match_descr
        match_descr = line[1:].split()[0] + match_descr
        
        #assert match_descr.startswith(line[1:].split()[0])
#        assert self._match_descr.startswith(line[1:].split()[0])

        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line.startswith("; ")
        
        #Now should have the aligned query sequence with flanking region...
        while not (line.startswith(">") or ">>>" in line) and not line.startswith('#'):
            match_seq_parts.append(line.strip())
            line = handle.readline()

        if line.startswith('>') or '>>>' in line:
            self._header = line

        #We built a list of strings and then joined them because
        #it's faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the coordinates.
        
        if ("sq_offset" in query_annotation and query_annotation["sq_offset"] != "1") \
        or ("sq_offset" in match_annotation and match_annotation["sq_offset"] != "1") :
            #Note that until some point in the v35 series, FASTA always recorded one
            #for the query offset, and omitted the match offset (even when these
            #were set with the -X command line option).
            #TODO - Work out how exactly the use of -X offsets changes things.
            #raise ValueError("Offsets from the -X command line option are not (yet) supported")
            pass

# this is not useful when using stretcher
#        if len(query_align_seq) != len(match_align_seq) :
#            raise ValueError("Problem parsing the alignment sequence coordinates")
        if "sw_overlap" in alignment_annotation :
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alignment = Alignment(self.alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}
        
        #Want to record both the query header tags, and the alignment tags.
        for key, value in self._query_header_annotation.iteritems() :
            alignment._annotations[key] = value
        for key, value in alignment_annotation.iteritems() :
            alignment._annotations[key] = value
            

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(self._query_descr, query_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == self._query_descr or record.description == self._query_descr
        assert record.seq.tostring() == query_align_seq
        record.id = self._query_descr.split()[0].strip(",")
        record.name = "query"
        record.annotations["original_length"] = int(query_annotation["sq_len"])
        # My own addition
        for k in query_annotation.keys():
            record.annotations[k] = query_annotation[k]

        alignment.add_sequence(match_descr, match_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == match_descr or record.description == match_descr
        assert record.seq.tostring() == match_align_seq
        record.id = match_descr.split()[0].strip(",")
        record.name = "match"
        record.annotations["original_length"] = int(match_annotation["sq_len"])
        # My own addition
        for k in match_annotation.keys():
            record.annotations[k] = match_annotation[k]

        return alignment
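# Usage sketch (hypothetical file name): in an older Biopython the pairwise
# alignments produced by a parser like this are normally obtained through
# Bio.AlignIO's "fasta-m10" format, one two-row alignment per hit.
from Bio import AlignIO

for _pair in AlignIO.parse(open("fasta_m10_output.txt"), "fasta-m10"):
    _query, _match = _pair.get_all_seqs()
    print _query.id, _match.id, _pair.get_alignment_length()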
Example #52
class Align(object):
    """docstring for Align"""
    def __init__(self, input):
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None
    
    def _clean(self, outtemp):
        # cleanup temp file
        os.remove(outtemp)
        # cleanup input file
        os.remove(self.input)
    
    def _find_ends(self, forward=True):
        """determine the first (or last) position where all reads in an alignment 
        start/stop matching"""
        if forward:
            theRange = xrange(self.alignment.get_alignment_length())
        else:
            theRange = reversed(xrange(self.alignment.get_alignment_length()))
        for col in theRange:
            if '-' in self.alignment.get_column(col):
                pass
            else:
                break
        return col
    
    def _base_checker(self, bases, sequence, loc):
        """ensure that any trimming that occurs does not start beyong the
        end of the sequence being trimmed"""
        # deal with the case where we just want to measure out from the
        # middle of a particular sequence
        if not isinstance(loc, (list, tuple)):
            # a single position (e.g. the alignment midpoint) was passed
            loc = (loc, loc)
        if not bases > len(sequence.seq[:loc[0]]) and \
            not bases > len(sequence.seq[loc[1]:]):
            return True
    
    def _record_formatter(self, temp, sequence):
        """return temp wrapped in a biopython SeqRecord, copying the metadata
        of the sequence it was sliced from"""
        temp_record = SeqRecord(temp)
        temp_record.id = sequence.id
        temp_record.name = sequence.name
        temp_record.description = sequence.description
        return temp_record
    
    def _alignment_summary(self, alignment):
        """return summary data for an alignment object using the AlignInfo
        class from BioPython"""
        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus()
        return summary, consensus
    
    def _read(self, format):
        """read an alignment from the CLI - largely for testing purposes"""
        self.alignment = AlignIO.read(open(self.input,'rU'), format)
    
    def get_probe_location(self):
        '''Pull the probe sequence from an alignment object and determine its position
        within the read'''
        # probe at bottom => reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                start = re.search('^-*', str(record.seq))
                end   = re.search('-*$', str(record.seq))
                # should be first record
                break
        # ooh, this seems so very backwards
        self.ploc = (start.end(), end.start(),)
    
    def run_alignment(self, clean = True, consensus = True):
        """Align, as originally written gets bogged down. Add communicate method 
        and move away from pipes for holding information (this has always been 
        problematic for me with multiprocessing).  Move to tempfile-based
        output."""
        # create results file
        fd, outtemp = tempfile.mkstemp(suffix='.align')
        os.close(fd)
        # run MUSCLE on the temp file
        cline = MuscleCommandline(input=self.input, out=outtemp)
        stdout, stderr = subprocess.Popen(str(cline),
                                 stderr=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 shell=True).communicate(None)
        self.alignment = AlignIO.read(open(outtemp,'rU'), "fasta", alphabet = Gapped(IUPAC.unambiguous_dna, "-"))
        # build a dumb consensus
        if consensus:
            self.alignment_summary, self.alignment_consensus = \
                self._alignment_summary(self.alignment)
        # cleanup temp files
        if clean:
            self._clean(outtemp)
    
    def running_average(self, window_size, threshold):
        # iterate across the columns of the alignment and determine presence
        # or absence of base-identity in the column
        differences = []
        for column in xrange(self.alignment.get_alignment_length()):
            column_values = self.alignment.get_column(column)
            # get the count of different bases in a column (converting
            # it to a set gets only the unique values)
            if len(set(list(column_values))) > 1:
                differences.append(0)
            else:
                differences.append(1)
        # compute the running average from the start => end of the sequence
        forward_average = []
        for start in xrange(len(differences)):
            end = start + window_size
            if end < len(differences):
                forward_average.append(sum(differences[start:end])/float(len(differences[start:end])))
        # compute the running average from the end => start of the sequence
        # we do this, because, otherwise, this end would be neglected.
        reverse_average = []
        for end in reversed(xrange(-len(differences), 0)):
            start = end - window_size
            if start > -len(differences):
                reverse_average.append(sum(differences[start:end])/float(len(differences[start:end])))
        # find where each running average first reaches some threshold 
        # identity over the run span chosen.
        for start_clip, avg in enumerate(forward_average):
            if round(avg, 1) >= float(threshold):
                break
        for temp_end_clip, avg in enumerate(reverse_average):
            if round(avg, 1) >= float(threshold):
                end_clip = len(differences) - temp_end_clip
                break
        return start_clip, end_clip
    
    def trim_alignment(self, method = 'edges', remove_probe = None, bases = None, consensus = True, window_size = 20, threshold = 0.5):
        """Trim the alignment"""
        if method == 'edges':
            # find edges of the alignment
            start   = self._find_ends(forward=True)
            end     = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold)
        # create a new alignment object to hold our alignment
        self.trimmed_alignment = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
        for sequence in self.alignment:
            # ignore the probe sequence we added
            if (method == 'edges' or method == 'running') and not remove_probe:
                # biopython's Alignment only lets us add a name and str(sequence)
                # via add_sequence(), so to keep the sliced SeqRecord intact we
                # fudge and append to the private _records list
                self.trimmed_alignment._records.append(sequence[start:end])
            elif method == 'static' and not remove_probe and bases:
                # get middle of alignment and trim out from that - there's a
                # weakness here in that we are not actually locating the probe
                # region, we're just locating the middle of the alignment
                mid_point = len(sequence)/2
                if self._base_checker(bases, sequence, mid_point):
                    self.trimmed_alignment._records.append(
                        sequence[mid_point-bases:mid_point+bases]
                        )
                else:
                    self.trimmed_alignment = None
            elif method == 'static' and not remove_probe and bases and self.ploc:
                # get middle of alignment and trim out from that - there's a
                # weakness here in that we are not actually locating the probe
                # region, we're just locating the middle of the alignment
                if self._base_checker(bases, sequence, self.ploc):
                    self.trimmed_alignment._records.append(
                        sequence[self.ploc[0]-bases:self.ploc[1]+bases]
                        )
                else:
                    self.trimmed_alignment = None
            elif remove_probe and self.ploc:
                # we have to drop to sequence level to add sequence slices
                # where we basically slice around the probes location
                temp = sequence.seq[:self.ploc[0]] + sequence.seq[self.ploc[1]:]
                self.trimmed_alignment._records.append( \
                    self._record_formatter(temp, sequence)
                    )
            elif method == 'static' and remove_probe and bases and self.ploc:
                if self._base_checker(bases, sequence, self.ploc):
                    temp = sequence.seq[self.ploc[0]-bases:self.ploc[0]] + \
                        sequence.seq[self.ploc[1]:self.ploc[1]+bases]
                    self.trimmed_alignment._records.append( \
                        self._record_formatter(temp, sequence)
                        )
                else:
                    self.trimmed_alignment = None
        # build a dumb consensus
        if consensus:
            self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)
    
    def trim_ambiguous_bases(self):
        """snip ambiguous bases from a trimmed_alignment"""
        ambiguous_bases = []
        # do this by finding all ambiguous bases and then snipping the largest
        # chunk with no ambiguous bases from the entire alignment
        for column in xrange(0, self.trimmed_alignment.get_alignment_length()):
            if 'N' in self.trimmed_alignment.get_column(column):
                ambiguous_bases.append(column)
        maximum = 0
        maximum_pos = None
        #pdb.set_trace()
        if ambiguous_bases:
            # prepend and append the start and end of the sequence so we also
            # consider the chunks outside the first and last ambiguous-base runs.
            ambiguous_bases.insert(0,0)
            ambiguous_bases.append(self.trimmed_alignment.get_alignment_length() - 1)
            # create a new alignment object to hold our alignment
            self.perfect_trimmed_alignment = \
                Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
            for pos in xrange(len(ambiguous_bases)):
                if pos + 1 < len(ambiguous_bases):
                    difference = ambiguous_bases[pos + 1] - \
                        ambiguous_bases[pos]
                    if difference > maximum:
                        maximum = difference
                        maximum_pos = (pos, pos+1)
                else:
                    pass
            # make sure we catch cases where there is no best block
            if maximum_pos:
                for sequence in self.trimmed_alignment:
                    self.perfect_trimmed_alignment._records.append(
                        sequence[ambiguous_bases[maximum_pos[0]] + 1
                            :ambiguous_bases[maximum_pos[1]]]
                            )
            else:
                self.perfect_trimmed_alignment = None
        else:
            self.perfect_trimmed_alignment = self.trimmed_alignment
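# Usage sketch for the Align class above.  The input file name is hypothetical
# and MUSCLE must be available on the PATH for run_alignment() to succeed.
aln = Align("reads_plus_probe.fasta")
aln.run_alignment(clean=False)     # clean=False keeps the input and temp files
aln.trim_alignment(method='running', window_size=20, threshold=0.5)
aln.trim_ambiguous_bases()
if aln.perfect_trimmed_alignment is not None:
    print aln.perfect_trimmed_alignment.get_alignment_length()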
Example #53
    print(consensus)
    consensus = summary.gap_consensus(ambiguous="N")
    print(consensus)
    print("")
    print(summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus))
    print("")
    # Have a generic alphabet, without a declared gap char, so we must
    # provide the expected frequencies and chars to ignore explicitly.
    print(summary.information_content(e_freq_table=expected,
                                      chars_to_ignore=['-']))
    print("")
    print("Trying a protein sequence with gaps and stops")

    alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
    a = Alignment(alpha)
    a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
    a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
    a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
    print(a)
    print("=" * a.get_alignment_length())

    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    print(c)
    c = s.gap_consensus(ambiguous="X")
    print(c)
    print("")
    print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c))

    print(s.information_content(chars_to_ignore=['-', '*']))
Example #54
    def next(self) :
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows."""

        handle = self.handle
        try :
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            print self._header.strip(), '--> self_header'
            del self._header
        except AttributeError:      
            line = handle.readline()
        if not line:
            return None

        if line.startswith('#-') :
            #Reached the end of the alignments, no need to read the footer...
            return None
        if line.startswith("##") :
            #Skip the file header before the alignments.  e.g.
#            print line.strip()
            line = self._skip_file_header(line)
#        print 'Back from file header skip'
        assert line.startswith('#'), line

        while not line.startswith('#=') :
            line = self.handle.readline()

        if line.startswith('#='):
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
        if not line :
            #End of file
            return None

        
        assert line.startswith(">>") and not line.startswith(">>>"), line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        match_descr = ""
        alignment_annotation = {}

        #This should be followed by the target match numbering line, then more tags.
        #e.g.
        """
        >>#2
        ; sw_score: 41.0
        ; sw_ident: 0.846
        ; sw_overlap: 13
        """
        
        if not line.startswith(">>") and not line.startswith(">>>") :
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #print match_descr, 'match'
        #Handle the following "alignment hit" tagged data, e.g.
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line.startswith("; ")

        #Then we have the alignment numbers and sequence for the query
        """
        >gi|10955265| ..
        ; sq_len: 346
        ; sq_offset: 1
        ; sq_type: p
        ; al_start: 197
        ; al_stop: 238
        ; al_display_start: 167
        DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        GEYFTENKPKYIIREIHQET
        """
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split()[0])
        
        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line.startswith("; ")

        #Now should have the aligned query sequence (with leading flanking region)
        while not line.startswith(">") :
            query_seq_parts.append(line.strip())
            line = handle.readline()
#            print 'queryseq', line.strip()
        #Handle the following "match alignment" data
        """
        >gi|152973545|ref|YP_001338596.1| ..
        ; sq_len: 242
        ; sq_type: p
        ; al_start: 52
        ; al_stop: 94
        ; al_display_start: 22
        IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        QDFAFTRKMRREARQVEQSW
        """
        #Match identifier
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        #print '----->', line.strip(), match_descr
        match_descr = line[1:].split()[0] + match_descr
        
        #assert match_descr.startswith(line[1:].split()[0])
#        assert self._match_descr.startswith(line[1:].split()[0])

        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line.startswith("; ")
        
        #Now should have the aligned query sequence with flanking region...
        while not (line.startswith(">") or ">>>" in line) and not line.startswith('#'):
            match_seq_parts.append(line.strip())
            line = handle.readline()
            if not line:
                #End of file
                return None
        if line.startswith('>') or '>>>' in line:
            self._header = line

        #We built a list of strings and then joined them because
        #it's faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the coordinates.
        
        if ("sq_offset" in query_annotation and query_annotation["sq_offset"] != "1") \
        or ("sq_offset" in match_annotation and match_annotation["sq_offset"] != "1") :
            #Note that until some point in the v35 series, FASTA always recorded one
            #for the query offset, and omitted the match offset (even when these
            #were set with the -X command line option).
            #TODO - Work out how exactly the use of -X offsets changes things.
            #raise ValueError("Offsets from the -X command line option are not (yet) supported")
            pass

# this is not useful when using stretcher
#        if len(query_align_seq) != len(match_align_seq) :
#            raise ValueError("Problem parsing the alignment sequence coordinates")
        if "sw_overlap" in alignment_annotation :
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alignment = Alignment(self.alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}
        
        #Want to record both the query header tags, and the alignment tags.
        for key, value in self._query_header_annotation.iteritems() :
            alignment._annotations[key] = value
        for key, value in alignment_annotation.iteritems() :
            alignment._annotations[key] = value
            

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(self._query_descr, query_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == self._query_descr or record.description == self._query_descr
        assert record.seq.tostring() == query_align_seq
        record.id = self._query_descr.split()[0].strip(",")
        record.name = "query"
        record.annotations["original_length"] = int(query_annotation["sq_len"])
        # My own addition
        for k in query_annotation.keys():
            record.annotations[k] = query_annotation[k]

        alignment.add_sequence(match_descr, match_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == match_descr or record.description == match_descr
        assert record.seq.tostring() == match_align_seq
        record.id = match_descr.split()[0].strip(",")
        record.name = "match"
        record.annotations["original_length"] = int(match_annotation["sq_len"])
        # My own addition
        for k in match_annotation.keys():
            record.annotations[k] = match_annotation[k]

        return alignment
Example #55
def main():
    # Configuration
    #Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    if (len(argv) > 1):
        dnaFileName = argv[1]
    else:
        dnaFileName = None
    dnaSeqFile = fileinput.input(dnaFileName)
    dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))

    # Translate the sequences
    aaSeqRecords = []
    for key in dnaSeqDict:
        aaSeq = SeqRecord(dnaSeqDict[key].seq.translate(table=translationTable), id=key)
        aaSeqRecords.append(aaSeq)
    dnaSeqFile.close()

    # Replace stop codons with X (unknown aa) so muscle doesn't drop them
    for aaSeq in aaSeqRecords:
        noStopCodonSeq = str(aaSeq.seq).replace('*', 'X')
        aaSeq.seq = Seq(noStopCodonSeq)

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    childProcess = subprocess.Popen(commandLine, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=(sys.platform!="win32")) #don't pipe stderr or muscle hangs
    SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
    childProcess.stdin.close()
    aaAlignment = AlignIO.read(childProcess.stdout, "fasta")

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        aaCount = 0
        dnaSeq = ''
        for aaResidue in taxon.seq:
            if (aaResidue == '-'):
                dnaSeq = dnaSeq + '---'
            else:
                dnaSeq = dnaSeq + dnaSeqDict[taxon.id].seq[aaCount*3:aaCount*3+3]
                aaCount+=1
        # As we add the sequences to the alignment, remove the gene name from the sequence id so the taxa match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], str(dnaSeq))
    if (dnaFileName):
        outFileName = dnaFileName.split('.')[0] + '_aln.phy'
    else:
        outFileName = 'out_aln.phy'
    outFile = open(outFileName, 'w+')
    AlignIO.write([dnaAlignment], outFile, "phylip")

#I think this section should be removed.  If I put the 'I' into the alignment file now, I can't open the alignment with BioPython-based scripts (for manual editing etc).  I can use pamlize.py to add the I right before using paml.
    # Biopython doesn't tag Interleaved phylip files and PAML requires it so...
#    outFile.seek(0,0)
#    modifiedAlignmentText = outFile.readlines()
#    modifiedAlignmentText[0] = modifiedAlignmentText[0].rstrip() + ' I\n'
#    outFile.seek(0,0)
#    outFile.writelines(modifiedAlignmentText)

    outFile.close()
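# Toy illustration (not part of the original script) of the back-translation
# loop above: each aligned amino acid is replaced by its source codon and each
# gap becomes "---", so the resulting DNA alignment stays in frame.
_dna = "ATGGCTTGG"        # encodes M, A, W
_aa_aligned = "MA-W"      # gap introduced by the protein alignment
_codon_aligned = ""
_aa_count = 0
for _res in _aa_aligned:
    if _res == '-':
        _codon_aligned += '---'
    else:
        _codon_aligned += _dna[_aa_count * 3:_aa_count * 3 + 3]
        _aa_count += 1
print _codon_aligned      # ATGGCT---TGG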
Example #56
    def next(self) :
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows."""
        handle = self.handle

        try :
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:      
            line = handle.readline()
        if not line:
            return None

        if line.startswith("#") :
            #Skip the file header before the alignments.  e.g.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>") :
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
            #Now should be some alignments, but if not we move onto the next query
        if not line :
            #End of file
            return None
        if ">>><<<" in line :
            #Reached the end of the alignments, no need to read the footer...
            return None


        #Should start >>... and not >>>...
        assert line[0:2] == ">>" and not line[2] == ">", line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        match_descr = ""
        alignment_annotation = {}

        #This should be followed by the target match ID line, then more tags.
        #e.g.
        """
        >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
        ; fa_frame: f
        ; fa_initn:  52
        ; fa_init1:  52
        ; fa_opt:  70
        ; fa_z-score: 105.5
        ; fa_bits: 27.5
        ; fa_expect:  0.082
        ; sw_score: 70
        ; sw_ident: 0.279
        ; sw_sim: 0.651
        ; sw_overlap: 43
        """
        if (not line[0:2] == ">>") or line[0:3] == ">>>" :
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #Handle the following "alignment hit" tagged data, e.g.
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "
        
        #Then we have the alignment numbers and sequence for the query
        """
        >gi|10955265| ..
        ; sq_len: 346
        ; sq_offset: 1
        ; sq_type: p
        ; al_start: 197
        ; al_stop: 238
        ; al_display_start: 167
        DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        GEYFTENKPKYIIREIHQET
        """
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split(None,1)[0])
        
        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned query sequence (with leading flanking region)
        while not line[0] == ">" :
            query_seq_parts.append(line.strip())
            line = handle.readline()
        
        #Handle the following "match alignment" data
        """
        >gi|152973545|ref|YP_001338596.1| ..
        ; sq_len: 242
        ; sq_type: p
        ; al_start: 52
        ; al_stop: 94
        ; al_display_start: 22
        IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        QDFAFTRKMRREARQVEQSW
        """
        #Match identifier
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        assert match_descr.startswith(line[1:].split(None,1)[0])
        
        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned query sequence with flanking region...
        #but before that, since FASTA 35.4.1 there can be a consensus here,
        """
        ; al_cons:
        .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...: 
        etc
        """
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        if line[0:2] == "; " :
            assert line.strip() == "; al_cons:"
            align_consensus_parts = []
            line = handle.readline()
            while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
                align_consensus_parts.append(line.strip())
                line = handle.readline()
            #If we do anything with this in future, must remove any flanking region.
            align_consensus = "".join(align_consensus_parts)
            del align_consensus_parts
            assert not line[0:2] == "; "
        else :
            align_consensus = None
        assert (line[0] == ">" or ">>>" in line)
        self._header = line

        #We built a list of strings and then joined them because
        #it's faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
        #How can we do this for the (optional) consensus?

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the coordinates.
        
        if len(query_align_seq) != len(match_align_seq) :
            raise ValueError("Problem parsing the alignment sequence coordinates, " 
                             "following should be the same length but are not:\n"
                             "%s - len %i\n%s - len %i" % (query_align_seq,
                                                           len(query_align_seq),
                                                           match_align_seq,
                                                           len(match_align_seq)))
        if "sw_overlap" in alignment_annotation :
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = Alignment(alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}
        
        #Want to record both the query header tags, and the alignment tags.
        for key, value in self._query_header_annotation.iteritems() :
            alignment._annotations[key] = value
        for key, value in alignment_annotation.iteritems() :
            alignment._annotations[key] = value
            

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(self._query_descr, query_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == self._query_descr or record.description == self._query_descr
        #assert record.seq.tostring() == query_align_seq
        record.id = self._query_descr.split(None,1)[0].strip(",")
        record.name = "query"
        record.annotations["original_length"] = int(query_annotation["sq_len"])

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_annotation :
            if query_annotation["sq_type"] == "D" :
                record.seq.alphabet = generic_dna
            elif query_annotation["sq_type"] == "p" :
                record.seq.alphabet = generic_protein
        if "-" in query_align_seq :
            if not hasattr(record.seq.alphabet,"gap_char") :
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")
        
        alignment.add_sequence(match_descr, match_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == match_descr or record.description == match_descr
        #assert record.seq.tostring() == match_align_seq
        record.id = match_descr.split(None,1)[0].strip(",")
        record.name = "match"
        record.annotations["original_length"] = int(match_annotation["sq_len"])

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_annotation :
            if match_annotation["sq_type"] == "D" :
                record.seq.alphabet = generic_dna
            elif match_annotation["sq_type"] == "p" :
                record.seq.alphabet = generic_protein
        if "-" in match_align_seq :
            if not hasattr(record.seq.alphabet,"gap_char") :
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
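
#Usage sketch (an addition, not part of the parser above): assuming this
#parser is the one Bio.AlignIO exposes as the "fasta-m10" format, the pairwise
#alignments from a FASTA run saved with "-m 10" could be read like this.
#The file name "fasta_output.m10" is a hypothetical placeholder.
from Bio import AlignIO
for alignment in AlignIO.parse(open("fasta_output.m10"), "fasta-m10"):
    query, match = alignment.get_all_seqs()
    print query.id, match.id, alignment.get_alignment_length()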
Example #57
0
    def next(self):

        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                return None

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        seqs = []

        while line[0] == "#":
            #Read in the rest of this alignment header,
            #try and discover the number of records expected
            #and their length
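            #For illustration, the header block being scanned here looks
            #roughly like this (hypothetical names and values):
            #  # Aligned_sequences: 2
            #  # 1: seqA
            #  # 2: seqB
            #  # Length: 120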
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            #And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        seqs = ["" for id in ids]
        index = 0

        #Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    #identifier, seq start position, seq, seq end position
                    #(an aligned seq is broken up into multiple lines)
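                    #For illustration, a hypothetical line (identifier and start
                    #in the first 21 columns, then the sequence chunk and end):
                    #  seqA               1 ACGT-ACGTACGT     12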
                    id, start = id_start
                    seq, end = seq_end

                    #The identifier is truncated...
                    assert 0 <= index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert id == ids[index] or id == ids[index][:len(id)]

                    #Check the start...
                    if int(start) == 0:
                        #Special case when one sequence starts long before the other
                        assert len(seqs[index].replace("-", "")) == 0
                        assert len(seq.replace("-", "")) == 0, line
                    elif int(start) == len(seqs[index].replace("-", "")):
                        #Special case when one sequence ends long before the other
                        assert len(seq.replace("-", "")) == 0, line
                    else:
                        assert int(start) - 1 == len(seqs[index].replace("-","")), \
                        "Found %i chars so far for sequence %i (%s), file says start %i:\n%s" \
                            % (len(seqs[index].replace("-","")), index, id,
                               int(start), seqs[index])

                    seqs[index] += seq

                    #Check the end ...
                    assert int(end) == len(seqs[index].replace("-","")), \
                        "Found %i chars so far for %s, file says end %i:\n%s" \
                            % (len(seqs[index].replace("-","")), id, int(end), repr(seqs[index]))

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    #Just a start value - this appears to be alignment annotation
                    #print "Skipping: " + line.rstrip()
                    pass
            elif line.strip() == "":
                #Just a spacer?
                pass
            else:
                raise ValueError("Unrecognised EMBOSS alignment line:\n%r" % line)

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
            or line.rstrip() == "#=======================================" :
                #End of alignment
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids) :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        for id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                #EMBOSS 2.9.0 is known to use spaces instead of minus signs
                #for leading gaps, and thus fails to parse.  This old version
                #is still used as of Dec 2008 behind the EBI SOAP webservice:
                #http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            alignment.add_sequence(id, seq)
        return alignment
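
#Usage sketch (an addition, not part of the parser above): assuming this
#iterator backs Bio.AlignIO's "emboss" format, output from EMBOSS needle or
#water could be read like this.  "needle_output.txt" is a hypothetical
#placeholder file name.
from Bio import AlignIO
for alignment in AlignIO.parse(open("needle_output.txt"), "emboss"):
    print alignment.get_alignment_length()
    for record in alignment.get_all_seqs():
        print record.id, len(record.seq)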
Example #58
0
    def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
        Alignment.__init__(self, alphabet)
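
#Usage sketch (an addition): an Alignment subclass defined with this __init__
#defaults to a gapped, ambiguous-DNA alphabet, so gapped sequences containing
#IUPAC ambiguity codes can be added without passing an alphabet explicitly.
#"MyAlignment" below is a hypothetical name for such a subclass:
#    align = MyAlignment()
#    align.add_sequence("seq_a", "ACGT-RYACGT")
#    align.add_sequence("seq_b", "ACGTNRY-AGT")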