Ejemplo n.º 1
0
def main():
    """Rewrite every input NEXUS alignment using abbreviated sequence names.

    Reads the input location from ``args.nexus`` and writes one NEXUS file
    per input file into ``args.output`` (same base name). After each file is
    processed its zero-based index is printed as a progress indicator.
    """
    args = get_args()
    files = get_files(args.nexus)
    # First pass: gather every distinct sequence name across all files so a
    # single, consistent abbreviation map can be built.
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                old_names.add(seq.name)
    name_map = abbreviator(old_names)
    # Second pass: copy each alignment, substituting the abbreviated names.
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # The context manager guarantees the output handle is flushed and
            # closed even when the writer raises.
            with open(outf, 'w') as handle:
                AlignIO.write(new_align, handle, 'nexus')
        except ValueError:
            # Kept from the original: drop into the debugger on a write error.
            pdb.set_trace()
        print(count)
def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file as a FASTA-formatted alignment.

    For each contig the consensus sequence is written first, followed by one
    row per read whose name does not contain "pseudo". Each read is passed
    through ``cut_ends`` (with its quality-clipping start/end) and then
    ``pad_read`` (with its padded start and the consensus length).

    Args:
        in_file: path of the ACE file to read.
        out_file: path of the FASTA file to create (overwritten).
    """
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            # Plain iteration stops cleanly at the end of the file and lets
            # real parse errors propagate instead of being reported as
            # "All contigs treated".
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                # The consensus sequence is the first row of the alignment.
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)

                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        # Pseudo reads are never written, so skip the work.
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)

                output_file.write(align.format("fasta"))
    print("All contigs treated")
Ejemplo n.º 3
0
 def _domain_alignment(self,alignment,domain_region, alignment_index):
     """Cut the columns covering a domain out of an alignment.

     The row at ``alignment_index`` is the reference: its non-gap characters
     are counted, and the columns at which the count first equals
     ``domain_region.start`` and ``domain_region.stop`` become the slice
     bounds applied to every row.

     Returns:
         (domain_alignment, column_start, column_stop)
     """
     # NOTE(review): the slice below excludes column_stop itself; confirm
     # whether domain_region.stop is meant to be inclusive.
     reference = str(alignment[alignment_index].seq)
     first_col = None
     last_col = None
     residues_seen = 0
     for col, symbol in enumerate(reference):
         if symbol != '-':
             residues_seen += 1
         if residues_seen == domain_region.start and first_col is None:
             first_col = col
         if residues_seen == domain_region.stop and last_col is None:
             last_col = col
             break
     assert first_col is not None, str(first_col)
     assert last_col is not None, str(last_col)
     # Copy the selected column window of every row into a new alignment.
     sub_alignment = Alignment(alphabet = alignment._alphabet)
     for row in alignment:
         window = str(row.seq)[first_col:last_col]
         sub_alignment.add_sequence(row.id, window)
     return (sub_alignment, first_col, last_col)
def add_gaps_to_align(organisms, missing, align, verbatim=False, genera=False, min_taxa=3):
    """Pad an alignment with all-'?' rows for organisms that have no sequence.

    Args:
        organisms: names of every organism expected in the output. It is
            deep-copied, so the caller's list is not modified.
        missing: optional mapping of organism name (or name + "*") to the
            loci known to be absent; when truthy, each padded organism is
            checked against it.
        align: iterable of alignments; the result of the last one processed
            is returned.
        verbatim: when true, sequence names are only lower-cased.
        genera: optional iterable of substrings; a sequence name containing
            one of them keeps only its last "_"-separated field.
        min_taxa: minimum number of rows an alignment must have.

    Returns:
        The padded alignment, or None when ``align`` is empty or an
        alignment has fewer than ``min_taxa`` rows.

    Raises:
        AssertionError: a padded organism's locus is not listed in
            ``missing``.
    """
    local_organisms = copy.deepcopy(organisms)
    # Defined up front so an empty ``align`` returns None instead of raising
    # UnboundLocalError at the final return.
    new_align = None
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        overall_length = len(a[0])
        for seq in a:
            if genera and any(sp for sp in genera if sp in seq.name):
                new_seq_name = '_'.join(seq.name.split('_')[-1:])
            elif not verbatim:
                new_seq_name = '_'.join(seq.name.split('_')[-2:])
            else:
                new_seq_name = seq.name.lower()
            new_align.add_sequence(new_seq_name, str(seq.seq))
            local_organisms.remove(new_seq_name)
        # The locus name comes from the last sequence of this alignment; it
        # is the same for every padded organism, so compute it once.
        if genera and any(sp for sp in genera if sp in seq.name):
            loc = '_'.join(seq.name.split('_')[:-1])
        elif not verbatim:
            loc = '_'.join(seq.name.split('_')[:-2])
        else:
            loc = seq.name
        for org in local_organisms:
            if missing:
                # Look under the plain organism key first, then fall back to
                # the starred key.
                try:
                    listed = loc in missing[org]
                except KeyError:
                    listed = False
                if not listed and loc not in missing['{}*'.format(org)]:
                    # Raised explicitly so the check survives ``python -O``.
                    raise AssertionError("Locus missing")
            new_align.add_sequence(org, '?' * overall_length)
    return new_align
Ejemplo n.º 5
0
def main():
    """Rewrite NEXUS alignments with shortened sequence names.

    For every input file, sequences whose name contains ".copy" are dropped
    and the remaining names are reduced to their "_"-separated fields from
    index ``options.position`` onward. The result is written under
    ``options.output`` with the input's base name, and the zero-based file
    index is printed as progress.
    """
    options, args = interface()
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                if '.copy' in seq.name:
                    continue
                new_seq_name = '_'.join(
                    seq.name.split('_')[options.position:])
                new_align.add_sequence(new_seq_name, str(seq.seq))
        outf = os.path.join(options.output, os.path.split(f)[1])
        try:
            # Context manager so the output file is always closed.
            with open(outf, 'w') as handle:
                AlignIO.write(new_align, handle, 'nexus')
        except ValueError:
            # Kept from the original: drop into the debugger on a write error.
            pdb.set_trace()
        print(count)
Ejemplo n.º 6
0
def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file as a FASTA-formatted alignment.

    Each contig contributes its consensus sequence followed by one row per
    read whose name does not contain "pseudo". Reads are passed through
    ``cut_ends`` (quality-clipping start/end) and ``pad_read`` (padded start,
    consensus length) before being added.

    Args:
        in_file: path of the ACE file to read.
        out_file: path of the FASTA file to create (overwritten).
    """
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            # Iterating the parser ends cleanly at end of file; genuine
            # parse errors now propagate instead of being swallowed by a
            # bare ``except``.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                # Consensus goes in first.
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)

                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        # Never written, so do not bother trimming/padding.
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)

                output_file.write(align.format("fasta"))
    print("All contigs treated")
Ejemplo n.º 7
0
 def _domain_alignment(self, alignment, domain_region, alignment_index):
     """Return the sub-alignment spanning ``domain_region``.

     Residues (non-'-' characters) of the row at ``alignment_index`` are
     counted column by column; the first columns at which the count equals
     ``domain_region.start`` and ``domain_region.stop`` bound the slice
     taken from every row.

     Returns:
         A tuple ``(domain_alignment, column_start, column_stop)``.
     """
     reference_row = str(alignment[alignment_index].seq)
     start_column = None
     stop_column = None
     residue_total = 0
     for index, char in enumerate(reference_row):
         residue_total += 1 if char != '-' else 0
         if residue_total == domain_region.start and start_column is None:
             start_column = index
         if residue_total == domain_region.stop and stop_column is None:
             stop_column = index
             break
     assert start_column is not None, str(start_column)
     assert stop_column is not None, str(stop_column)
     # Every row contributes the same column window.
     trimmed = Alignment(alphabet=alignment._alphabet)
     for entry in alignment:
         trimmed.add_sequence(entry.id,
                              str(entry.seq)[start_column:stop_column])
     return (trimmed, start_column, stop_column)
Ejemplo n.º 8
0
def createAlignment(sequences, alphabet):
    """Build an Alignment whose rows are named sequence0, sequence1, ..."""
    result = Alignment(alphabet)
    for index, row in enumerate(sequences):
        result.add_sequence("sequence%d" % index, row)
    return result
Ejemplo n.º 9
0
Archivo: oid.py Proyecto: bsmithers/hpf
def phylip(handle):
    """Parse a sequential PHYLIP protein alignment from an open handle.

    The first line must hold two fields (sequence count and column count).
    Every following non-blank line is ``<name> <sequence>``; whitespace
    inside the sequence part (e.g. blocks of ten residues) is removed.

    Args:
        handle: open text handle positioned at the PHYLIP header line.

    Returns:
        An Alignment over the gapped IUPAC protein alphabet.

    Raises:
        ValueError: the header does not have exactly two fields, or a data
            line has a name but no sequence.
    """
    # The values are unused; the unpack only validates the header shape.
    seqs,columns = handle.readline().split()
    from Bio.Align.Generic import Alignment
    from Bio.Alphabet import IUPAC, Gapped
    alignment = Alignment(Gapped(IUPAC.protein, "-"))
    for line in handle:
        if not line.strip():
            # Blank lines (including a trailing one) used to crash the
            # two-way unpack below.
            continue
        name, seq = line.split(None, 1)
        alignment.add_sequence(name, "".join(seq.split()))
    return alignment
Ejemplo n.º 10
0
 def build_align( self, seq ):
     """Split ``seq`` into ``self.segment_size`` chunks, one alignment row each.

     The resulting alignment is stored on ``self.friendly``; nothing is
     returned.
     """
     # NOTE(review): `name` is not defined in this method, so it must come
     # from module scope or the add_sequence call raises NameError - confirm.
     segments = Alignment( Gapped( DNAAlphabet() ) )
     unused_alphabet = self.alphabet  # read as in the original, never used
     total = len( seq )
     width = self.segment_size
     for begin in range( 0, total, width ):
         end = begin + width
         segments.add_sequence( name, seq[begin : end] )
     self.friendly = segments
Ejemplo n.º 11
0
def createAlignment(sequences, alphabet):
    """Wrap ``sequences`` in an Alignment, naming row i "sequence<i>"."""
    alignment = Alignment(alphabet)
    position = 0
    for item in sequences:
        alignment.add_sequence("sequence{0}".format(position), item)
        position = position + 1
    return alignment
Ejemplo n.º 12
0
 def testCulledColumnMapper(self):
     """Check that CulledColumnMapper maps culled indices to source columns."""
     source = "ABCDEFGHI"
     removed_columns = [0,1,4,8]
     # Removing columns 0, 1, 4 and 8 from the source leaves this string.
     expected = "CDFGH"
     alignment = Alignment(Gapped(IUPAC.protein, "-"))
     alignment.add_sequence("test", source)
     mapper = CulledColumnMapper(alignment, removed_columns)
     for position in range(len(expected)):
         assert source[mapper[position]] == expected[position]
Ejemplo n.º 13
0
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count, per contig, the sequences whose name contains each tag.

    Writes a tab-separated matrix to ``out_file``: a header row
    (gene_name, gene_length, one column per tag, XX_noTag) followed by one
    row per contig that has at least ``min_seq`` sequences (consensus
    included). Reads whose name contains "pseudo" are ignored.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the matrix file to create (overwritten).
        tags: sequence of tag strings searched for in sequence names.
        min_seq: minimum number of sequences a contig must have.
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    no_tag = "XX_noTag"
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            output_file.write("gene_name\tgene_length")
            for tag in tags:
                output_file.write("\t" + tag)
            output_file.write("\t" + no_tag)
            output_file.write("\n")
            # Iterating the parser lets real parse errors propagate instead
            # of being mistaken for the end of the file.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta_2list(align.format("fasta"))
                if len(sequences) < min_seq:
                    continue
                contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
                contig_length = str(len(sequences[0][1].replace("*", "")))
                output_file.write(contig_name + "\t" + contig_length)
                print("Treating %s" % contig_name)
                counts = dict.fromkeys(tags, 0)
                untagged = 0
                for fasta in sequences:
                    found_tag = False
                    for tag in counts:
                        if tag in fasta[0]:
                            counts[tag] += 1
                            found_tag = True
                    if not found_tag and "Consensus" not in fasta[0]:
                        untagged += 1
                # Emit counts in header order. The previous sorted() order
                # only matched the header when ``tags`` happened to be
                # sorted and to sort before "XX_noTag".
                row = [str(counts[tag]) for tag in tags]
                row.append(str(untagged))
                output_file.write("\t" + "\t".join(row))
                output_file.write("\n")
    print("***All contigs treated***")
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count, per contig, the sequences whose name contains each tag.

    The output is a tab-separated matrix: a header row (gene_name,
    gene_length, one column per tag, XX_noTag) and then one row per contig
    holding at least ``min_seq`` sequences (consensus included). Reads whose
    name contains "pseudo" are left out.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the matrix file to create (overwritten).
        tags: sequence of tag strings searched for in sequence names.
        min_seq: minimum number of sequences a contig must have.
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    no_tag = "XX_noTag"
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            header = ["gene_name", "gene_length"]
            header.extend(tags)
            header.append(no_tag)
            output_file.write("\t".join(header))
            output_file.write("\n")
            # A for-loop over the parser replaces the bare ``except`` that
            # treated every error as end-of-input.
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                consensus_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   consensus_length)
                    align.add_sequence(read.rd.name, seq)
                sequences = read_fasta_2list(align.format("fasta"))
                if len(sequences) < min_seq:
                    continue
                contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
                contig_length = str(len(sequences[0][1].replace("*", "")))
                output_file.write(contig_name + "\t" + contig_length)
                print("Treating %s" % contig_name)
                counts = dict.fromkeys(tags, 0)
                untagged = 0
                for fasta in sequences:
                    matched = [tag for tag in counts if tag in fasta[0]]
                    for tag in matched:
                        counts[tag] += 1
                    if not matched and "Consensus" not in fasta[0]:
                        untagged += 1
                # Columns follow the header order, not sorted() order, so
                # every count lands under its own tag.
                row = [str(counts[tag]) for tag in tags]
                row.append(str(untagged))
                output_file.write("\t" + "\t".join(row))
                output_file.write("\n")
    print("***All contigs treated***")
Ejemplo n.º 15
0
def main():
    """Copy NEXUS alignments, dropping the taxa listed in ``args.taxa``.

    Each file found via ``args.input`` is rewritten under ``args.output``
    with the same base name; the zero-based file index is printed after
    each file.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name not in args.taxa:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Context manager so the output handle is closed (and flushed)
        # before the next file is processed.
        with open(outf, 'w') as handle:
            AlignIO.write(new_align, handle, 'nexus')
        print(count)
def rename(align, first, second):
    """Yield a copy of each alignment with sequence ids shortened.

    Each id is split on "_". When ``second`` is truthy the new name is the
    first three characters of field ``first`` joined by "_" to the first
    three characters of field ``second``; otherwise it is field ``first``
    unchanged.

    Args:
        align: iterable of alignments.
        first: index of the first "_"-separated field (0 is valid).
        second: index of the second field, or a falsy value for none.

    Yields:
        One new alignment per input alignment.
    """
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            # Branch on ``second`` only. The old ``first and second`` test
            # treated first == 0 as absent and left new_seq_name unbound
            # (or stale from the previous sequence).
            if second:
                new_seq_name = '_'.join([split_name[first][0:3], split_name[second][0:3]])
            else:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def rename(align, first, second):
    """Yield a renamed copy of every alignment in ``align``.

    Sequence ids are split on "_". With a truthy ``second`` the new name is
    ``<first field[:3]>_<second field[:3]>``; with a falsy ``second`` it is
    the whole field at index ``first``.

    Args:
        align: iterable of alignments.
        first: index of the first "_"-separated field (0 is valid).
        second: index of the second field, or a falsy value for none.

    Yields:
        One new alignment per input alignment.
    """
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            if not second:
                new_seq_name = split_name[first]
            else:
                # Reached for any truthy ``second``, including when
                # first == 0. The previous ``first and second`` guard
                # skipped both branches in that case.
                new_seq_name = '_'.join(
                    [split_name[first][0:3], split_name[second][0:3]])
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def main():
    """Copy NEXUS alignments, keeping only the selected taxa.

    The taxa to keep come from ``get_samples_to_run(args, taxa)``, where
    ``taxa`` is the result of ``get_all_taxon_names`` on the input files.
    Each file is rewritten under ``args.output`` with the same base name and
    its zero-based index is printed afterwards.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    taxa = get_all_taxon_names(nexus_files)
    taxa_to_keep = get_samples_to_run(args, taxa)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name in taxa_to_keep:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Close the output deterministically rather than leaking the handle.
        with open(outf, 'w') as handle:
            AlignIO.write(new_align, handle, 'nexus')
        print(count)
Ejemplo n.º 19
0
def proteins_alignment_to_biopython(al, seq1, seq2, name1, name2):
    """Convert our internal alignment format into BioPython Alignment.

    Args:
        al: iterable of ``(a, b)`` index pairs, one per alignment column;
            -1 marks a gap in the corresponding sequence.
        seq1: first sequence, indexed by ``a``.
        seq2: second sequence, indexed by ``b``.
        name1: row name for the first sequence.
        name2: row name for the second sequence.

    Returns:
        A two-row Alignment over the gapped IUPAC protein alphabet.
    """
    row1 = []
    row2 = []
    for a, b in al:
        row1.append(seq1[a].upper() if a != -1 else "-")
        # The second row needs its gap character too; without it the two
        # rows end up with different lengths.
        row2.append(seq2[b].upper() if b != -1 else "-")
    align = Alignment(Gapped(IUPAC.protein, "-"))
    align.add_sequence(name1, "".join(row1))
    align.add_sequence(name2, "".join(row2))
    return align
def parse_ace(ace_file):
    """Read the first contig of an ACE file and build its read alignment.

    The alignment holds the consensus first, then one row per read, named
    ``<read name>_<coru>`` where ``coru`` comes from the read's AF entry.
    Each read is passed through ``cut_ends`` (quality-clipping start/end)
    and ``pad_read`` (padded start, consensus length).

    Args:
        ace_file: path of the ACE file.

    Returns:
        ``(contig, align)``.

    Raises:
        StopIteration: the file contains no contig.
    """
    # Only the first contig is needed, so the file can be closed as soon as
    # it has been parsed. The builtin next() works on Python 2.6+ and 3.
    with open(ace_file, 'r') as handle:
        contig = next(Ace.parse(handle))
    align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    align.add_sequence(contig.name, contig.sequence)
    consensus_length = len(contig.sequence)

    for readn, read in enumerate(contig.reads):
        placement = contig.af[readn]
        seq = cut_ends(read.rd.sequence,
                       read.qa.qual_clipping_start,
                       read.qa.qual_clipping_end)
        seq = pad_read(seq, placement.padded_start, consensus_length)
        align.add_sequence(read.rd.name + "_" + placement.coru, seq)

    return contig, align
Ejemplo n.º 21
0
 def get_alignment(self):
     """Construct an alignment from the aligned sequences in this tree.

     Returns:
         An Alignment using the alphabet of the first aligned sequence, or
         None (after issuing a warning) when the tree holds no aligned
         sequences.
     """
     def seq_is_aligned(node):
         if isinstance(node, Sequence) and node.mol_seq.is_aligned:
             return True
         return False
     seqs = self.depth_first_search(self, seq_is_aligned)
     try:
         first_seq = next(seqs)
     except StopIteration:
         warnings.warn("No aligned sequences were found in this tree.",
                 Warning, stacklevel=2)
         # Without a first sequence there is no alphabet to build from;
         # falling through would fail on the unbound ``first_seq``.
         return None
     aln = Alignment(first_seq.get_alphabet())
     aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
     for seq in seqs:
         aln.add_sequence(str(seq), seq.mol_seq.value)
     return aln
Ejemplo n.º 22
0
 def get_alignment(self):
     """Construct an alignment from the aligned sequences in this tree.

     Returns:
         An Alignment using the alphabet of the first aligned sequence found
         by a preorder search, or None when there are no aligned sequences.
     """
     def is_aligned_seq(node):
         if isinstance(node, Sequence) and node.mol_seq.is_aligned:
             return True
         return False
     seqs = self._filter_search(is_aligned_seq, 'preorder', True)
     try:
         # Builtin next() instead of the Python-2-only ``.next()`` method;
         # identical on Python 2.6+ and also valid on Python 3.
         first_seq = next(seqs)
     except StopIteration:
         # No aligned sequences were found. An Alignment cannot be built
         # without an alphabet, so there is nothing to return.
         return None
     aln = Alignment(first_seq.get_alphabet())
     aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
     for seq in seqs:
         aln.add_sequence(str(seq), seq.mol_seq.value)
     return aln
Ejemplo n.º 23
0
def strarray2biopy(align):
    """Convert ``(rows, ids)`` into a Biopython DNA alignment.

    ``align[0]`` holds the rows, each an iterable of single-character
    strings, and ``align[1]`` holds the matching ids by position.
    """
    rows = align[0]
    labels = align[1]

    result = Alignment(Gapped(IUPAC.unambiguous_dna))

    for index, row in enumerate(rows):
        # Collapse the row's characters into one sequence string.
        result.add_sequence(labels[index], ''.join(row))

    return result
def add_gaps_to_align(organisms,
                      missing,
                      align,
                      verbatim=False,
                      genera=False,
                      min_taxa=3):
    """Pad an alignment with all-'?' rows for organisms that have no sequence.

    Args:
        organisms: names of every organism expected in the output. It is
            deep-copied, so the caller's list is not modified.
        missing: optional mapping of organism name (or name + "*") to the
            loci known to be absent; when truthy, each padded organism is
            checked against it.
        align: iterable of alignments; the result of the last one processed
            is returned.
        verbatim: when true, sequence names are only lower-cased.
        genera: optional iterable of substrings; a sequence name containing
            one of them keeps only its last "_"-separated field.
        min_taxa: minimum number of rows an alignment must have.

    Returns:
        The padded alignment, or None when ``align`` is empty or an
        alignment has fewer than ``min_taxa`` rows.

    Raises:
        AssertionError: a padded organism's locus is not listed in
            ``missing``.
    """
    local_organisms = copy.deepcopy(organisms)
    # Bound before the loop so an empty ``align`` yields None rather than
    # an UnboundLocalError at the return statement.
    new_align = None
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        overall_length = len(a[0])
        for seq in a:
            in_genera = genera and any(sp for sp in genera if sp in seq.name)
            if in_genera:
                new_seq_name = '_'.join(seq.name.split('_')[-1:])
            elif not verbatim:
                new_seq_name = '_'.join(seq.name.split('_')[-2:])
            else:
                new_seq_name = seq.name.lower()
            new_align.add_sequence(new_seq_name, str(seq.seq))
            local_organisms.remove(new_seq_name)
        # The locus prefix is taken from the last sequence seen and does not
        # depend on the organism being padded, so derive it once.
        if genera and any(sp for sp in genera if sp in seq.name):
            loc = '_'.join(seq.name.split('_')[:-1])
        elif not verbatim:
            loc = '_'.join(seq.name.split('_')[:-2])
        else:
            loc = seq.name
        for org in local_organisms:
            if missing:
                # Plain organism key first, starred key as the fallback.
                try:
                    listed = loc in missing[org]
                except KeyError:
                    listed = False
                if not listed and loc not in missing['{}*'.format(org)]:
                    # Explicit raise keeps the check active under -O.
                    raise AssertionError("Locus missing")
            new_align.add_sequence(org, '?' * overall_length)
    return new_align
def main():
    """Rewrite NEXUS alignments with shortened sequence names.

    Sequences whose name contains ".copy" are skipped. Every other name is
    reduced to its "_"-separated fields from index ``options.position``
    onward. Each result is written under ``options.output`` with the input's
    base name, and the zero-based file index is printed as progress.
    """
    options, args = interface()
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, "nexus"):
            for seq in align:
                if ".copy" in seq.name:
                    continue
                fields = seq.name.split("_")[options.position :]
                new_align.add_sequence("_".join(fields), str(seq.seq))
        outf = os.path.join(options.output, os.path.split(f)[1])
        try:
            # ``with`` closes the output file even if the writer raises.
            with open(outf, "w") as handle:
                AlignIO.write(new_align, handle, "nexus")
        except ValueError:
            # Kept from the original: drop into the debugger on a write error.
            pdb.set_trace()
        print(count)
Ejemplo n.º 26
0
def formatData (AlignData, Score):
    """Keep the records whose column-score penalty is not too negative.

    Every record in ``AlignData.Alignment`` starts at 0 points. For each
    column ``j`` of its sequence:

    * ``Score[j] <= 450``: a non-gap character costs 2 points;
    * ``Score[j] >= 2000``: a non-gap character earns 2 points and a gap
      costs 1 point.

    Records ending with at least -250 points are each wrapped in their own
    single-row protein Alignment.

    Args:
        AlignData: object whose ``Alignment`` attribute iterates over
            records exposing ``seq`` and ``id``.
        Score: per-column scores, indexable by column position.

    Returns:
        A list of single-row Alignment objects, in input order.
    """
    LIMIT1 = 450
    LIMIT2 = 2000
    KEEP_THRESHOLD = -250
    DataList = list()
    # Points are tracked per record as it is visited, so there is no fixed
    # 6196-slot table that larger inputs could overflow.
    for record in AlignData.Alignment:
        sequence = str(record.seq)
        points = 0
        for j, c in enumerate(sequence):
            if Score[j] <= LIMIT1:
                if c != '-':
                    points -= 2
            if Score[j] >= LIMIT2:
                if c != '-':
                    points += 2
                else:
                    points -= 1
        if points >= KEEP_THRESHOLD:
            NewAlignData = Alignment(Gapped(IUPAC.protein,"-"))
            NewAlignData.add_sequence(record.id, sequence)
            DataList.append(NewAlignData)

    return DataList
Ejemplo n.º 27
0
    def next(self):
        """Parse the next Stockholm alignment from ``self.handle``.

        The header line is taken from ``self._header`` when a previous call
        stashed one, otherwise it is read from the handle. Lines are then
        consumed until end of file or until the header of a following
        alignment is seen, in which case that header is stashed in
        ``self._header`` for the next call.

        Side effects: sets ``self.ids``, ``self.sequences``,
        ``self.seq_annotation`` and ``self.seq_col_annotation`` from the
        parsed data.

        Returns:
            An Alignment with one record per sequence identifier, or None
            when the handle is exhausted or no sequences were found.

        Raises:
            ValueError: the first line is not "# STOCKHOLM 1.0", a sequence
                line cannot be split into identifier and sequence, the
                record count differs from ``self.records_per_alignment``,
                or the sequences have different lengths.
        """
        try:
            # Header saved while parsing the previous alignment, if any.
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            #Empty file - just give up.
            return
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")
            #import sys
            #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0'

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}  # identifier -> concatenated sequence text
        ids = []  # identifiers in order of first appearance
        gs = {}  # identifier -> {feature: [text, ...]} from #=GS lines
        gr = {}  # identifier -> {feature: markup string} from #=GR lines
        gf = {}  # feature -> [text, ...] from #=GF lines; not used further in this method
        passed_end_alignment = False
        while 1:
            line = self.handle.readline()
            if not line: break  #end of file
            line = line.strip()  #remove trailing \n
            if line == '# STOCKHOLM 1.0':
                # Start of the next alignment: keep its header for the next call.
                self._header = line
                break
            elif line == "//":
                #The "//" line indicates the end of the alignment.
                #There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                #blank line, ignore
                pass
            elif line[0] != "#":
                #Sequence
                #Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    #This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " \
                                      + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, '')
                # "." is converted to "-" before the text is appended.
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                #Comment line or meta-data
                if line[:5] == "#=GF ":
                    #Generic per-File annotation, free text
                    #Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    #Each feature key could be used more than once,
                    #so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    #Generic per-Column annotation, exactly 1 char per column
                    #Format: "#=GC <feature> <exactly 1 char per column>"
                    #These lines are recognised but not stored.
                    pass
                elif line[:5] == '#=GS ':
                    #Generic per-Sequence annotation, free text
                    #Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids :
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    #Generic per-Sequence AND per-Column markup
                    #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids :
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip(
                    )  # append to any previous entry
                    #TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            #Next line...

        assert len(seqs) <= len(ids)
        #assert len(gs)   <= len(ids)
        #assert len(gr)   <= len(ids)

        # Expose the parsed data on the instance.
        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids) :
                raise ValueError("Found %i records in this alignment, told to expect %i" \
                                 % (len(ids), self.records_per_alignment))

            alignment = Alignment(self.alphabet)

            #TODO - Introduce an annotated alignment class?
            #For now, store the annotation a new private property:
            alignment._annotations = gr

            # NOTE(review): indexing dict.values() only works where it
            # returns a list (Python 2) - confirm the target interpreter.
            alignment_length = len(seqs.values()[0])
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )
                name, start, end = self._identifier_split(id)
                alignment.add_sequence(id, seq, start=start, end=end)

                # Fetch the record just added so its fields can be adjusted.
                record = alignment.get_all_seqs()[-1]

                assert record.id == id or record.description == id

                record.id = id
                record.name = name
                record.description = id

                #will be overridden by _populate_meta_data if an explicit
                #accession is provided:
                record.annotations["accession"] = name

                self._populate_meta_data(id, record)
            return alignment
        else:
            return None
Ejemplo n.º 28
0
    def next(self) :
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows."""
        handle = self.handle

        try :
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:      
            line = handle.readline()
        if not line:
            return None

        if line.startswith("#") :
            #Skip the file header before the alignments.  e.g.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>") :
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
            #Now should be some alignments, but if not we move onto the next query
        if not line :
            #End of file
            return None
        if ">>><<<" in line :
            #Reached the end of the alignments, no need to read the footer...
            return None


        #Should start >>... and not >>>...
        assert line[0:2] == ">>" and not line[2] == ">", line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        match_descr = ""
        alignment_annotation = {}

        #This should be followed by the target match ID line, then more tags.
        #e.g.
        """
        >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
        ; fa_frame: f
        ; fa_initn:  52
        ; fa_init1:  52
        ; fa_opt:  70
        ; fa_z-score: 105.5
        ; fa_bits: 27.5
        ; fa_expect:  0.082
        ; sw_score: 70
        ; sw_ident: 0.279
        ; sw_sim: 0.651
        ; sw_overlap: 43
        """
        if (not line[0:2] == ">>") or line[0:3] == ">>>" :
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #Handle the following "alignment hit" tagged data, e.g.
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "
        
        #Then we have the alignment numbers and sequence for the query
        """
        >gi|10955265| ..
        ; sq_len: 346
        ; sq_offset: 1
        ; sq_type: p
        ; al_start: 197
        ; al_stop: 238
        ; al_display_start: 167
        DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        GEYFTENKPKYIIREIHQET
        """
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split(None,1)[0])
        
        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned query sequence (with leading flanking region)
        while not line[0] == ">" :
            query_seq_parts.append(line.strip())
            line = handle.readline()
        
        #Handle the following "match alignment" data
        """
        >gi|152973545|ref|YP_001338596.1| ..
        ; sq_len: 242
        ; sq_type: p
        ; al_start: 52
        ; al_stop: 94
        ; al_display_start: 22
        IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        QDFAFTRKMRREARQVEQSW
        """
        #Match identifier
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        assert match_descr.startswith(line[1:].split(None,1)[0])
        
        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned query sequence with flanking region...
        #but before that, since FASTA 35.4.1 there can be an consensus here,
        """
        ; al_cons:
        .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...: 
        etc
        """
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        if line[0:2] == "; " :
            assert line.strip() == "; al_cons:"
            align_consensus_parts = []
            line = handle.readline()
            while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
                align_consensus_parts.append(line.strip())
                line = handle.readline()
            #If we do anything with this in future, must remove any flanking region.
            align_consensus = "".join(align_consensus_parts)
            del align_consensus_parts
            assert not line[0:2] == "; "
        else :
            align_consensus = None
        assert (line[0] == ">" or ">>>" in line)
        self._header = line

        #We built a list of strings and then joined them because
        #its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
        #How can we do this for the (optional) consensus?

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the coordinates.
        
        if len(query_align_seq) != len(match_align_seq) :
            raise ValueError("Problem parsing the alignment sequence coordinates, " 
                             "following should be the same length but are not:\n"
                             "%s - len %i\n%s - len %i" % (query_align_seq,
                                                           len(query_align_seq),
                                                           match_align_seq,
                                                           len(match_align_seq)))
        if "sw_overlap" in alignment_annotation :
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = Alignment(alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}
        
        #Want to record both the query header tags, and the alignment tags.
        for key, value in self._query_header_annotation.iteritems() :
            alignment._annotations[key] = value
        for key, value in alignment_annotation.iteritems() :
            alignment._annotations[key] = value
            

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(self._query_descr, query_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == self._query_descr or record.description == self._query_descr
        #assert record.seq.tostring() == query_align_seq
        record.id = self._query_descr.split(None,1)[0].strip(",")
        record.name = "query"
        record.annotations["original_length"] = int(query_annotation["sq_len"])

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_annotation :
            if query_annotation["sq_type"] == "D" :
                record.seq.alphabet = generic_dna
            elif query_annotation["sq_type"] == "p" :
                record.seq.alphabet = generic_protein
        if "-" in query_align_seq :
            if not hasattr(record.seq.alphabet,"gap_char") :
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")
        
        alignment.add_sequence(match_descr, match_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == match_descr or record.description == match_descr
        #assert record.seq.tostring() == match_align_seq
        record.id = match_descr.split(None,1)[0].strip(",")
        record.name = "match"
        record.annotations["original_length"] = int(match_annotation["sq_len"])

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_annotation :
            if match_annotation["sq_type"] == "D" :
                record.seq.alphabet = generic_dna
            elif match_annotation["sq_type"] == "p" :
                record.seq.alphabet = generic_protein
        if "-" in match_align_seq :
            if not hasattr(record.seq.alphabet,"gap_char") :
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Ejemplo n.º 29
0
from Bio.Align.FormatConvert import FormatConverter
from Bio.Align import AlignInfo
from Bio.Fasta import FastaAlign
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

#Sanity checks on an alignment that does not hold any sequence yet
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert alignment.get_all_seqs() == []
del alignment

#Three row alignment built from one 26 letter string in mixed, lower
#and upper case; check its size, its rows, their descriptions and
#finally every column.
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment = Alignment(Alphabet.generic_alphabet)
expected_rows = [("mixed", letters),
                 ("lower", letters.lower()),
                 ("upper", letters.upper())]
for row_name, row_text in expected_rows:
    alignment.add_sequence(row_name, row_text)
assert alignment.get_alignment_length() == 26
assert len(alignment.get_all_seqs()) == 3
for row_index, (row_name, row_text) in enumerate(expected_rows):
    assert alignment.get_seq_by_num(row_index).tostring() == row_text
for row_index, (row_name, row_text) in enumerate(expected_rows):
    assert alignment.get_all_seqs()[row_index].description == row_name
for (col, letter) in enumerate(letters):
    expected_column = letter + letter.lower() + letter.upper()
    assert alignment.get_column(col) == expected_column
#Do not leave the helper names of this section behind at module level
del expected_rows, row_index, row_name, row_text, expected_column
#Check row extractions:
Ejemplo n.º 30
0
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step,
                   coverage, stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file
    
    """
    marker_number = 0
    min_freq = 0.05
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        with open(out_bamova, "w") as bamova_file:
            output_file.write("Contig_nb\tWindow\tHaplotype\n")
            contig_counter = 0
            ntreated = 0
            for contig in ace_gen:
                pass_haplo = False
                contig_counter += 1
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                align.add_sequence(contig.name, contig.sequence)
                if len(contig.reads) -1 < coverage:
                    continue
                ntreated += 1
                for readn in xrange(len(contig.reads)):
                    clipst = contig.reads[readn].qa.qual_clipping_start
                    clipe = contig.reads[readn].qa.qual_clipping_end
                    clipst2 = contig.reads[readn].qa.align_clipping_start
                    clipe2 = contig.reads[readn].qa.align_clipping_end
                    if clipst2 > clipst:
                        clipst = clipst2
                    if clipe2 < clipe2:
                        clipe = clipe2
                    start = contig.af[readn].padded_start
                    seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in contig.reads[readn].rd.name:
                        align.add_sequence(contig.reads[readn].rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
                contig_name = sequences[0][0]
                concensus = sequences[0][1]
                error_positions = multi_find("*", concensus)[::-1]
                for p in error_positions:
                    sequences = [[s[0], s[1][0:p] + s[1][p+1:]] for s in sequences]
                concensus = sequences[0][1]
                sequences = [[s[0], correct_sequence(concensus, s[1])]
                             for s in sequences[1:]]
                sequences, snp_pos = snp_positions(sequences)
                haplotypes = best_snps(sequences, snp_pos, coverage)
                if haplotypes != "Empty":
                    bamova = []
                    variants = list(sorted(list(set([h[-1] for h in haplotypes[-1]]))))
                    groups = list(sorted(set([h[0][:3] for h in haplotypes[-1]])))
                    if len(groups) >= ngroups:
                        pass_haplo = True
                        for g in groups:
                            if len([h[0] for h in haplotypes[-1] if h[0].startswith(g)]) < nhaplo:
                                pass_haplo = False
                    if pass_haplo:
                        print contig.name
                        bamova_file.write("Marker" + str(marker_number) + "\n")
                        group_number = 0
                        for g in groups:
                            bamova_file.write("Population\t" + str(group_number))
                            group_number += 1
                            for v in variants:
                                bamova_file.write("\t" + str(len([h for h in haplotypes[-1]
                                                  if h[-1] == v and h[0].startswith(g)])))
                            bamova_file.write("\n")
                        with open ("fasta_output/" + contig.name + ".fasta", "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in haplotypes[-1]:
                                f.write(">" + h[0] + str(marker_number) + "\n" + h[2] + "\n")
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write("Marker" + str(marker_number) + "\t" +
                                                  "\t".join([str(x) for x in h]) + "\t" +
                                                  ":".join(variants) + "\n")
                        marker_number += 1
                output_file.flush()
                bamova_file.flush()
                cutoff = 100000
                if contig_counter > cutoff:
                    break
        print "\n", str(ntreated), "contigs out of", str(contig_counter), "were treated"
Ejemplo n.º 31
0
    def next(self):
        """Parse and return the next alignment from the handle.

        Reads one CLUSTAL style alignment: a header line starting with
        one of the known program names, then one or more blocks of
        "identifier sequence [letter count]" lines, each block optionally
        followed by a consensus line.  If a further header line is found
        after the blocks (concatenated alignments) it is kept in
        self._header for the following call.

        Returns an Alignment built with self.alphabet, or None at the end
        of the file.  The version found in the header (if any) is stored
        as alignment._version and the consensus (if any) as
        alignment._star_info.

        Raises ValueError for an unknown header, for input that ends
        right after the header line, for lines that cannot be parsed,
        identifiers out of order, a wrong letter count, sequences of
        different lengths or a record count other than
        self.records_per_alignment.  Other consistency checks are plain
        asserts and so raise AssertionError.
        """
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        #Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
        if line.strip().split()[0] not in known_headers:
            raise ValueError("%s is not a known CLUSTAL header: %s" % \
                             (line.strip().split()[0],
                              ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in line.split():
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            if word[0] in '0123456789':
                version = word
                break

        #There should be two blank lines after the header line.
        #readline() returns "" at the end of the file, which also strips
        #to "", so the end of file must be tested explicitly or this
        #loop never finishes on a file that stops after its header.
        line = handle.readline()
        while line and line.strip() == "":
            line = handle.readline()
        if not line:
            raise ValueError("Unexpected end of file, "
                             "no sequences found after the header line")

        #If the alignment contains entries with the same sequence
        #identifier (not a good idea - but seems possible), then this
        #dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  #: Used to extract the consensus

        #Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                #Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    #The count covers the letters only, not the gaps
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            elif line[0] == " ":
                #Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                #Nothing but white space is allowed outside the columns
                #the sequences occupy.
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                #No consensus
                break
            line = handle.readline()
            if not line: break  #end of file

        assert line.strip() == ""
        assert seq_cols is not None

        #Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line: break  # end of file
            if not line: break  # end of file

            if line.split(None, 1)[0] in known_headers:
                #Found concatenated alignment.
                done = True
                self._header = line
                break

            #Each further block must list the same identifiers in the
            #same order as the first block.
            for i in range(len(ids)):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
                                      % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    #The block has a different width (e.g. a shorter last
                    #block); it must still start in the same column.
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                        seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                #Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    #Here the count is cumulative over all blocks so far
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                #Read in the next line
                line = handle.readline()
            #There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            return None

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids) :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        alignment_length = len(seqs[0])
        for i in range(len(ids)):
            if len(seqs[i]) != alignment_length:
                raise ValueError(
                    "Error parsing alignment - sequences of different length?")
            alignment.add_sequence(ids[i], seqs[i])
        #TODO - Handle alignment annotation better, for now
        #mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
def main():
    """Build a codon alignment by aligning the translated sequences.

    Reads DNA sequences in FASTA format from the file named by the first
    command line argument (or from standard input when there is none),
    translates them with NCBI translation table 11, aligns the proteins
    by piping them through muscle, and then replaces every aligned
    residue by its original codon and every gap by "---".

    The result is written in phylip format to "<input name without its
    extension>_aln.phy", or to "out_aln.phy" when reading from standard
    input.  Sequence names in the output are the part of the record id
    before the first "_".
    """
    # Local import: only this function needs it, and the import block of
    # the file is not visible from here.
    import os

    # Configuration
    #Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    if (len(argv) > 1):
        dnaFileName = argv[1]
    else:
        dnaFileName = None
    dnaSeqFile = fileinput.input(dnaFileName)
    try:
        dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))
    finally:
        dnaSeqFile.close()

    # Translate the sequences, replacing stop codons with X (unknown aa)
    # so muscle doesn't drop them
    aaSeqRecords = []
    for key in dnaSeqDict:
        aaSeq = SeqRecord(dnaSeqDict[key].seq.translate(table=translationTable), id=key)
        aaSeq.seq = Seq(str(aaSeq.seq).replace('*', 'X'))
        aaSeqRecords.append(aaSeq)

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    childProcess = subprocess.Popen(commandLine, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=(sys.platform!="win32")) #don't pipe stderr or muscle hangs
    try:
        SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
        childProcess.stdin.close()
        aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    finally:
        # Release the pipe and reap the child so it is not left behind
        # as a zombie process.
        childProcess.stdout.close()
        childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        sourceDna = dnaSeqDict[taxon.id].seq
        aaCount = 0  # number of non-gap residues consumed so far
        codons = []
        for aaResidue in taxon.seq:
            if (aaResidue == '-'):
                codons.append('---')
            else:
                codons.append(str(sourceDna[aaCount*3:aaCount*3+3]))
                aaCount+=1
        # As we add the sequences to the alignment remove gene name from the sequence id so they taxon match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], ''.join(codons))

    if (dnaFileName):
        # splitext() removes only the final extension, so directories or
        # base names containing dots ("./genes.fasta", "gene.v2.fasta")
        # keep their full stem; split('.')[0] cut them at the first dot.
        outFileName = os.path.splitext(dnaFileName)[0] + '_aln.phy'
    else:
        outFileName = 'out_aln.phy'

    # NOTE: the " I" (interleaved) flag PAML requires is deliberately not
    # added to the phylip header here, because Biopython-based scripts can
    # then no longer open the alignment (for manual editing etc).  Add it
    # right before running PAML instead, e.g. with pamlize.py.
    with open(outFileName, 'w') as outFile:
        AlignIO.write([dnaAlignment], outFile, "phylip")
Ejemplo n.º 33
0
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file
    
    """
    marker_number = 0
    min_freq = 0.05
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        with open(out_bamova, "w") as bamova_file:
            output_file.write("Contig_nb\tWindow\tHaplotype\n")
            contig_counter = 0
            ntreated = 0
            for contig in ace_gen:
                pass_haplo = False
                contig_counter += 1
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                align.add_sequence(contig.name, contig.sequence)
                if len(contig.reads) - 1 < coverage:
                    continue
                ntreated += 1
                for readn in xrange(len(contig.reads)):
                    clipst = contig.reads[readn].qa.qual_clipping_start
                    clipe = contig.reads[readn].qa.qual_clipping_end
                    clipst2 = contig.reads[readn].qa.align_clipping_start
                    clipe2 = contig.reads[readn].qa.align_clipping_end
                    if clipst2 > clipst:
                        clipst = clipst2
                    if clipe2 < clipe2:
                        clipe = clipe2
                    start = contig.af[readn].padded_start
                    seq = cut_ends(contig.reads[readn].rd.sequence, clipst,
                                   clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in contig.reads[readn].rd.name:
                        align.add_sequence(contig.reads[readn].rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
                contig_name = sequences[0][0]
                concensus = sequences[0][1]
                error_positions = multi_find("*", concensus)[::-1]
                for p in error_positions:
                    sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                 for s in sequences]
                concensus = sequences[0][1]
                sequences = [[s[0], correct_sequence(concensus, s[1])]
                             for s in sequences[1:]]
                sequences, snp_pos = snp_positions(sequences)
                haplotypes = best_snps(sequences, snp_pos, coverage)
                if haplotypes != "Empty":
                    bamova = []
                    variants = list(
                        sorted(list(set([h[-1] for h in haplotypes[-1]]))))
                    groups = list(
                        sorted(set([h[0][:3] for h in haplotypes[-1]])))
                    if len(groups) >= ngroups:
                        pass_haplo = True
                        for g in groups:
                            if len([
                                    h[0] for h in haplotypes[-1]
                                    if h[0].startswith(g)
                            ]) < nhaplo:
                                pass_haplo = False
                    if pass_haplo:
                        print contig.name
                        bamova_file.write("Marker" + str(marker_number) + "\n")
                        group_number = 0
                        for g in groups:
                            bamova_file.write("Population\t" +
                                              str(group_number))
                            group_number += 1
                            for v in variants:
                                bamova_file.write("\t" + str(
                                    len([
                                        h for h in haplotypes[-1]
                                        if h[-1] == v and h[0].startswith(g)
                                    ])))
                            bamova_file.write("\n")
                        with open("fasta_output/" + contig.name + ".fasta",
                                  "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in haplotypes[-1]:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    "Marker" + str(marker_number) + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1
                output_file.flush()
                bamova_file.flush()
                cutoff = 100000
                if contig_counter > cutoff:
                    break
        print "\n", str(ntreated), "contigs out of", str(
            contig_counter), "were treated"
Ejemplo n.º 34
0
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.
    
    """
    win_buffer = (win_len - 1) / 2
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start  # GOOD
                clipe = contig.reads[readn].qa.qual_clipping_end  # GOOD
                clipst2 = contig.reads[readn].qa.align_clipping_start  # Added
                clipe2 = contig.reads[readn].qa.align_clipping_end  # Added
                if clipst2 > clipst:  # Added
                    clipst = clipst2  # Added
                if clipe2 < clipe2:  # Added
                    clipe = clipe2  # Added
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            positions = []
            try:
                positions = snp_dict[contig_name]
            except:
                continue
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, sequences[0][1])
                else:
                    pos_ok = pos
                left = pos_ok - 5
                if left < 0:
                    left = 0
                right = pos_ok + 1 + 5  # takes into account the middle nucleotide
                ref_window = sequences[0][1][left:right]
                d.setdefault(pos, {})
                d[pos].setdefault("XX_noTag", {})
                for nuc in list("ACGTN*-"):
                    d[pos]["XX_noTag"].setdefault(nuc, 0)
                for tag in tags:
                    d[pos].setdefault(tag, {})
                    for nuc in list("ACGTN*-"):
                        d[pos][tag].setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    del_count = 0
                    if window.count("-") > win_buffer - 3:
                        continue  # Need at least 3 nucleotides on each side
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                        else:
                            t = "XX_noTag"
                    if len(ref_window) == len(window):
                        for i in xrange(len(window)):
                            if ref_window[i].isalpha() and window[i] == "*" or \
                               window[i].isalpha() and ref_window[i] == "*":
                                del_count += 1
                    if del_count > max_del:
                        continue
                    p = pos
                    s = fasta[1]  # Sequence
                    n = s[pos_ok - 1].upper()
                    d[p][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" +
                                      str(t))
                    for n in list("ACGTN*-"):
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")
Ejemplo n.º 35
0
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.
    
    """
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            window_len = 8 # PARAMETER
            max_diff = 3 # PARAMETER
            len_contig = len(sequences[0][1])
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                try:
                    start = len(re.findall("^-+", seq[1])[0])
                except:
                    start = 0
                len_seq = 0
                min_len_seq = 100 # PARAMETER
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = sequences[0][1][window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                len_seq -= seq.count("*")
                if len_seq >= min_len_seq:
                    index = float(count) / len_seq
                    if count > 0:
                        number_indexes +=1
                        total_indexes += index
                else:
                    index = "NA"
                #output_file.write(contig_name + "\t" + str(index) + "\n")
            try:
                mean_index = float(total_indexes) / number_indexes
            except:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
Ejemplo n.º 36
0
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.
    
    """
    win_buffer = (win_len - 1) / 2
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start # GOOD
                clipe = contig.reads[readn].qa.qual_clipping_end # GOOD
                clipst2 = contig.reads[readn].qa.align_clipping_start # Added
                clipe2 = contig.reads[readn].qa.align_clipping_end # Added
                if clipst2 > clipst: # Added
                    clipst = clipst2 # Added
                if clipe2 < clipe2: # Added
                    clipe = clipe2 # Added
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            positions = []
            try:
                positions = snp_dict[contig_name]
            except:
                continue
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, sequences[0][1])
                else:
                    pos_ok = pos
                left = pos_ok - 5
                if left < 0:
                    left = 0
                right = pos_ok + 1 + 5 # takes into account the middle nucleotide
                ref_window = sequences[0][1][left:right]
                d.setdefault(pos, {})
                d[pos].setdefault("XX_noTag", {})
                for nuc in list("ACGTN*-"):
                    d[pos]["XX_noTag"].setdefault(nuc, 0)
                for tag in tags:
                    d[pos].setdefault(tag, {})
                    for nuc in list("ACGTN*-"):
                        d[pos][tag].setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    del_count = 0
                    if window.count("-") > win_buffer - 3:
                        continue # Need at least 3 nucleotides on each side
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                        else:
                            t = "XX_noTag"
                    if len(ref_window) == len(window):
                        for i in xrange(len(window)):
                            if ref_window[i].isalpha() and window[i] == "*" or \
                               window[i].isalpha() and ref_window[i] == "*":
                                del_count += 1
                    if del_count > max_del:
                        continue
                    p = pos
                    s = fasta[1] # Sequence
                    n = s[pos_ok - 1].upper()
                    d[p][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" + 
                                      str(t))
                    for n in list("ACGTN*-"):
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")
Ejemplo n.º 37
0
#SeqRecord arguments:
#	seq: Seq object, required
#Additional attributes:
#	name, description: name and further information about the sequence
#	dbxrefs: list of strings, each string being the id of a database entry
#	features: list of SeqFeature objects, such as those found in Genbank records
#	annotations: dictionary with further info, cannot be set on initialization
seqrec=SeqRecord(Seq('mdstnvrsgmksrkkkpkttvidddddcmtcsacqsklvkisditkvsldyintmrgntlacaacgsslkllndfas',Bio.Alphabet.generic_protein), id='P20994.1', name='P20994', description='Protein A19', dbxrefs=['Pfam:PF05077', 'InterPro:IPR007769', 'DIP:2186N'])
#annotations is filled in after construction
seqrec.annotations['note']='A simple note'
print seqrec

#Sequence alignment data type: it stores an alignment, it does not compute one
from Bio.Align.Generic import Alignment
#Two rows of equal length, already gapped with '-'
seq1='MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW'
seq2='MH--IFIYQIGYALKSGYIQSIRSPEY-NW'
align=Alignment(Bio.Alphabet.Gapped(IUPAC.protein))	#instance of Alignment class
align.add_sequence('asp',seq1)
align.add_sequence('unk',seq2)
print align
#Alignment methods
#get_all_seqs: return all the sequences of the alignment as a list of SeqRecord
for s in align.get_all_seqs():	#iterating over align itself gives the same
	print '->',s.seq
#get_seq_by_num(n): return only the sequence selected by its index
print str(align.get_seq_by_num(1))	#Seq object
print align[0]	#SeqRecord object
print str(align[0].seq)
#get_alignment_length(): get the length of the alignment
print align.get_alignment_length()
#get_column(n): return a string with all the letters of column n
print align.get_column(0)
print align.get_column(2)
Ejemplo n.º 38
0
    consensus = summary.gap_consensus(ambiguous="N")
    print(consensus)
    print("")
    print(summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus))
    print("")
    # The alphabet is generic and declares no gap character, so we must
    # provide the frequencies and the chars to ignore explicitly.
    print(summary.information_content(e_freq_table=expected,
                                      chars_to_ignore=['-']))
    print("")
    print("Trying a protein sequence with gaps and stops")

    alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
    a = Alignment(alpha)
    a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
    a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
    a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
    print(a)
    print("=" * a.get_alignment_length())

    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    print(c)
    c = s.gap_consensus(ambiguous="X")
    print(c)
    print("")
    print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c))

    print(s.information_content(chars_to_ignore=['-', '*']))
Ejemplo n.º 39
0
from Bio import Clustalw
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

# An alignment without any row has no columns and no records.
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert len(alignment) == 0
del alignment

# Three rows built from one string: as typed, lower-cased and upper-cased.
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
_expected_rows = [
    ("mixed", letters),
    ("lower", letters.lower()),
    ("upper", letters.upper()),
]
for _row_name, _row_text in _expected_rows:
    alignment.add_sequence(_row_name, _row_text)
assert alignment.get_alignment_length() == 26
assert len(alignment) == 3
# Rows come back in insertion order, by number ...
for _row_index, (_row_name, _row_text) in enumerate(_expected_rows):
    assert alignment.get_seq_by_num(_row_index).tostring() == _row_text
# ... and by index, where the given name is found in the description.
for _row_index, (_row_name, _row_text) in enumerate(_expected_rows):
    assert alignment[_row_index].description == _row_name
# Every column is the same letter in its three spellings.
for (col, letter) in enumerate(letters):
    _expected_column = letter + letter.lower() + letter.upper()
    assert alignment.get_column(col) == _expected_column
#Check row extractions:
Ejemplo n.º 40
0
    def next(self):
        """Parse and return the next alignment read from ``self.handle``.

        The header line is taken from ``self._header`` when a previous call
        stashed one there (concatenated alignments), otherwise it is read
        from the handle.  The first block of lines supplies the sequence
        identifiers; each following block must repeat them in the same
        order and extends the sequences.

        Returns an ``Alignment`` built with ``self.alphabet``, or ``None``
        when there is no more input or no sequence data was found.  The
        version found in the header and the consensus line, when present,
        are stored on the result as ``_version`` and ``_star_info``.

        Raises ``ValueError`` for an unknown header word, an unparsable
        sequence line, identifiers out of order, a wrong letter count, rows
        of different length, or a record count that differs from
        ``self.records_per_alignment``.  Several other format expectations
        are only enforced with ``assert``.
        """
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:      
            line = handle.readline()
        if not line:
            #Nothing left to read
            return None

        #Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
        if line.strip().split()[0] not in known_headers:
            raise ValueError("%s is not a known CLUSTAL header: %s" % \
                             (line.strip().split()[0],
                              ", ".join(known_headers)))

        # find the clustal version in the header line: the first word that
        # starts with a digit, once surrounding brackets are removed
        # NOTE(review): a word that is exactly "()" becomes empty here and
        # word[0] would then raise IndexError.
        version = None
        for word in line.split():
            if word[0]=='(' and word[-1]==')':
                word = word[1:-1]
            if word[0] in '0123456789':
                version = word
                break

        #There should be two blank lines after the header line
        #NOTE(review): readline() returns "" at end of file, so this loop
        #does not terminate if the input stops right after the header.
        line = handle.readline()
        while line.strip() == "":
            line = handle.readline()

        #If the alignment contains entries with the same sequence
        #identifier (not a good idea - but seems possible), then this
        #dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None #: Used to extract the consensus

        #Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                #Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                #Every row of the block must sit in the same columns
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                    #The count covers the letters only, not the gaps
                    if len(fields[1].replace("-","")) != letters:
                        raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
            elif line[0] == " ":
                #Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                #Nothing but white space is allowed outside the columns
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                #No consensus
                break
            line = handle.readline()
            if not line : break #end of file

        assert line.strip() == ""
        assert seq_cols is not None

        #Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line : break # end of file
            if not line : break # end of file

            if line.split(None,1)[0] in known_headers:
                #Found concatenated alignment.
                #Keep its header line for the next call.
                done = True
                self._header = line
                break

            #One line per identifier, in the order of the first block
            for i in range(len(ids)):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()
                
                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
                                      % (fields[0], ids[i]))

                #The block may be narrower than the previous one: recompute
                #the column slice, which must still start at the same place
                if fields[1] != line[seq_cols]:
                    start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                #Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                    #Here the count is cumulative over all blocks so far
                    if len(seqs[i].replace("-","")) != letters:
                        raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)

                #Read in the next line
                line = handle.readline()
            #There should now be a consensus line
            #(only expected when the first block had one)
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Read in the next line
                line = handle.readline()
            

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            #No sequence data was found
            return None

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        #Build the alignment, one row per identifier
        alignment = Alignment(self.alphabet)
        alignment_length = len(seqs[0])
        for i in range(len(ids)):
            if len(seqs[i]) != alignment_length:
                raise ValueError("Error parsing alignment - sequences of different length?")
            alignment.add_sequence(ids[i], seqs[i])
        #TODO - Handle alignment annotation better, for now
        #mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
Ejemplo n.º 41
0
    def next(self):
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows.  The first
        row is named "query" and the second "match"; each record carries
        its "original_length" plus every tag of its own tagged section in
        ``annotations``.  The query header tags and the alignment tags are
        stored on the alignment as ``_annotations``.

        Returns None when the input is exhausted: end of file, or a line
        starting with '#-' (the footer) is reached.

        Raises ValueError when a '>>' target line or a '>... ..' sequence
        line is missing, or when "sw_overlap" disagrees with the length of
        the aligned query.  Other format expectations are only enforced
        with ``assert``.
        """
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        if line.startswith('#-'):
            #Reached the end of the alignments, no need to read the footer...
            return None
        if line.startswith("##"):
            #Skip the file header before the alignments.
            line = self._skip_file_header(line)
        assert line.startswith('#'), line

        #Move on to the '#=' line opening the query section.  Stop at end
        #of file too: readline() keeps returning "" there, which used to
        #make this loop spin forever.
        while line and not line.startswith('#='):
            line = handle.readline()

        if line.startswith('#='):
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
        if not line:
            #End of file
            return None

        assert line.startswith(">>") and not line.startswith(">>>"), line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        alignment_annotation = {}

        #This should be followed by the target match numbering line, then
        #more tags, e.g.
        #
        #    >>#2
        #    ; sw_score: 41.0
        #    ; sw_ident: 0.846
        #    ; sw_overlap: 13
        #
        #(This check still applies when asserts are disabled.)
        if not line.startswith(">>"):
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #Handle the following "alignment hit" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line.startswith("; ")

        #Then we have the alignment numbers and sequence for the query, e.g.
        #
        #    >gi|10955265| ..
        #    ; sq_len: 346
        #    ; sq_offset: 1
        #    ; sq_type: p
        #    ; al_start: 197
        #    ; al_stop: 238
        #    ; al_display_start: 167
        #    DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        #    QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        #    GEYFTENKPKYIIREIHQET
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split()[0])

        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line.startswith("; ")

        #Now should have the aligned query sequence (with leading flanking
        #region).  A file ending here is treated like the end of file met
        #while reading the match sequence below, instead of looping forever.
        while not line.startswith(">"):
            if not line:
                #End of file
                return None
            query_seq_parts.append(line.strip())
            line = handle.readline()

        #Handle the following "match alignment" data, e.g.
        #
        #    >gi|152973545|ref|YP_001338596.1| ..
        #    ; sq_len: 242
        #    ; sq_type: p
        #    ; al_start: 52
        #    ; al_stop: 94
        #    ; al_display_start: 22
        #    IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        #    RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        #    QDFAFTRKMRREARQVEQSW
        #
        #Match identifier
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        #The identifier of this line is put in front of the '>>' text
        match_descr = line[1:].split()[0] + match_descr

        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line.startswith("; ")

        #Now should have the aligned match sequence with flanking region...
        while not (line.startswith(">") or ">>>" in line) and not line.startswith('#'):
            match_seq_parts.append(line.strip())
            line = handle.readline()
            if not line:
                #End of file
                return None
        if line.startswith('>') or '>>>' in line:
            #Start of the next record: keep it for the next call
            self._header = line

        #We built a list of strings and then joined them because
        #its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

        #The "sq_offset" values can be specified with the -X command line
        #option; they appear to just shift the origin used in the
        #calculation of the coordinates.  Offsets other than "1" are
        #accepted without any special handling.
        #TODO - Work out how exactly the use of -X offsets changes things.

        #The two aligned sequences are deliberately not required to have
        #the same length: that check is not useful when using stretcher.
        if "sw_overlap" in alignment_annotation:
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alignment = Alignment(self.alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property.
        #Want to record both the query header tags, and the alignment tags
        #(the latter win when a key is in both).
        alignment._annotations = {}
        alignment._annotations.update(self._query_header_annotation)
        alignment._annotations.update(alignment_annotation)

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(self._query_descr, query_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == self._query_descr or record.description == self._query_descr
        assert record.seq.tostring() == query_align_seq
        record.id = self._query_descr.split()[0].strip(",")
        record.name = "query"
        record.annotations["original_length"] = int(query_annotation["sq_len"])
        #Also expose every raw query tag on the record
        record.annotations.update(query_annotation)

        alignment.add_sequence(match_descr, match_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == match_descr or record.description == match_descr
        assert record.seq.tostring() == match_align_seq
        record.id = match_descr.split()[0].strip(",")
        record.name = "match"
        record.annotations["original_length"] = int(match_annotation["sq_len"])
        #Also expose every raw match tag on the record.  This used to walk
        #the query's keys, raising KeyError for a tag only the query has
        #(e.g. sq_offset) and dropping tags only the match has.
        record.annotations.update(match_annotation)

        return alignment
Ejemplo n.º 42
0
    def next(self):
        """Read the next alignment from the handle and return it.

        The first line must hold two integers: the number of sequences and
        their length.  The first block gives, for each sequence, a name in
        the first ten characters followed by sequence data; any further
        blocks hold sequence data only, in the same order.  A header line
        met after a block is kept in ``self._header`` for the next call.

        Returns None when there is nothing left to read.  Raises ValueError
        for a malformed first line, an unexpected number of records, a file
        ending in the middle of a block or a sequence of the wrong length.
        """
        stream = self.handle

        # A header met at the end of the previous alignment takes priority
        # over reading a new line.
        try:
            header = self._header
            del self._header
        except AttributeError:
            header = stream.readline()

        if not header:
            return
        header = header.strip()
        fields = [field for field in header.split() if field]
        if len(fields) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs, length_of_seqs = int(fields[0]), int(fields[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(header)

        expected = self.records_per_alignment
        if expected is not None and expected != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, expected))

        # First block: names are STRICTLY truncated/padded to 10 characters
        # and need no white space before the sequence data.
        ids = []
        seqs = []
        for _ in range(number_of_seqs):
            row = stream.readline().rstrip()
            ids.append(row[:10].strip())
            seqs.append([row[10:].strip().replace(" ", "")])

        # Further blocks, if any.
        line = ""
        while True:
            # Blank lines separate the blocks.
            while not line.strip():
                line = stream.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            for index, pieces in enumerate(seqs):
                pieces.append(line.strip().replace(" ", ""))
                line = stream.readline()
                if not line and index + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        alignment = Alignment(self.alphabet)
        for index, (name, pieces) in enumerate(zip(ids, seqs)):
            seq = "".join(pieces)
            if len(seq) != length_of_seqs:
                raise ValueError("Sequence %i length %i, expected length %i" \
                                  % (index + 1, len(seq), length_of_seqs))
            alignment.add_sequence(name, seq)

            # Force id, name and description of the new row to the name.
            record = alignment.get_all_seqs()[-1]
            assert name == record.id or name == record.description
            record.id = record.name = record.description = name
        return alignment
Ejemplo n.º 43
0
    def next(self):
        """Parse one alignment from ``self.handle`` and return it.

        Expected layout: a line with two integers (number of sequences,
        sequence length); one line per sequence made of a ten character
        name followed by sequence data; then optional blocks, separated by
        blank lines, with more sequence data in the same order.  When
        another header line follows, it is saved in ``self._header`` so
        that the next call starts from it.

        Returns None once the input is exhausted.  Raises ValueError when
        the first line is malformed, the number of records is not the one
        expected, the file ends inside a block, or a sequence does not have
        the announced length.
        """
        handle = self.handle

        # Use the header saved by the previous call, if there is one.
        try:
            first_line = self._header
            del self._header
        except AttributeError:
            first_line = handle.readline()

        if not first_line:
            return
        first_line = first_line.strip()
        # split() without argument never yields empty strings.
        counts = first_line.split()
        if len(counts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(counts[0])
            length_of_seqs = int(counts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(first_line)

        if not (self.records_per_alignment is None
                or self.records_per_alignment == number_of_seqs):
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (number_of_seqs, self.records_per_alignment))

        # Expects STRICT truncation/padding of the names to 10 characters;
        # no white space is required between name and sequence.
        names = []
        chunks = []
        for _ in range(number_of_seqs):
            text = handle.readline().rstrip()
            name_part, data_part = text[:10], text[10:]
            names.append(name_part.strip())
            chunks.append([data_part.strip().replace(" ", "")])

        # Look for further blocks
        line = ""
        finished = False
        while not finished:
            # Skip any blank lines between blocks...
            while line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            for row in range(number_of_seqs):
                chunks[row].append(line.strip().replace(" ", ""))
                line = handle.readline()
                if row + 1 < number_of_seqs and not line:
                    raise ValueError("End of file mid-block")
            finished = not line  # end of file

        alignment = Alignment(self.alphabet)
        for row in range(number_of_seqs):
            identifier = names[row]
            sequence = "".join(chunks[row])
            if len(sequence) != length_of_seqs:
                raise ValueError("Sequence %i length %i, expected length %i" \
                                  % (row + 1, len(sequence), length_of_seqs))
            alignment.add_sequence(identifier, sequence)

            # The row just added gets the identifier in all three fields.
            record = alignment.get_all_seqs()[-1]
            assert identifier == record.id or identifier == record.description
            record.id = identifier
            record.name = identifier
            record.description = identifier
        return alignment
Ejemplo n.º 44
0
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    Every contig parsed from the ACE file is put in an alignment together
    with its reads (reads whose name contains "pseudo" are left out).  Each
    read is then compared with the consensus in consecutive windows of
    ``window_len`` columns, starting right after the read's leading run of
    "-".  A window holding a "-" only adds its non-gap characters to the
    compared length; any other window is handed to ``count_diff`` and is
    counted when the second item of its result equals False.  A read whose
    compared length reaches ``min_len_seq`` and which shows at least one
    difference contributes ``differences / compared length``.  One
    tab-separated line "<contig name>, <mean index or NA>" is written per
    contig.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the text file to (over)write.

    Unlike a bare ``except`` around the iterator, errors raised while
    parsing the ACE file are no longer mistaken for the end of the input;
    they propagate to the caller.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER

    # Both handles are closed on exit, including when an error occurs.
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                contig_length = len(contig.sequence)
                for readn, read in enumerate(contig.reads):
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, contig.af[readn].padded_start,
                                   contig_length)
                    if "pseudo" not in read.rd.name:
                        align.add_sequence(read.rd.name, seq)

                # Entries are indexed as [0] = name, [1] = sequence string;
                # the first entry is the consensus.
                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating %s" % contig_name)
                consensus = sequences[0][1]
                len_contig = len(consensus)
                number_indexes = 0
                total_indexes = 0
                for seq in sequences[1:]:
                    read_seq = seq[1]
                    leading_gaps = re.match("-+", read_seq)
                    if leading_gaps:
                        start = len(leading_gaps.group(0))
                    else:
                        start = 0
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = consensus[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            # Kept as an equality test so that only a value
                            # equal to False (not None) counts the window.
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Count "*" in the sequence string itself; calling
                    # count() on the (name, sequence) record compared whole
                    # elements with "*" and therefore subtracted nothing.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq
                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    # No read qualified for this contig.
                    mean_index = "NA"
                output_file.write(contig_name + "\t" + str(mean_index) + "\n")
            print("***All contigs treated***")
Ejemplo n.º 45
0
    print consensus
    print
    print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus)
    print
    #Have a generic alphabet, without a declared gap char, so we must
    #provide the frequencies and chars to ignore explicitly.
    print summary.information_content(e_freq_table=expected,
                                      chars_to_ignore=['-'])
    print
    print "Trying a protein sequence with gaps and stops"

    alpha = Alphabet.HasStopCodon(
        Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
    a = Alignment(alpha)
    a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
    a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
    a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
    print a
    print "=" * a.get_alignment_length()

    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    print c
    c = s.gap_consensus(ambiguous="X")
    print c
    print
    print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)

    print s.information_content(chars_to_ignore=['-', '*'])
Ejemplo n.º 46
0
    def next(self):
        """Parse and return the next CLUSTAL alignment from self.handle.

        A header line stored in self._header by a previous call (done when
        a concatenated alignment is detected) is used first; otherwise the
        header is read from the handle.

        Returns an Alignment built with self.alphabet, or None when the
        handle is exhausted, when nothing but blank lines follow the header
        (this case used to loop forever), or when no sequence data is found.

        Raises ValueError when the CLUSTAL header is missing, a line cannot
        be parsed, identifiers are out of order, an optional letter count
        disagrees with the sequence, rows differ in length, or the number
        of rows differs from self.records_per_alignment (when that is not
        None).
        """
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None
        if line[:7] != 'CLUSTAL':
            raise ValueError("Did not find CLUSTAL header")

        #There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            if not line:
                #End of file straight after the header: readline() would
                #return "" forever, so stop instead of spinning.
                return None
            line = handle.readline()

        #Rows are kept in two parallel lists (not a dictionary), so entries
        #sharing an identifier stay separate rows.
        ids = []
        seqs = []

        #Use the first block to get the sequence identifiers
        while line.strip() != "":
            if line[0] != " ":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            #Lines starting with a space are the consensus line: ignored.
            #At end of file readline() returns "", which ends this loop.
            line = handle.readline()

        assert line.strip() == ""

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while line.strip() == "" or line[0] == " ":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            for i, expected_id in enumerate(ids):
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    if line[:7] == 'CLUSTAL':
                        #Found concatenated alignment; keep its header for
                        #the next call.
                        done = True
                        self._header = line
                        break
                    raise ValueError("Could not parse line:\n%s" % line)

                if fields[0] != expected_id:
                    raise ValueError("Identifiers out of order? Got '%s' but expected '%s'"
                                     % (fields[0], expected_id))

                #Append the sequence
                seqs[i] += fields[1]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count;
                    #here it is compared with the letters gathered so far.
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if not seqs or not seqs[0]:
            return None

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        alignment_length = len(seqs[0])
        for seq_id, seq in zip(ids, seqs):
            if len(seq) != alignment_length:
                raise ValueError(
                    "Error parsing alignment - sequences of different length?")
            alignment.add_sequence(seq_id, seq)
        return alignment
Ejemplo n.º 47
0
    def next(self) :
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows.  The first
        row is named "query" and the second "match".  Each record gets an
        "original_length" annotation (from its "sq_len" tag) plus a copy of
        all of its own tags; the query header tags and the alignment hit
        tags are stored in the private alignment._annotations dictionary.

        Returns None when the handle is exhausted, when a line starting
        with "#-" is reached, or when no "#=" query header can be found.

        Raises ValueError when the expected ">>" / ">" lines are missing,
        when the file ends inside the query sequence, or when "sw_overlap"
        disagrees with the length of the extracted query region.
        """
        handle = self.handle
        try :
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        if line.startswith('#-') :
            #Reached the end of the alignments, no need to read the footer...
            return None
        if line.startswith("##") :
            #Skip the file header before the alignments.
            line = self._skip_file_header(line)
        #NOTE(review): a header saved at the end of the previous call starts
        #with ">" (or contains ">>>") and would fail this assertion - confirm
        #whether input with several hits per query is expected here.
        assert line.startswith('#'), line

        while not line.startswith('#=') :
            line = self.handle.readline()
            if not line :
                #End of file without another '#=' line; readline() would
                #return "" forever, so stop here.
                return None

        #Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        #Read in the query header
        line = self._parse_query_header(line)
        if not line :
            #End of file
            return None

        assert line.startswith(">>") and not line.startswith(">>>"), line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        alignment_annotation = {}

        #This should be followed by the target match numbering line, then
        #more tags, e.g.
        #
        #    >>#2
        #    ; sw_score: 41.0
        #    ; sw_ident: 0.846
        #    ; sw_overlap: 13
        #
        #Must start with ">>" but not ">>>" (this guard still applies when
        #assertions are disabled).
        if not line.startswith(">>") or line.startswith(">>>") :
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #Handle the following "alignment hit" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line.startswith("; ")

        #Then we have the alignment numbers and sequence for the query, e.g.
        #
        #    >gi|10955265| ..
        #    ; sq_len: 346
        #    ; sq_offset: 1
        #    ; sq_type: p
        #    ; al_start: 197
        #    ; al_stop: 238
        #    ; al_display_start: 167
        #    DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        #    QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        #    GEYFTENKPKYIIREIHQET
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split()[0])

        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line.startswith("; ")

        #Now should have the aligned query sequence (with leading flanking region)
        while not line.startswith(">") :
            if not line :
                #The match block has not been seen yet, so this is a
                #truncated file (and would otherwise loop forever).
                raise ValueError("End of file while reading the query sequence")
            query_seq_parts.append(line.strip())
            line = handle.readline()

        #Handle the following "match alignment" data, e.g.
        #
        #    >gi|152973545|ref|YP_001338596.1| ..
        #    ; sq_len: 242
        #    ; sq_type: p
        #    ; al_start: 52
        #    ; al_stop: 94
        #    ; al_display_start: 22
        #    IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        #    RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        #    QDFAFTRKMRREARQVEQSW
        #
        #Match identifier
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        #The description is the identifier from this line followed by the
        #text of the earlier ">>" line.
        match_descr = line[1:].split()[0] + match_descr

        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line.startswith("; ")

        #Now should have the aligned match sequence with flanking region.
        #It ends at a ">" line, a line containing ">>>", a "#" line, or the
        #end of the file (the latter used to loop forever).
        while line and not (line.startswith(">") or ">>>" in line
                            or line.startswith('#')):
            match_seq_parts.append(line.strip())
            line = handle.readline()

        if line.startswith('>') or '>>>' in line:
            self._header = line

        #We built a list of strings and then joined them because
        #its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the
        #coordinates.  Offsets other than "1" are currently accepted silently.
        #TODO - Work out how exactly the use of -X offsets changes things.

        #The two extracted regions are deliberately not required to have the
        #same length: that check is not useful when using stretcher.
        if "sw_overlap" in alignment_annotation :
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alignment = Alignment(self.alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property.
        #Want to record both the query header tags, and the alignment tags
        #(alignment tags win when a key occurs in both).
        alignment._annotations = {}
        alignment._annotations.update(self._query_header_annotation)
        alignment._annotations.update(alignment_annotation)

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(self._query_descr, query_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == self._query_descr or record.description == self._query_descr
        assert record.seq.tostring() == query_align_seq
        record.id = self._query_descr.split()[0].strip(",")
        record.name = "query"
        record.annotations["original_length"] = int(query_annotation["sq_len"])
        #Local addition: keep every query tag on the record.
        record.annotations.update(query_annotation)

        alignment.add_sequence(match_descr, match_align_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == match_descr or record.description == match_descr
        assert record.seq.tostring() == match_align_seq
        record.id = match_descr.split()[0].strip(",")
        record.name = "match"
        record.annotations["original_length"] = int(match_annotation["sq_len"])
        #Local addition: keep every match tag on the record.  The keys come
        #from the match tags themselves; looking them up with the query's
        #keys failed whenever the match lacked one (e.g. "sq_offset").
        record.annotations.update(match_annotation)

        return alignment
Ejemplo n.º 48
0
    def next(self) :
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an Alignment object containing two rows.  The first
        row is named "query" and the second "match".  Each record gets an
        "original_length" annotation (from its "sq_len" tag) and the private
        attributes _al_start / _al_stop taken from that row's own
        "al_start" / "al_stop" tags.  The query header tags and the
        alignment hit tags are stored in the private alignment._annotations
        dictionary.

        Returns None when the handle is exhausted or the ">>><<<" line is
        reached.

        Raises ValueError when the expected ">>" / ">" lines are missing,
        when the two extracted regions differ in length, or when
        "sw_overlap" disagrees with the length of the query region.
        """
        handle = self.handle

        try :
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        if line.startswith("#") :
            #Skip the file header before the alignments.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>") :
            #Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            #Read in the query header
            line = self._parse_query_header(line)
            #Now should be some alignments, but if not we move onto the next query
        if not line :
            #End of file
            return None
        if ">>><<<" in line :
            #Reached the end of the alignments, no need to read the footer...
            return None

        #Should start >>... and not >>>...
        assert line[0:2] == ">>" and not line[2] == ">", line

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        alignment_annotation = {}

        #This should be followed by the target match ID line, then more
        #tags, e.g.
        #
        #    >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS ...
        #    ; fa_frame: f
        #    ; fa_initn:  52
        #    ; fa_init1:  52
        #    ; fa_opt:  70
        #    ; fa_z-score: 105.5
        #    ; fa_bits: 27.5
        #    ; fa_expect:  0.082
        #    ; sw_score: 70
        #    ; sw_ident: 0.279
        #    ; sw_sim: 0.651
        #    ; sw_overlap: 43
        #
        #Same condition as the assertion above, kept as a real error so it
        #still applies when assertions are disabled.
        if line[0:2] != ">>" or line[0:3] == ">>>" :
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        #Handle the following "alignment hit" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "

        #Then we have the alignment numbers and sequence for the query, e.g.
        #
        #    >gi|10955265| ..
        #    ; sq_len: 346
        #    ; sq_offset: 1
        #    ; sq_type: p
        #    ; al_start: 197
        #    ; al_stop: 238
        #    ; al_display_start: 167
        #    DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        #    QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        #    GEYFTENKPKYIIREIHQET
        #
        #Note: indexing with line[0] raises IndexError on an empty line,
        #i.e. if the file ends here.
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split(None,1)[0])

        #Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned query sequence (with leading flanking region)
        while not line[0] == ">" :
            query_seq_parts.append(line.strip())
            line = handle.readline()

        #Handle the following "match alignment" data, e.g.
        #
        #    >gi|152973545|ref|YP_001338596.1| ..
        #    ; sq_len: 242
        #    ; sq_type: p
        #    ; al_start: 52
        #    ; al_stop: 94
        #    ; al_display_start: 22
        #    IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        #    RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        #    QDFAFTRKMRREARQVEQSW
        #
        #Match identifier
        if not (line[0] == ">" and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        assert match_descr.startswith(line[1:].split(None,1)[0])

        #Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        #Now should have the aligned match sequence with flanking region...
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        #...and since FASTA 35.4.1 there can be a consensus after it, e.g.
        #
        #    ; al_cons:
        #    .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
        #
        if line[0:2] == "; " :
            assert line.strip() == "; al_cons:"
            #The consensus is not used, so its lines are only skipped.
            #If we do anything with this in future, must remove any
            #flanking region.
            line = handle.readline()
            while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
                line = handle.readline()
            assert not line[0:2] == "; "
        assert (line[0] == ">" or ">>>" in line)
        #Keep the line that ended this alignment for the next call.
        self._header = line

        #We built a list of strings and then joined them because
        #its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        #Note, query_seq and match_seq will usually be of different lengths, apparently
        #because in the m10 format leading gaps are added but not trailing gaps!

        #Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
        #How can we do this for the (optional) consensus?

        #The "sq_offset" values can be specified with the -X command line option.
        #They appear to just shift the origin used in the calculation of the coordinates.

        if len(query_align_seq) != len(match_align_seq) :
            raise ValueError("Problem parsing the alignment sequence coordinates, "
                             "following should be the same length but are not:\n"
                             "%s - len %i\n%s - len %i" % (query_align_seq,
                                                           len(query_align_seq),
                                                           match_align_seq,
                                                           len(match_align_seq)))
        if "sw_overlap" in alignment_annotation :
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
                raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                                 % (alignment_annotation["sw_overlap"],
                                    len(query_align_seq)))

        #TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = Alignment(alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property.
        #Want to record both the query header tags, and the alignment tags
        #(alignment tags win when a key occurs in both).
        alignment._annotations = {}
        alignment._annotations.update(self._query_header_annotation)
        alignment._annotations.update(alignment_annotation)

        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        #Both rows are built the same way, each from its OWN description,
        #aligned region and tags (the match row used to take its al_start
        #and al_stop from the query's tags).
        rows = (("query", self._query_descr, query_align_seq, query_annotation),
                ("match", match_descr, match_align_seq, match_annotation))
        for row_name, descr, aligned_seq, annotation in rows :
            alignment.add_sequence(descr, aligned_seq)
            record = alignment.get_all_seqs()[-1]
            assert record.id == descr or record.description == descr
            record.id = descr.split(None,1)[0].strip(",")
            record.name = row_name
            record.annotations["original_length"] = int(annotation["sq_len"])
            #TODO - handle start/end coordinates properly. Short term hack for now:
            record._al_start = int(annotation["al_start"])
            record._al_stop = int(annotation["al_stop"])

            #TODO - What if a specific alphabet has been requested?
            #TODO - Use an IUPAC alphabet?
            #TODO - Can FASTA output RNA?
            #This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in annotation :
                if annotation["sq_type"] == "D" :
                    record.seq.alphabet = generic_dna
                elif annotation["sq_type"] == "p" :
                    record.seq.alphabet = generic_protein
            if "-" in aligned_seq :
                if not hasattr(record.seq.alphabet,"gap_char") :
                    record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Ejemplo n.º 49
0
    def next(self):
        """Parse the next Stockholm alignment from self.handle.

        A header saved in self._header by the previous call is used first,
        otherwise one line is read from the handle.  The identifiers,
        sequences, #=GS and #=GR data that were read are stored on
        self.ids, self.sequences, self.seq_annotation and
        self.seq_col_annotation.

        Returns an Alignment built with self.alphabet, or None when the
        handle is exhausted or no sequence line was found.

        Raises ValueError when the "# STOCKHOLM 1.0" header is missing, a
        sequence line cannot be split into identifier and sequence, the
        sequences differ in length, or the number of records differs from
        self.records_per_alignment (when that is not None).
        """
        try:
            header = self._header
        except AttributeError:
            header = self.handle.readline()
        else:
            del self._header
        if not header:
            # Empty file - just give up.
            return
        if header.strip() != "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67".
        # That count is not checked against what is parsed here.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        seen_terminator = False
        while True:
            raw = self.handle.readline()
            if not raw:
                break  # end of file
            text = raw.strip()
            if text == "# STOCKHOLM 1.0":
                # Start of a concatenated alignment: keep for the next call.
                self._header = text
                break
            if text == "//":
                # End of the alignment; more meta-data may still follow.
                seen_terminator = True
                continue
            if text == "":
                continue
            if text[0] != "#":
                # Sequence line, format: "<seqname> <sequence>"
                assert not seen_terminator
                pieces = [chunk.strip() for chunk in text.split(" ", 1)]
                if len(pieces) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " + "and sequence:\n" + text)
                seq_id, residues = pieces
                if seq_id not in ids:
                    ids.append(seq_id)
                # Interleaved files repeat the identifier: append the chunk.
                seqs[seq_id] = seqs.get(seq_id, "") + residues.replace(".", "-")
                continue

            # Comment or meta-data line; anything without one of the
            # five-character tags below is ignored.
            tag = text[:5]
            payload = text[5:].strip()
            if tag == "#=GF ":
                # Generic per-File annotation: "#=GF <feature> <free text>".
                # A feature may occur several times, so keep a list.
                feature, value = payload.split(None, 1)
                gf.setdefault(feature, []).append(value)
            elif tag == "#=GC ":
                # Generic per-Column annotation: not stored.
                pass
            elif tag == "#=GS ":
                # Generic per-Sequence annotation:
                # "#=GS <seqname> <feature> <free text>"
                seq_id, feature, value = payload.split(None, 2)
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(value)
            elif tag == "#=GR ":
                # Generic per-Sequence AND per-Column markup:
                # "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, value = payload.split(None, 2)
                markup = gr.setdefault(seq_id, {})
                # Append to any previous entry (interleaved files split it).
                markup[feature] = markup.get(feature, "") + value.strip()
                # TODO - Should we check the length matches the alignment length?

        assert len(seqs) <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if not (ids and seqs):
            return None

        expected = self.records_per_alignment
        if expected is not None and expected != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" % (len(ids), self.records_per_alignment)
            )

        alignment = Alignment(self.alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        alignment_length = len(list(seqs.values())[0])
        for seq_id in ids:
            residues = seqs[seq_id]
            if len(residues) != alignment_length:
                raise ValueError("Sequences have different lengths, or repeated identifier")
            name, start, end = self._identifier_split(seq_id)
            alignment.add_sequence(seq_id, residues, start=start, end=end)

            record = alignment.get_all_seqs()[-1]
            assert record.id == seq_id or record.description == seq_id

            record.id = seq_id
            record.name = name
            record.description = seq_id

            # will be overridden by _populate_meta_data if an explicit
            # accession is provided:
            record.annotations["accession"] = name

            self._populate_meta_data(seq_id, record)
        return alignment
Ejemplo n.º 50
0
    def next(self):
        """Parse and return the next alignment from the EMBOSS handle.

        Skips forward to the next ``#====`` header line, reads the header
        block for its ``Aligned_sequences`` and ``Length`` fields, then
        accumulates the interleaved sequence lines until the next ``#----``
        or ``#====`` marker.  That marker line is stored in ``self._header``
        so the following call can resume from it.

        Returns:
            An ``Alignment`` created with ``self.alphabet``, or ``None`` if
            the handle runs out before another header line is found.

        Raises:
            ValueError: if the header does not give the number or length of
                the sequences, if the number of sequences differs from
                ``self.records_per_alignment``, if a line in the alignment
                body is not recognised, or if a parsed sequence does not
                have the declared length.
            AssertionError: if identifiers or start/end coordinates in the
                body disagree with what has been parsed so far.
        """
        handle = self.handle

        try:
            # Header line saved while parsing the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                return None

        length_of_seqs = None
        number_of_seqs = None
        ids = []

        # Read the rest of this alignment header to discover the number of
        # records expected and their length.  startswith() is used instead
        # of line[0] so that a file truncated inside the header (readline()
        # returning "") ends the loop rather than raising IndexError.
        while line.startswith("#"):
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # The record identifiers follow, numbered from one.
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
                and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        seqs = [""] * len(ids)
        index = 0

        # Parse the sequence lines.
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # identifier, start position, sequence, end position
                    # (an aligned sequence is broken up over multiple lines)
                    seq_id, start = id_start
                    seq, end = seq_end

                    assert 0 <= index < number_of_seqs, \
                        "Expected index %i in range [0,%i)" \
                        % (index, number_of_seqs)
                    # The identifier may be truncated in the body.
                    assert ids[index].startswith(seq_id)

                    # Check the start against the letters seen so far.
                    start = int(start)
                    letters_so_far = len(seqs[index].replace("-", ""))
                    if start == 0:
                        # One sequence starts long before the other.
                        assert letters_so_far == 0
                        assert len(seq.replace("-", "")) == 0, line
                    elif start == letters_so_far:
                        # One sequence ends long before the other.
                        assert len(seq.replace("-", "")) == 0, line
                    else:
                        assert start - 1 == letters_so_far, \
                            "Found %i chars so far for sequence %i (%s), file says start %i:\n%s" \
                            % (letters_so_far, index, seq_id, start, seqs[index])

                    seqs[index] += seq

                    # Check the end; report the ungapped count, which is the
                    # value actually being compared.
                    end = int(end)
                    letters_so_far = len(seqs[index].replace("-", ""))
                    assert end == letters_so_far, \
                        "Found %i chars so far for %s, file says end %i:\n%s" \
                        % (letters_so_far, seq_id, end, repr(seqs[index]))

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                # Otherwise this looks like alignment annotation; skip it.
            elif line.strip():
                # Neither a sequence line nor a blank spacer.  Raise rather
                # than print-and-assert so the problem is reported even
                # when assertions are disabled.
                raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
                    or line.rstrip() == "#=======================================":
                # End of alignment; keep the marker for the next call.
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
                and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        for seq_id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                # EMBOSS 2.9.0 is known to use spaces instead of minus signs
                # for leading gaps, and thus fails to parse.  This old version
                # is still used as of Dec 2008 behind the EBI SOAP webservice:
                # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            alignment.add_sequence(seq_id, seq)
        return alignment
Ejemplo n.º 51
0
    def next(self):
        """Return the next alignment parsed from the EMBOSS handle.

        The method advances to the next ``#====`` header line, reads the
        ``Aligned_sequences`` and ``Length`` header fields, and then gathers
        the interleaved sequence lines up to the following ``#----`` or
        ``#====`` marker.  The marker is kept in ``self._header`` so that
        the next call starts from it.

        Returns:
            An ``Alignment`` created with ``self.alphabet``, or ``None``
            when no further header line can be read from the handle.

        Raises:
            ValueError: when the header omits the sequence count or length,
                when the count does not equal ``self.records_per_alignment``
                (if that is set), when the alignment body contains an
                unrecognised line, or when a parsed sequence does not have
                the declared length.
            AssertionError: when identifiers or start/end coordinates in
                the body are inconsistent with the text parsed so far.
        """
        handle = self.handle

        try:
            # Marker line remembered from the previous alignment, if any.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                return None

        length_of_seqs = None
        number_of_seqs = None
        ids = []

        # Header block: every line starts with "#".  An empty string from
        # readline() (file truncated mid-header) must terminate the loop;
        # indexing line[0] would raise IndexError there.
        while line.startswith("#"):
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Next come the numbered record identifiers.
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        expected_records = self.records_per_alignment
        if expected_records is not None and expected_records != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, expected_records))

        seqs = [""] * len(ids)
        index = 0

        # Alignment body.
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # Fields: identifier, start, sequence chunk, end.
                    name, start = id_start
                    chunk, end = seq_end

                    assert 0 <= index < number_of_seqs, \
                        "Expected index %i in range [0,%i)" \
                        % (index, number_of_seqs)
                    # Body identifiers may be truncated forms of the header ones.
                    assert ids[index].startswith(name)

                    start = int(start)
                    seen = len(seqs[index].replace("-", ""))
                    chunk_is_gap_only = len(chunk.replace("-", "")) == 0
                    if start == 0:
                        # This sequence has not begun yet (the other one
                        # starts long before it).
                        assert seen == 0
                        assert chunk_is_gap_only, line
                    elif start == seen:
                        # This sequence already finished (it ends long
                        # before the other one).
                        assert chunk_is_gap_only, line
                    else:
                        assert start - 1 == seen, \
                            "Found %i chars so far for sequence %i (%s), file says start %i:\n%s" \
                            % (seen, index, name, start, seqs[index])

                    seqs[index] += chunk

                    # The end coordinate counts letters only, so the
                    # message reports the ungapped length as well.
                    end = int(end)
                    seen = len(seqs[index].replace("-", ""))
                    assert end == seen, \
                        "Found %i chars so far for %s, file says end %i:\n%s" \
                        % (seen, name, end, repr(seqs[index]))

                    index = (index + 1) % number_of_seqs
                # Anything else on a long line is treated as alignment
                # annotation and skipped.
            elif line.strip():
                # Not a sequence line and not a blank spacer: fail loudly.
                # A bare "assert False" would vanish under -O and the
                # accompanying print polluted stdout.
                raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)

            line = handle.readline()
            if line.rstrip() in ("#---------------------------------------",
                                 "#======================================="):
                # End of this alignment; remember the marker for next time.
                self._header = line
                break

        assert index == 0

        if expected_records is not None and expected_records != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), expected_records))

        alignment = Alignment(self.alphabet)
        for name, sequence in zip(ids, seqs):
            if len(sequence) != length_of_seqs:
                # EMBOSS 2.9.0 is known to use spaces instead of minus signs
                # for leading gaps, and thus fails to parse.  This old version
                # is still used as of Dec 2008 behind the EBI SOAP webservice:
                # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            alignment.add_sequence(name, sequence)
        return alignment
Ejemplo n.º 52
0
    def next(self):
        """Parse the next alignment from the EMBOSS handle and return it.

        Scans forward to the next ``#====`` header line, reads the header
        fields ``Aligned_sequences`` and ``Length``, then collects the
        interleaved sequence lines until a ``#----`` or ``#====`` marker is
        met.  The marker line is saved in ``self._header`` for the next
        call.

        Returns:
            An ``Alignment`` created with ``self.alphabet``, or ``None`` if
            the handle is exhausted before another header line is found.

        Raises:
            SyntaxError: if the header gives no sequence count or length, or
                if a parsed sequence does not have the declared length.
                (Kept as-is for backward compatibility with existing
                callers.)
            ValueError: if the sequence count differs from
                ``self.records_per_alignment``, or if the alignment body
                contains a line that is not recognised.
            AssertionError: if identifiers or start/end coordinates in the
                body disagree with what has been parsed so far.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing the previous
            # alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                return None

        length_of_seqs = None
        number_of_seqs = None
        ids = []

        # Read the rest of this alignment header.  startswith() copes with
        # the empty string readline() returns at end of file, where line[0]
        # would raise IndexError.
        while line.startswith("#"):
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            # And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise SyntaxError("Number of sequences missing!")
        if length_of_seqs is None:
            raise SyntaxError("Length of sequences missing!")

        if self.records_per_alignment is not None \
                and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        seqs = [""] * len(ids)
        index = 0

        # Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # identifier, seq start position, seq, seq end position
                    # (an aligned seq is broken up into multiple lines)
                    seq_id, start = id_start
                    seq, end = seq_end

                    assert 0 <= index < number_of_seqs, \
                        "Expected index %i in range [0,%i)" \
                        % (index, number_of_seqs)
                    # The identifier is truncated...
                    assert ids[index].startswith(seq_id)

                    # Check the start.  Rows consisting only of gaps carry
                    # a start of 0 (sequence not begun yet) or a start equal
                    # to the letters already read (sequence finished), so
                    # those are accepted instead of tripping the assertion.
                    start = int(start)
                    ungapped = len(seqs[index].replace("-", ""))
                    if start == 0:
                        assert ungapped == 0
                        assert len(seq.replace("-", "")) == 0, line
                    elif start == ungapped:
                        assert len(seq.replace("-", "")) == 0, line
                    else:
                        assert start - 1 == ungapped, \
                            "Found %i chars so far for %s, file says start %i:\n%s" \
                            % (ungapped, seq_id, start, seqs[index])

                    seqs[index] += seq

                    # Check the end; the count reported is the ungapped
                    # one, matching the comparison.
                    end = int(end)
                    ungapped = len(seqs[index].replace("-", ""))
                    assert end == ungapped, \
                        "Found %i chars so far for %s, file says end %i:\n%s" \
                        % (ungapped, seq_id, end, seqs[index])

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                # else: just a start value, this is alignment annotation (?)
            elif line.strip():
                # Not a spacer and not a sequence line.  Raising keeps the
                # check active under -O and avoids printing to stdout.
                raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
                    or line.rstrip() == "#=======================================":
                # End of alignment
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
                and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)
        for seq_id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                raise SyntaxError(
                    "Error parsing alignment - sequences of different length?")
            alignment.add_sequence(seq_id, seq)
        return alignment