def create_alignment(
    records,
    aligned_sequences,
    strands,
    annotations,
    column_annotations,
    score,
):
    """Assemble an Alignment from the records and their gapped sequences.

    The coordinates are obtained from ``Alignment.infer_coordinates`` applied
    to ``aligned_sequences``. Each coordinate row is then adjusted in place
    for the matching record: rows whose strand is ``"-"`` are mirrored
    (``row[-1] - row[0] - row``), and every row is shifted by the start of
    the first entry in ``record.seq.defined_ranges``.

    ``annotations``, ``column_annotations`` and ``score`` are stored as
    attributes on the returned alignment only when they are not None.
    """
    coordinates = Alignment.infer_coordinates(aligned_sequences)
    for record, strand, row in zip(records, strands, coordinates):
        on_reverse_strand = strand == "-"
        if on_reverse_strand:
            # Mirror the row; slice assignment keeps the change in place.
            row[:] = row[-1] - row[0] - row
        first_defined_range = record.seq.defined_ranges[0]
        row += first_defined_range[0]
    alignment = Alignment(records, coordinates)
    optional_attributes = (
        ("annotations", annotations),
        ("column_annotations", column_annotations),
        ("score", score),
    )
    for attribute_name, attribute_value in optional_attributes:
        if attribute_value is not None:
            setattr(alignment, attribute_name, attribute_value)
    return alignment
def parse(self, stream):
    """Parse the alignments in the stream, yielding them one at a time.

    This is a generator. Each alignment consists of a metadata section
    delimited by two ``#=======================================`` lines,
    followed by blocks of aligned sequence lines. An Alignment is yielded
    once the accumulated number of columns equals the ``Length`` metadata
    value.

    Metadata values that are present (``Matrix``, ``Gap_penalty``,
    ``Extend_penalty``, ``Identity``, ``Similarity``, ``Gaps``, ``Score``)
    are stored as attributes on the alignment. A non-empty match line is
    stored in ``column_annotations["emboss_consensus"]``.

    Arguments:
     - stream - iterable of text lines, or None (in which case nothing is
       yielded).

    Raises ValueError if an unexpected line is found, or if the number of
    sequences or the alignment length is missing from the metadata.
    """
    if stream is None:
        # Use a plain return: raising StopIteration inside a generator is
        # converted to RuntimeError (PEP 479).
        return
    identifiers = None
    number_of_sequences = None
    for line in stream:
        line = line.rstrip("\r\n")
        if identifiers is None:
            # searching for alignment metadata start
            if not line:
                continue
            elif line.startswith("#---------------------------------------"):
                # may appear between alignments
                continue
            elif line.startswith("#======================================="):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
                matrix = None
                gap_penalty = None
                extend_penalty = None
                identity = None
                similarity = None
                gaps = None
                score = None
            else:
                raise ValueError("Unexpected line: %s" % line)
        elif sequences is None:
            # parsing the alignment metadata
            if line == "#=======================================":
                # reached the end of alignment metadata
                if not identifiers:
                    raise ValueError("Number of sequences missing!")
                if ncols is None:
                    raise ValueError("Length of alignment missing!")
                sequences = [""] * number_of_sequences
                aligned_sequences = [""] * number_of_sequences
                consensus = ""
                starts = [0] * number_of_sequences
                ends = [0] * number_of_sequences
                column = 0
                index = 0
                continue
            if line.strip() == "#":
                continue
            if not line.startswith("# "):
                raise ValueError("Unexpected line: %s" % line)
            key, value = line[2:].split(":", 1)
            if key == "Aligned_sequences":
                number_of_sequences = int(value.strip())
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    # Split on the first colon only, so that identifiers
                    # containing a colon are kept intact.
                    number, identifier = line[2:].split(":", 1)
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                matrix = value.strip()
            elif key == "Gap_penalty":
                gap_penalty = float(value.strip())
            elif key == "Extend_penalty":
                extend_penalty = float(value.strip())
            elif key == "Length":
                ncols = int(value.strip())
            elif key == "Identity":
                # Only the count before the "/" is kept.
                identity = int(value.strip().split("/")[0])
            elif key == "Similarity":
                similarity = int(value.strip().split("/")[0])
            elif key == "Gaps":
                gaps = int(value.strip().split("/")[0])
            elif key == "Score":
                score = float(value.strip())
        else:
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    index = 0
                    if column == ncols:
                        # reached the end of the sequences
                        coordinates = Alignment.infer_coordinates(aligned_sequences)
                        for i, start in enumerate(starts):
                            start -= 1  # Python counting
                            coordinates[i, :] += start
                        sequences = [Seq(sequence) for sequence in sequences]
                        records = [
                            SeqRecord(sequence, id=identifier)
                            for sequence, identifier in zip(sequences, identifiers)
                        ]
                        alignment = Alignment(records, coordinates)
                        if matrix is not None:
                            alignment.matrix = matrix
                        if gap_penalty is not None:
                            alignment.gap_penalty = gap_penalty
                        if extend_penalty is not None:
                            alignment.extend_penalty = extend_penalty
                        if identity is not None:
                            alignment.identity = identity
                        if similarity is not None:
                            alignment.similarity = similarity
                        if gaps is not None:
                            alignment.gaps = gaps
                        if score is not None:
                            alignment.score = score
                        if consensus:
                            alignment.column_annotations = {
                                "emboss_consensus": consensus
                            }
                        yield alignment
                        # Go back to searching for the next metadata section.
                        identifiers = None
                continue
            # The first 21 characters hold the identifier and start position;
            # they are blank on the match line.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start)
                end = int(end)
                sequence = aligned_sequence.replace("-", "")
                # NOTE(review): the asserts below are consistency checks on the
                # input; they are skipped when Python runs with -O.
                if len(sequences[index]) > 0:
                    # Not the first block for this sequence: the positions
                    # must continue from the end of the previous block.
                    length = len(sequence)
                    if length == 0:
                        assert start == ends[index]
                        assert end == ends[index]
                    else:
                        assert start == ends[index] + 1
                        assert end == ends[index] + length
                assert identifiers[index].startswith(identifier)
                if starts[index] == 0:
                    # Record the start and end
                    starts[index] = start
                ends[index] = end
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
def parse(self, stream):
    """Parse BED-formatted lines from the stream, yielding one alignment each.

    This is a generator. Every line is split on whitespace into between 3
    and 12 columns. Columns 1-3 (chrom, chromStart, chromEnd) are required;
    the block columns 10-12 (blockCount, blockSizes, blockStarts) must be
    given together. The optional columns score, thickStart, thickEnd and
    itemRgb are stored as attributes on the alignment when present. The
    score is stored as an int or float if it is numeric, and as the
    original string otherwise.

    Arguments:
     - stream - iterable of text lines, or None (in which case nothing is
       yielded).

    Raises ValueError if the number of columns is invalid, if the number of
    block sizes or block starts differs from blockCount, or if chromStart or
    chromEnd is inconsistent with the block coordinates.
    """
    if stream is None:
        # Use a plain return: raising StopIteration inside a generator is
        # converted to RuntimeError (PEP 479).
        return
    for line in stream:
        words = line.split()
        bedN = len(words)
        if bedN < 3 or bedN > 12:
            raise ValueError("expected between 3 and 12 columns, found %d" % bedN)
        if bedN in (10, 11):
            # The block columns are read together below; a partial set would
            # otherwise fail with an uninformative IndexError.
            raise ValueError(
                "blockCount, blockSizes, and blockStarts must all be present "
                "(found %d columns, expected 12)" % bedN
            )
        chrom = words[0]
        chromStart = int(words[1])
        chromEnd = int(words[2])
        if bedN > 3:
            name = words[3]
        else:
            name = None
        if bedN > 5:
            strand = words[5]
        else:
            strand = "+"
        if bedN > 9:
            blockCount = int(words[9])
            # A trailing comma after the last list item is tolerated.
            blockSizes = [
                int(blockSize) for blockSize in words[10].rstrip(",").split(",")
            ]
            blockStarts = [
                int(blockStart) for blockStart in words[11].rstrip(",").split(",")
            ]
            if len(blockSizes) != blockCount:
                raise ValueError(
                    "Inconsistent number of block sizes (%d found, expected %d)"
                    % (len(blockSizes), blockCount)
                )
            if len(blockStarts) != blockCount:
                raise ValueError(
                    "Inconsistent number of block start positions (%d found, expected %d)"
                    % (len(blockStarts), blockCount)
                )
            blockSizes = numpy.array(blockSizes)
            blockStarts = numpy.array(blockStarts)
            tPosition = 0
            qPosition = 0
            coordinates = [[tPosition, qPosition]]
            for blockSize, blockStart in zip(blockSizes, blockStarts):
                if blockStart != tPosition:
                    # Gap in the target between consecutive blocks: the query
                    # position stays put while the target position jumps.
                    coordinates.append([blockStart, qPosition])
                    tPosition = blockStart
                tPosition += blockSize
                qPosition += blockSize
                coordinates.append([tPosition, qPosition])
            coordinates = numpy.array(coordinates).transpose()
            qSize = sum(blockSizes)
        else:
            # No block columns: a single ungapped block spanning the feature.
            blockSize = chromEnd - chromStart
            coordinates = numpy.array([[0, blockSize], [0, blockSize]])
            qSize = blockSize
        # Block positions are relative to chromStart; shift the target row.
        coordinates[0, :] += chromStart
        query_sequence = Seq(None, length=qSize)
        query_record = SeqRecord(query_sequence, id=name)
        target_record = SeqRecord(None, id=chrom)
        records = [target_record, query_record]
        if strand == "-":
            coordinates[1, :] = qSize - coordinates[1, :]
        if chromStart != coordinates[0, 0]:
            raise ValueError(
                "Inconsistent chromStart found (%d, expected %d)"
                % (chromStart, coordinates[0, 0])
            )
        if chromEnd != coordinates[0, -1]:
            raise ValueError(
                "Inconsistent chromEnd found (%d, expected %d)"
                % (chromEnd, coordinates[0, -1])
            )
        alignment = Alignment(records, coordinates)
        if bedN <= 4:
            yield alignment
            continue
        score = words[4]
        try:
            score = float(score)
        except ValueError:
            # Non-numeric score: keep the original string.
            pass
        else:
            if score.is_integer():
                score = int(score)
        alignment.score = score
        if bedN <= 6:
            yield alignment
            continue
        alignment.thickStart = int(words[6])
        if bedN <= 7:
            yield alignment
            continue
        alignment.thickEnd = int(words[7])
        if bedN <= 8:
            yield alignment
            continue
        alignment.itemRgb = words[8]
        yield alignment