예제 #1
0
 def create_alignment(
     records,
     aligned_sequences,
     strands,
     annotations,
     column_annotations,
     score,
 ):
     """Build an Alignment from the data collected for one alignment block.

     The coordinates are inferred from the gapped ``aligned_sequences`` and
     then adjusted row by row: rows whose strand is "-" are mirrored, and
     every row is shifted by the start of the first defined range of the
     corresponding record's sequence.

     ``annotations``, ``column_annotations`` and ``score`` are stored as
     attributes of the returned alignment only when they are not None.
     """
     coordinates = Alignment.infer_coordinates(aligned_sequences)
     for record, strand, row in zip(records, strands, coordinates):
         if strand == "-":
             # Mirror the row in place so it counts from the opposite end.
             row[:] = row[-1] - row[0] - row
         # In-place shift by the offset of the first defined range.
         row += record.seq.defined_ranges[0][0]
     alignment = Alignment(records, coordinates)
     optional_attributes = (
         ("annotations", annotations),
         ("column_annotations", column_annotations),
         ("score", score),
     )
     for attribute, value in optional_attributes:
         if value is not None:
             setattr(alignment, attribute, value)
     return alignment
예제 #2
0
    def parse(self, stream):
        """Parse the alignments in the stream, yielding them one at a time.

        The parser is a three-state machine driven by the local variables
        ``identifiers`` and ``sequences``:

        1. ``identifiers is None``: skip blank lines and "#-----" separator
           lines until a "#=====" line opens the alignment metadata.
        2. ``sequences is None``: read "# key: value" metadata lines until
           the closing "#=====" line. The "Aligned_sequences" entry is
           followed by one "# number: identifier" line per sequence.
        3. otherwise: read the alignment blocks. The first 21 characters of
           each line hold the identifier and start position (or are blank
           for the match/consensus line); the remainder holds the gapped
           sequence and the end position. A blank line after the last row
           of a block closes the block, and once ``Length`` columns have
           been read the alignment is built and yielded.

        Matrix, Gap_penalty, Extend_penalty, Identity, Similarity, Gaps and
        Score are stored as attributes on the alignment when present; the
        match line is stored in ``column_annotations["emboss_consensus"]``.

        If ``stream`` is None, nothing is yielded.

        Raises ValueError for unexpected lines or missing metadata.
        Internal consistency checks on the sequence rows use ``assert``.
        """
        if stream is None:
            # ``raise StopIteration`` inside a generator is turned into a
            # RuntimeError (PEP 479); returning ends the iteration cleanly.
            return

        identifiers = None
        number_of_sequences = None
        for line in stream:
            line = line.rstrip("\r\n")
            if identifiers is None:
                # searching for alignment metadata start
                if not line:
                    continue
                elif line.startswith(
                        "#---------------------------------------"):
                    # may appear between alignments
                    continue
                elif line.startswith(
                        "#======================================="):
                    # found the alignment metadata start
                    identifiers = []
                    ncols = None
                    sequences = None
                    matrix = None
                    gap_penalty = None
                    extend_penalty = None
                    identity = None
                    similarity = None
                    gaps = None
                    score = None
                else:
                    raise ValueError("Unexpected line: %s" % line)
            elif sequences is None:
                # parsing the alignment metadata
                if line == "#=======================================":
                    # reached the end of alignment metadata
                    if not identifiers:
                        raise ValueError("Number of sequences missing!")
                    if ncols is None:
                        raise ValueError("Length of alignment missing!")
                    sequences = [""] * number_of_sequences
                    aligned_sequences = [""] * number_of_sequences
                    consensus = ""
                    starts = [0] * number_of_sequences
                    ends = [0] * number_of_sequences
                    column = 0
                    index = 0
                    continue
                if line.strip() == "#":
                    continue
                if not line.startswith("# "):
                    # Format the message before constructing the exception;
                    # applying % to the exception object raises TypeError.
                    raise ValueError("Unexpected line: %s" % line)
                key, value = line[2:].split(":", 1)
                if key == "Aligned_sequences":
                    number_of_sequences = int(value.strip())
                    assert len(identifiers) == 0
                    # Should now expect the record identifiers...
                    for i, line in enumerate(stream):
                        if not line.startswith("# "):
                            raise ValueError("Unexpected line: %s" % line)
                        # Split once only, so that identifiers containing
                        # a colon are kept intact.
                        number, identifier = line[2:].split(":", 1)
                        assert i + 1 == int(number)
                        identifiers.append(identifier.strip())
                        if len(identifiers) == number_of_sequences:
                            break
                elif key == "Matrix":
                    matrix = value.strip()
                elif key == "Gap_penalty":
                    gap_penalty = float(value.strip())
                elif key == "Extend_penalty":
                    extend_penalty = float(value.strip())
                elif key == "Length":
                    ncols = int(value.strip())
                elif key == "Identity":
                    # value looks like "count/total (...)"; keep the count
                    identity = int(value.strip().split("/")[0])
                elif key == "Similarity":
                    similarity = int(value.strip().split("/")[0])
                elif key == "Gaps":
                    gaps = int(value.strip().split("/")[0])
                elif key == "Score":
                    score = float(value.strip())
            else:
                # parse the sequences
                if not line:
                    # empty line
                    if index == number_of_sequences:
                        # reached the end of an alignment block
                        index = 0
                        if column == ncols:
                            # reached the end of the sequences
                            coordinates = Alignment.infer_coordinates(
                                aligned_sequences)
                            for i, start in enumerate(starts):
                                start -= 1  # Python counting
                                coordinates[i, :] += start
                            sequences = [
                                Seq(sequence) for sequence in sequences
                            ]
                            records = [
                                SeqRecord(sequence, id=identifier)
                                for sequence, identifier in zip(
                                    sequences, identifiers)
                            ]
                            alignment = Alignment(records, coordinates)
                            if matrix is not None:
                                alignment.matrix = matrix
                            if gap_penalty is not None:
                                alignment.gap_penalty = gap_penalty
                            if extend_penalty is not None:
                                alignment.extend_penalty = extend_penalty
                            if identity is not None:
                                alignment.identity = identity
                            if similarity is not None:
                                alignment.similarity = similarity
                            if gaps is not None:
                                alignment.gaps = gaps
                            if score is not None:
                                alignment.score = score
                            if consensus:
                                alignment.column_annotations = {
                                    "emboss_consensus": consensus
                                }
                            yield alignment
                            # back to searching for the next alignment
                            identifiers = None
                    continue
                prefix = line[:21].strip()
                if prefix == "":
                    # match line
                    consensus += line[21:71]
                else:
                    identifier, start = prefix.split(None, 1)
                    aligned_sequence, end = line[21:].split(None, 1)
                    start = int(start)
                    end = int(end)
                    sequence = aligned_sequence.replace("-", "")
                    if len(sequences[index]) > 0:
                        # Not the first block for this row: the positions
                        # must continue where the previous block ended.
                        length = len(sequence)
                        if length == 0:
                            assert start == ends[index]
                            assert end == ends[index]
                        else:
                            assert start == ends[index] + 1
                            assert end == ends[index] + length
                    assert identifiers[index].startswith(identifier)
                    if starts[index] == 0:
                        # Record the start and end
                        starts[index] = start
                    ends[index] = end
                    sequences[index] += sequence
                    aligned_sequences[index] += aligned_sequence
                    if index == 0:
                        column += len(aligned_sequence)
                    else:
                        assert column == len(aligned_sequences[index])
                    index += 1
예제 #3
0
    def parse(self, stream):
        """Parse the lines in the stream, yielding one alignment per line.

        Each line is split on whitespace into 3 to 12 columns, read in this
        order: chrom, chromStart, chromEnd, name, score, strand, thickStart,
        thickEnd, itemRgb, blockCount, blockSizes, blockStarts.

        The target record carries ``chrom`` as its id; the query record
        carries ``name`` (None if absent) and a sequence of undefined
        content whose length is the sum of the block sizes (or
        chromEnd - chromStart when no blocks are given). The strand
        defaults to "+"; for "-" the query coordinates are mirrored.

        When present, score, thickStart, thickEnd and itemRgb are stored as
        attributes of the alignment. The score is stored as an int if it is
        integer-valued, as a float if it is otherwise numeric, and as the
        original string if it cannot be converted to a number.

        If ``stream`` is None, nothing is yielded.

        Raises ValueError if the number of columns is not supported, or if
        the block data are inconsistent with blockCount or chromEnd.
        """
        if stream is None:
            # ``raise StopIteration`` inside a generator is turned into a
            # RuntimeError (PEP 479); returning ends the iteration cleanly.
            return

        for line in stream:
            words = line.split()
            bedN = len(words)
            if bedN < 3 or bedN > 12:
                raise ValueError("expected between 3 and 12 columns, found %d" % bedN)
            if bedN in (10, 11):
                # blockCount, blockSizes and blockStarts are read together;
                # without this check the lookup below fails with IndexError.
                raise ValueError(
                    "found %d columns; blockCount, blockSizes and blockStarts "
                    "must all be present (12 columns)" % bedN
                )
            chrom = words[0]
            chromStart = int(words[1])
            chromEnd = int(words[2])
            name = words[3] if bedN > 3 else None
            strand = words[5] if bedN > 5 else "+"
            if bedN > 9:
                blockCount = int(words[9])
                # The comma-separated lists may end with a trailing comma.
                blockSizes = [
                    int(blockSize) for blockSize in words[10].rstrip(",").split(",")
                ]
                blockStarts = [
                    int(blockStart) for blockStart in words[11].rstrip(",").split(",")
                ]
                if len(blockSizes) != blockCount:
                    raise ValueError(
                        "Inconsistent number of block sizes (%d found, expected %d)"
                        % (len(blockSizes), blockCount)
                    )
                if len(blockStarts) != blockCount:
                    raise ValueError(
                        "Inconsistent number of block start positions (%d found, expected %d)"
                        % (len(blockStarts), blockCount)
                    )
                blockSizes = numpy.array(blockSizes)
                blockStarts = numpy.array(blockStarts)
                tPosition = 0
                qPosition = 0
                coordinates = [[tPosition, qPosition]]
                for blockSize, blockStart in zip(blockSizes, blockStarts):
                    if blockStart != tPosition:
                        # Gap in the target between consecutive blocks: the
                        # target position jumps, the query position does not.
                        coordinates.append([blockStart, qPosition])
                        tPosition = blockStart
                    tPosition += blockSize
                    qPosition += blockSize
                    coordinates.append([tPosition, qPosition])
                coordinates = numpy.array(coordinates).transpose()
                qSize = sum(blockSizes)
            else:
                # No block information: a single ungapped block.
                blockSize = chromEnd - chromStart
                coordinates = numpy.array([[0, blockSize], [0, blockSize]])
                qSize = blockSize
            # Block starts are relative to chromStart.
            coordinates[0, :] += chromStart
            query_sequence = Seq(None, length=qSize)
            query_record = SeqRecord(query_sequence, id=name)
            target_record = SeqRecord(None, id=chrom)
            records = [target_record, query_record]
            if strand == "-":
                coordinates[1, :] = qSize - coordinates[1, :]
            if chromStart != coordinates[0, 0]:
                raise ValueError(
                    "Inconsistent chromStart found (%d, expected %d)"
                    % (chromStart, coordinates[0, 0])
                )
            if chromEnd != coordinates[0, -1]:
                raise ValueError(
                    "Inconsistent chromEnd found (%d, expected %d)"
                    % (chromEnd, coordinates[0, -1])
                )
            alignment = Alignment(records, coordinates)
            if bedN > 4:
                score = words[4]
                try:
                    score = float(score)
                except ValueError:
                    # Not numeric; keep the score as the original string.
                    pass
                else:
                    if score.is_integer():
                        score = int(score)
                alignment.score = score
            if bedN > 6:
                alignment.thickStart = int(words[6])
            if bedN > 7:
                alignment.thickEnd = int(words[7])
            if bedN > 8:
                alignment.itemRgb = words[8]
            yield alignment