Example #1
0
    def test_write_genbank_sequence(self, tmpdir, genbank_reference):
        """Write one GenBank record and check that one sequence is reported.

        The record is read from the ``genbank_reference`` fixture and passed
        to ``write_sequences`` as a single-element list with the "genbank"
        format.
        """
        destination = Path(tmpdir) / Path("new_sequences.fasta")

        record = SeqIO.read(genbank_reference, "genbank")
        written_count = write_sequences(
            [record],
            destination,
            "genbank",
        )
        assert written_count == 1
Example #2
0
    def test_write_single_set_of_sequences_to_lzma_file(
            self, tmpdir, sequences):
        """Write sequences to an ``.xz`` path and read them back via lzma.

        The count returned by ``write_sequences`` must equal both the number
        of input sequences and the number of FASTA header lines found when
        the output is reopened with ``lzma.open``.
        """
        destination = Path(tmpdir) / Path("new_sequences.fasta.xz")
        written_count = write_sequences(sequences, destination, "fasta")
        assert written_count == len(sequences)

        # Each FASTA record starts with one ">" header line, so counting
        # those lines counts the records in the file.
        with lzma.open(destination, "rt") as compressed:
            header_count = sum(
                1 for line in compressed if line.startswith(">")
            )
        assert written_count == header_count
Example #3
0
    def test_write_sequences_by_external_handle(self, tmpdir, sequences):
        """Write records one at a time through a caller-owned file handle.

        The per-call counts returned by ``write_sequences`` are summed and
        compared with the number of FASTA header lines in the resulting file.
        """
        destination = Path(tmpdir) / Path("new_sequences.fasta")

        with open_file(destination, "w") as out_handle:
            written_total = sum(
                write_sequences(record, out_handle) for record in sequences
            )

        # Reopen the file with the builtin ``open`` to count ">" headers.
        with open(destination, "r") as in_handle:
            header_total = sum(
                1 for line in in_handle if line.startswith(">")
            )
        assert written_total == header_total
Example #4
0
        # Join all prefixes to strip into one regex alternation anchored at
        # the start of the name, and keep the existing pattern as a further
        # alternative.
        prefixes = "|".join(args.strip_prefixes)
        pattern = f"^({prefixes})|{pattern}"

    with open_file(args.output, "w") as output_handle:
        # In order to prefer the latter files, we have to reverse the order of
        # the files.
        sequences = read_sequences(*reversed(sequence_files))
        # NOTE(review): presumably rename_sequences applies `pattern` to each
        # sequence name and drop_duplicate_sequences filters repeated names;
        # both helpers are defined outside this excerpt -- confirm there.
        renamed_sequences = rename_sequences(sequences, pattern)
        deduplicated_sequences = drop_duplicate_sequences(
            renamed_sequences,
            args.error_on_duplicate_strains
        )

        # Records are written one at a time to the already-open output
        # handle. A DuplicateSequenceError raised while iterating or writing
        # is reported on stderr and ends the program with exit status 1.
        try:
            for sequence in deduplicated_sequences:
                write_sequences(sequence, output_handle)
        except DuplicateSequenceError as error:
            print(
                f"ERROR: The following strains have duplicate sequences: {error}",
                file=sys.stderr
            )
            sys.exit(1)

    # NOTE(review): sys.exit(1) above raises SystemExit, so the two cleanup
    # loops below do not run on the duplicate-error path unless a handler
    # outside this excerpt intervenes.
    # Clean up any open sequence files.
    for sequence_file in sequence_files:
        # Only entries that expose a close() method are closed; other entries
        # are left alone.
        if hasattr(sequence_file, "close"):
            sequence_file.close()

    # Clean up any open tarballs.
    for tar_handle in tar_handles:
        tar_handle.close()
Example #5
0
                        nargs='+',
                        type=int,
                        help="list of sites to mask")
    parser.add_argument("--output",
                        required=True,
                        help="FASTA file of output alignment")
    args = parser.parse_args()

    # Number of positions to mask at each end of every sequence. An option
    # that is unset or zero means nothing is masked at that end.
    begin_length = args.mask_from_beginning or 0
    end_length = args.mask_from_end or 0

    with open_file(args.output, 'w') as outfile:
        for record in read_sequences(args.alignment):
            seq = str(record.seq)
            if args.mask_terminal_gaps:
                seq = mask_terminal_gaps(seq)

            start = "N" * begin_length
            # A stop index of -0 is just 0, so `seq[begin_length:-end_length]`
            # would yield an empty string when nothing is masked from the
            # end and silently drop the unmasked body. Slice to the end of
            # the sequence in that case instead.
            if end_length:
                middle = seq[begin_length:-end_length]
            else:
                middle = seq[begin_length:]
            end = "N" * end_length
            seq_list = list(start + middle + end)
            if args.mask_sites:
                for site in args.mask_sites:
                    # `site - 1` maps the given site number onto a list index.
                    seq_list[site - 1] = "N"
            record.seq = Seq("".join(seq_list))
            write_sequences(record, outfile)
Example #6
0
 def test_write_sequences_from_generator(self, tmpdir, sequences_generator):
     """Write sequences supplied by a generator fixture to a FASTA file.

     The test expects ``write_sequences`` to report three sequences written.
     """
     destination = Path(tmpdir) / Path("new_sequences.fasta")
     written_count = write_sequences(
         sequences_generator,
         destination,
         "fasta",
     )
     assert written_count == 3
Example #7
0
 def test_write_sequences(self, tmpdir, sequences):
     """Write the ``sequences`` fixture to a FASTA file by path.

     The count returned by ``write_sequences`` must equal the number of
     input sequences.
     """
     destination = Path(tmpdir) / Path("new_sequences.fasta")
     written_count = write_sequences(sequences, destination, "fasta")
     assert written_count == len(sequences)
            # same name already exists and if the hash is different.
            # The SHA-256 hex digest of the UTF-8 encoded sequence text is
            # the value compared between records that share a name.
            sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
            if record.name in sequence_hash_by_name:
                # If the hashes differ (multiple entries with the same
                # strain name but different sequences), we keep the first
                # sequence and add the strain to a set of duplicates to
                # report at the end.
                if sequence_hash_by_name.get(record.name) != sequence_hash:
                    duplicate_strains.add(record.name)

                # If the current strain has been seen before, don't write
                # out its sequence again.
                continue

            # First record seen under this name: remember its hash and write
            # it to the output handle.
            sequence_hash_by_name[record.name] = sequence_hash
            write_sequences(record, output_handle)

    # Report every name that appeared with more than one distinct sequence.
    if len(duplicate_strains) > 0:
        # Duplicates are reported as an error with exit code 1 by default.
        error_mode = "ERROR"
        exit_code = 1

        # With args.warn_about_duplicates set, the report is downgraded to a
        # warning with exit code 0.
        if args.warn_about_duplicates:
            error_mode = "WARNING"
            exit_code = 0

        # NOTE(review): exit_code is assigned here but not used within this
        # excerpt; presumably it is passed to sys.exit further down -- confirm.
        print(
            f"{error_mode}: Detected the following duplicate input strains with different sequences:",
            file=sys.stderr
        )
        # One strain name per line, indented by four spaces, on stderr.
        for strain in duplicate_strains:
            print(textwrap.indent(strain, "    "), file=sys.stderr)