Exemple #1
0
def interleave(infile_1, infile_2, outfile):
    '''Makes an interleaved file from two sequence files.

    Sequences are read in lockstep from infile_1 and infile_2 and written to
    outfile alternately: one sequence from infile_1, then its mate from
    infile_2.

    Raises Error if the two files do not contain the same number of
    sequences. Any other exception raised while reading is propagated
    unchanged instead of being reported as a missing mate. The output file
    is closed whether or not an error occurs.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    try:
        for seq_1 in seq_reader_1:
            try:
                seq_2 = next(seq_reader_2)
            except StopIteration:
                # infile_2 ran out before infile_1
                raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

            print(seq_1, file=f_out)
            print(seq_2, file=f_out)

        # infile_1 is exhausted, so anything still left in infile_2 has no mate
        seq_2 = next(seq_reader_2, None)
        if seq_2 is not None:
            raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')
    finally:
        utils.close(f_out)
Exemple #2
0
def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
    '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
    of every sequence name in infile_1, unless it already ends with suffix1. Similar for suffix2
    and infile_2.

    Raises Error if the two files do not contain the same number of sequences. Any other
    exception raised while reading is propagated unchanged instead of being reported as a
    missing mate. The output file is closed whether or not an error occurs.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    try:
        for seq_1 in seq_reader_1:
            try:
                seq_2 = next(seq_reader_2)
            except StopIteration:
                # infile_2 ran out before infile_1
                raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

            if suffix1 is not None and not seq_1.id.endswith(suffix1):
                seq_1.id += suffix1
            if suffix2 is not None and not seq_2.id.endswith(suffix2):
                seq_2.id += suffix2

            print(seq_1, file=f_out)
            print(seq_2, file=f_out)

        # infile_1 is exhausted, so anything still left in infile_2 has no mate
        seq_2 = next(seq_reader_2, None)
        if seq_2 is not None:
            raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')
    finally:
        utils.close(f_out)
Exemple #3
0
    def test_file_reader_gff(self):
        '''Test read gff file'''
        good_names = (
            'sequences_test_gffv3.gff',
            'sequences_test_gffv3.no_FASTA_line.gff',
        )
        bad_names = (
            'sequences_test_gffv3.no_seq.gff',
            'sequences_test_gffv3.no_seq.2.gff',
        )

        # every good file must yield seq1, seq2, ... each with the same bases
        for good_path in [os.path.join(data_dir, name) for name in good_names]:
            for counter, seq in enumerate(sequences.file_reader(good_path), start=1):
                expected = sequences.Fasta('seq' + str(counter), 'ACGTACGTAC')
                self.assertEqual(seq, expected)

        # every bad file must raise sequences.Error at some point while being read
        for bad_path in [os.path.join(data_dir, name) for name in bad_names]:
            with self.assertRaises(sequences.Error):
                for seq in sequences.file_reader(bad_path):
                    pass
def run(description):
    '''Command-line entry point for "fastaq to_random_subset".

    Parses the command line, seeds the random module with --seed, then keeps
    each read of the input file when 100 * random.random() <= percent. If
    --mate_file is given, one mate is read for every input read and is
    written straight after the read whenever the read is kept. Prints a
    message to stderr and exits with status 1 if the mates file runs out.

    The description argument is not used by this function.'''
    arg_parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file.  Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    arg_specs = [
        ('--mate_file', {'help': 'Name of mates file'}),
        ('--seed', {'help': 'Seed for random number generator. If not given, python\'s default is used', 'metavar': 'INT'}),
        ('infile', {'help': 'Name of input file'}),
        ('outfile', {'help': 'Name of output file'}),
        ('percent', {'type': float, 'help': 'Per cent probability of keeping any given read (pair) in [0,100]', 'metavar': 'FLOAT'}),
    ]
    for arg_name, arg_kwargs in arg_specs:
        arg_parser.add_argument(arg_name, **arg_kwargs)
    opts = arg_parser.parse_args()

    random.seed(a=opts.seed)
    seq_reader = sequences.file_reader(opts.infile)
    fout = utils.open_file_write(opts.outfile)

    use_mates = bool(opts.mate_file)
    mate_reader = sequences.file_reader(opts.mate_file) if use_mates else None

    for read in seq_reader:
        mate = None
        if use_mates:
            # the mate is consumed even when the read is not kept, to stay in step
            try:
                mate = next(mate_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', read.id, file=sys.stderr)
                sys.exit(1)

        if 100 * random.random() <= opts.percent:
            print(read, file=fout)
            if use_mates:
                print(mate, file=fout)

    utils.close(fout)
Exemple #5
0
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    '''Writes the sequences of infile that pass all the given tests to outfile.

    A sequence passes when all of the following hold:
      - minlength <= len(sequence) <= maxlength
      - regex is None, or regex matches somewhere in the sequence id
      - ids_file is None, or the sequence id is one of the lines of ids_file
        (trailing whitespace of each line is stripped)

    invert=True writes the sequences (or pairs) that would otherwise be
    dropped.

    If mate_in is given, mate_out must also be given. One mate is then read
    from mate_in for every sequence of infile, and kept pairs are written to
    outfile and mate_out respectively. A pair is wanted when both mates pass,
    or, if both_mates_pass is False, when at least one mate passes.

    Raises Error if mate_in is given without mate_out, or if mate_in runs out
    of sequences before infile does. All files opened here are closed
    whether or not an error occurs.'''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            for line in f:
                ids_from_file.add(line.rstrip())
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error('Error in filter! mate_in provided. Must also provide mate_out')

    # compiled before any output file is created, so a bad pattern leaves nothing behind
    r = re.compile(regex) if regex is not None else None

    def passes(seq):
        return minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(seq.id) is not None) \
              and (ids_file is None or seq.id in ids_from_file)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    f_out_mate = None

    try:
        if mate_in:
            seq_reader_mate = sequences.file_reader(mate_in)
            f_out_mate = utils.open_file_write(mate_out)

        for seq in seq_reader:
            seq_passes = passes(seq)
            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except StopIteration:
                    raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

                mate_passes = passes(seq_mate)
                want_the_pair = (seq_passes and mate_passes) \
                                or ((seq_passes or mate_passes) and not both_mates_pass)
                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        utils.close(f_out)
        if f_out_mate is not None:
            utils.close(f_out_mate)
Exemple #6
0
def fasta_to_fastq(fasta_in, qual_in, outfile):
    '''Combines a fasta file and a matching qual file into outfile.

    The two inputs are read in lockstep. For each pair of records the ids
    must be equal; the qual record is split on whitespace, each score is
    converted to int, and the result of seq.to_Fastq(scores) is written.

    Raises Error if the ids of a pair differ, or if the qual file has fewer
    records than the fasta file. The output file is closed whether or not an
    error occurs.'''
    fa_reader = sequences.file_reader(fasta_in)
    qual_reader = sequences.file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    try:
        for seq in fa_reader:
            try:
                qual = next(qual_reader)
            except StopIteration:
                # without this, a bare StopIteration would escape to the caller
                raise Error('No quality scores found in qual file for sequence', seq.id)

            if seq.id != qual.id:
                raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

            scores = [int(x) for x in qual.seq.split()]
            print(seq.to_Fastq(scores), file=f_out)
    finally:
        utils.close(f_out)
Exemple #7
0
def trim_contigs(infile, outfile, trim):
    '''Trims the ends of sequences and widens their gaps, writing the result to outfile.

    For each sequence of infile:
      - sequences shorter than 2 * trim are dropped;
      - for every gap returned by seq.gaps(), up to trim bases before the gap
        and up to trim bases after it are overwritten with N;
      - seq.trim(trim, trim) and then seq.trim_Ns() are applied;
      - the sequence is written only if it still contains a character other
        than n/N.

    The output file is closed whether or not an error occurs.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    # compiled once here rather than once per sequence
    non_n_regex = re.compile('[^nN]')

    try:
        for seq in seq_reader:
            if len(seq) < 2 * trim:
                continue

            gaps = seq.gaps()
            bases = list(seq.seq)

            # extend the length of each gap
            for gap in gaps:
                left_start = max(gap.start - trim, 0)
                right_end = min(gap.end + trim + 1, len(seq))

                for i in range(left_start, gap.start):
                    bases[i] = 'N'

                for i in range(gap.end, right_end):
                    bases[i] = 'N'

            seq.seq = ''.join(bases)

            # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
            seq.trim(trim, trim)
            seq.trim_Ns()

            # check that there is some non-N sequence left over
            if non_n_regex.search(seq.seq) is not None:
                print(seq, file=fout)
    finally:
        utils.close(fout)
Exemple #8
0
def count_sequences(infile):
    '''Returns the number of sequences in a file'''
    # one tick per record yielded by the reader; the records themselves are discarded
    return sum(1 for _ in sequences.file_reader(infile))
Exemple #9
0
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    '''Writes every sequence of infile to outfile as fasta.

    sequences.Fasta.line_length is set to line_length while writing and put
    back to its previous value afterwards, even if an error occurs. Fastq
    records are converted to Fasta (id and bases only); all other records
    are printed as they are.

    strip_after_first_whitespace: call seq.strip_after_first_whitespace() on
        each sequence before writing it.
    check_unique: after writing, report each repeated sequence name on
        stderr and raise Error if any name was seen more than once.

    The output file is closed whether or not an error occurs.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    used_names = {}

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if check_unique:
                used_names[seq.id] = used_names.get(seq.id, 0) + 1

            if isinstance(seq, sequences.Fastq):
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # line_length is a class attribute, so it must not stay changed for later callers
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
Exemple #10
0
def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N'''
    out_handle = utils.open_file_write(outfile)
    seq_reader = sequences.file_reader(infile)

    for record in seq_reader:
        # the record is changed in place, then written
        record.replace_non_acgt()
        print(record, file=out_handle)

    utils.close(out_handle)
Exemple #11
0
 def test_file_reader_fasta(self):
     '''file_reader should iterate through a fasta file correctly'''
     fasta_path = os.path.join(data_dir, 'sequences_test.fa')
     # records are expected to be named 1, 2, 3, ... and all have the same bases
     for counter, seq in enumerate(sequences.file_reader(fasta_path), start=1):
         self.assertEqual(seq, sequences.Fasta(str(counter), 'ACGTA'))
Exemple #12
0
def translate(infile, outfile, frame=0):
    '''Writes the result of seq.translate(frame=frame) to outfile for every sequence of infile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        translated = record.translate(frame=frame)
        print(translated, file=out_handle)

    utils.close(out_handle)
Exemple #13
0
def reverse_complement(infile, outfile):
    '''Writes every sequence of infile to outfile after calling its revcomp() method.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # revcomp() changes the record in place
        record.revcomp()
        print(record, file=out_handle)

    utils.close(out_handle)
Exemple #14
0
def replace_bases(infile, outfile, old, new):
    '''Writes every sequence of infile to outfile after calling seq.replace_bases(old, new) on it.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # replace_bases() changes the record in place
        record.replace_bases(old, new)
        print(record, file=out_handle)

    utils.close(out_handle)
Exemple #15
0
def strip_illumina_suffix(infile, outfile):
    '''Writes every sequence of infile to outfile after calling its strip_illumina_suffix() method.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # strip_illumina_suffix() changes the record in place
        record.strip_illumina_suffix()
        print(record, file=out_handle)

    utils.close(out_handle)
Exemple #16
0
def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance) bases in each file.

    Output files are named outfiles_prefix.1, outfiles_prefix.2, ...
      - A sequence of length in [chunk_size, chunk_size + tolerance] gets a
        file of its own.
      - A longer sequence is cut into chunks of chunk_size bases, one file
        per chunk. A final chunk of at most tolerance bases is merged into
        the chunk before it. Each chunk is named id:start-end (1-based,
        inclusive) and a line "chunk_id, original id, 0-based offset"
        (tab separated) is written to outfiles_prefix.coords.
      - Sequences shorter than chunk_size are held in memory and written at
        the end, packed into files of up to chunk_size + tolerance bases.

    skip_if_all_Ns: skip sequences, and chunks of long sequences, for which
        seq.is_all_Ns(...) is true.

    The coords file is closed whether or not an error occurs.'''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    try:
        for seq in seq_reader:
            if skip_if_all_Ns and seq.is_all_Ns():
                continue
            if len(seq) < chunk_size:
                small_sequences.append(copy.copy(seq))
            elif len(seq) <= chunk_size + tolerance:
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                print(seq, file=f)
                utils.close(f)
                file_count += 1
            else:
                # make list of (start, end) chunk coords, end exclusive and
                # never past the end of the sequence
                chunks = [(x, min(x + chunk_size, len(seq))) for x in range(0, len(seq), chunk_size)]
                if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
                    chunks[-2] = (chunks[-2][0], chunks[-1][1])
                    chunks.pop()

                # write one output file per chunk
                for start, end in chunks:
                    if skip_if_all_Ns and seq.is_all_Ns(start=start, end=end - 1):
                        continue
                    f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                    chunk_id = seq.id + ':' + str(start + 1) + '-' + str(end)
                    print(sequences.Fasta(chunk_id, seq[start:end]), file=f)
                    # chunks are contiguous from 0, so the offset of a chunk is its start
                    print(chunk_id, seq.id, start, sep='\t', file=f_coords)
                    utils.close(f)
                    file_count += 1
    finally:
        utils.close(f_coords)

    # write files of small sequences
    if small_sequences:
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0
        for seq in small_sequences:
            # start a new file when this sequence would overfill the current one
            if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0

            print(seq, file=f)
            base_count += len(seq)

        utils.close(f)
Exemple #17
0
def search_for_seq(infile, outfile, search_string):
    '''Writes one tab-separated line per hit of search_string in the sequences of infile.

    Each line holds the sequence id, the first element of the hit plus one,
    and the second element of the hit, where hits come from
    seq.search(search_string).'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        for hit in record.search(search_string):
            first_plus_one = hit[0] + 1
            print(record.id, first_plus_one, hit[1], sep='\t', file=out_handle)

    utils.close(out_handle)
Exemple #18
0
def trim_Ns_at_end(infile, outfile):
    '''Calls trim_Ns() on every sequence of infile and writes those that are not empty afterwards.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        record.trim_Ns()
        if len(record) == 0:
            # nothing left of this sequence, so it is dropped
            continue
        print(record, file=out_handle)

    utils.close(out_handle)
Exemple #19
0
def trim(infile, outfile, start, end):
    '''Calls seq.trim(start, end) on every sequence of infile and writes those that are not empty afterwards.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        record.trim(start, end)
        if len(record) == 0:
            # nothing left of this sequence, so it is dropped
            continue
        print(record, file=out_handle)

    utils.close(out_handle)
Exemple #20
0
def to_fasta_union(infile, outfile, seqname='union'):
    '''Writes one Fasta record called seqname whose bases are all sequences of infile joined in file order.'''
    # the whole input is read before the output file is opened
    joined_bases = ''.join([record.seq for record in sequences.file_reader(infile)])

    out_handle = utils.open_file_write(outfile)
    union_record = sequences.Fasta(seqname, joined_bases)
    print(union_record, file=out_handle)
    utils.close(out_handle)
Exemple #21
0
def to_boulderio(infile, outfile):
    '''Converts input sequence file into a "Boulder-IO format", as used by primer3'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # one three-line entry per sequence: id, template, then the "=" line
        id_line = "SEQUENCE_ID=" + record.id
        print(id_line, file=out_handle)
        template_line = "SEQUENCE_TEMPLATE=" + record.seq
        print(template_line, file=out_handle)
        print("=", file=out_handle)

    utils.close(out_handle)
Exemple #22
0
    def test_file_reader_embl(self):
        '''Test read embl file'''
        good_path = os.path.join(data_dir, 'sequences_test.embl')

        # records are expected to be named seq1, seq2, ... with bases taken from expected_embl
        for index, seq in enumerate(sequences.file_reader(good_path)):
            expected = sequences.Fasta('seq' + str(index + 1), expected_embl[index])
            self.assertEqual(seq, expected)

        bad_names = (
            'sequences_test.embl.bad',
            'sequences_test.embl.bad2',
        )

        # every bad file must raise sequences.Error at some point while being read
        for bad_path in [os.path.join(data_dir, name) for name in bad_names]:
            with self.assertRaises(sequences.Error):
                for seq in sequences.file_reader(bad_path):
                    pass
Exemple #23
0
def expand_nucleotides(infile, outfile):
    '''Writes the expansions of every sequence of infile to outfile.

    For each sequence, seq.expand_nucleotides() is called. If it returns
    more than one sequence, all of them are written; otherwise the original
    sequence is written unchanged.

    The output file is closed whether or not an error occurs.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        for seq in seq_reader:
            seqs = seq.expand_nucleotides()
            if len(seqs) > 1:
                for s in seqs:
                    print(s, file=fout)
            else:
                print(seq, file=fout)
    finally:
        utils.close(fout)
Exemple #24
0
def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2,  gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None,):
    '''Makes reads from every sequence of infile and writes them to outfile as Fasta records.

    Each read is named <sequence id>_<start>_<end> with 1-based inclusive
    coordinates. Sequences that are too short for the chosen method are
    skipped with a message on stderr.

    method:
      'tiling'  - reads of fixed_read_length bases starting every tile_step
                  bases; the last read is cut short at the end of the
                  sequence. Sequences shorter than fixed_read_length are
                  skipped.
      'gamma'   - read lengths drawn from numpy.random.gamma(gamma_shape,
                  scale=gamma_scale) and redrawn until they lie between
                  gamma_min_length and the sequence length; start positions
                  are random. Reads are made until their total length reaches
                  coverage * len(seq) - 0.5 * gamma_min_length. Sequences
                  shorter than gamma_min_length are skipped.
      'uniform' - reads of fixed_read_length bases at random start
                  positions, made until their total length reaches
                  coverage * len(seq) - 0.5 * fixed_read_length. Sequences
                  shorter than fixed_read_length are skipped.

    seed: if not None, passed to random.seed().
    ins_skip, ins_window: either both None or both given. When ins_skip is
        truthy, fa.add_insertions(skip=ins_skip, window=ins_window) is called
        on each read before it is written.'''
    # NOTE(review): these checks are asserts, so they are not run under python -O
    assert method in ['tiling', 'gamma', 'uniform']
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]
    # NOTE(review): only the random module is seeded here; the read lengths of the
    # 'gamma' method come from numpy.random, which this call does not seed
    if seed is not None:
        random.seed(a=seed)
    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                # the final tile is clipped to the end of the sequence
                end = min(len(seq), i + fixed_read_length)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                # stop once a tile has reached the end, so no shorter tiles follow it
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                # redraw until the length is usable for this sequence
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                # start and end are 0-based, end inclusive
                start = random.randint(0, len(seq) - read_length)
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                # counted before any insertions are added to the read
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                # start and end are 0-based, end inclusive
                start = random.randint(0, len(seq) - fixed_read_length)
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                # counted before any insertions are added to the read
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)


    utils.close(f)
Exemple #25
0
def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50, check_revcomp=False):
    '''Trims known sequences off the ends of read pairs and writes the pairs that stay long enough.

    The sequences to trim are loaded from to_trim_file with file_to_dict().
    For each read of a pair, the first of these that the read starts with is
    removed from its start. If check_revcomp is True, revcomp() is applied to
    the loaded sequences and the first of the results that the read ends with
    is removed from its end.

    A pair is written (first mate to outfile_1, second to outfile_2) only if
    both mates are at least min_length long after trimming.

    Raises Error if infile_2 runs out of sequences before infile_1. Both
    output files are closed whether or not an error occurs.'''
    to_trim_seqs = {}
    file_to_dict(to_trim_file, to_trim_seqs)
    trim_seqs = [x.seq for x in to_trim_seqs.values()]
    if check_revcomp:
        # revcomp() changes the stored sequences in place; trim_seqs was taken before this
        for seq in to_trim_seqs.values():
            seq.revcomp()
        trim_seqs_revcomp = [x.seq for x in to_trim_seqs.values()]
    else:
        trim_seqs_revcomp = []

    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out_1 = utils.open_file_write(outfile_1)
    f_out_2 = utils.open_file_write(outfile_2)

    try:
        for seq_1 in seq_reader_1:
            try:
                seq_2 = next(seq_reader_2)
            except StopIteration:
                raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

            for seq in seq_1, seq_2:
                # only the first matching sequence is trimmed from each end
                for trim_seq in trim_seqs:
                    if seq.seq.startswith(trim_seq):
                        seq.trim(len(trim_seq), 0)
                        break

                for trim_seq in trim_seqs_revcomp:
                    if seq.seq.endswith(trim_seq):
                        seq.trim(0, len(trim_seq))
                        break

            if len(seq_1) >= min_length and len(seq_2) >= min_length:
                print(seq_1, file=f_out_1)
                print(seq_2, file=f_out_2)
    finally:
        utils.close(f_out_1)
        utils.close(f_out_2)
Exemple #26
0
    def test_file_reader_phylip(self):
        '''Test read phylip file'''
        phylip_names = (
            'sequences_test_phylip.interleaved',
            'sequences_test_phylip.interleaved2',
            'sequences_test_phylip.sequential',
        )
        phylip_paths = [os.path.join(data_dir, name) for name in phylip_names]

        # all three layouts are compared against the same expected records
        expected_seqs = [
            sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
            sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
            sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA'),
        ]

        for path in phylip_paths:
            for index, seq in enumerate(sequences.file_reader(path)):
                self.assertEqual(expected_seqs[index], seq)

        # files made by seaview are a little different in the first line.
        # Test one of these
        seaview_path = os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview')
        expected_seqs = [
            sequences.Fasta('seq1', 96 * 'G' + 'T'),
            sequences.Fasta('seq2', 94 * 'A' + 'G'),
        ]

        for index, seq in enumerate(sequences.file_reader(seaview_path)):
            print(seq)
            self.assertEqual(expected_seqs[index], seq)
Exemple #27
0
def fastaq_to_fake_qual(infile, outfile, q=40):
    '''Writes a qual-style file that gives the score q to every base of every sequence of infile.

    Each record is a ">" + id line followed by space-separated scores. The
    scores are wrapped at sequences.Fasta.line_length scores per line, or
    put on a single line when that value is 0.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    def scores_line(n_bases):
        # n_bases copies of the score, separated by single spaces
        return ' '.join([str(q)] * n_bases)

    for seq in seq_reader:
        print('>' + seq.id, file=fout)
        per_line = sequences.Fasta.line_length
        if per_line == 0:
            print(scores_line(len(seq)), file=fout)
            continue

        for offset in range(0, len(seq), per_line):
            remaining = len(seq) - offset
            print(scores_line(min(per_line, remaining)), file=fout)

    utils.close(fout)
Exemple #28
0
def mean_length(infile, limit=None):
    '''Returns the mean length of the sequences in the input file. By default uses all sequences. To limit to the first N sequences, use limit=N'''
    n_seqs = 0
    n_bases = 0

    for record in sequences.file_reader(infile):
        n_bases += len(record)
        n_seqs += 1
        if limit is None:
            continue
        if n_seqs >= limit:
            # enough sequences have been seen
            break

    assert n_seqs > 0
    return n_bases / n_seqs
Exemple #29
0
def mean_length(infile, limit=None):
    '''Returns the mean length of the sequences in the input file. By default uses all sequences. To limit to the first N sequences, use limit=N'''
    lengths_sum = 0
    seqs_seen = 0
    limited = limit is not None

    for record in sequences.file_reader(infile):
        seqs_seen += 1
        lengths_sum += len(record)
        if limited and seqs_seen >= limit:
            # the requested number of sequences has been reached
            break

    assert seqs_seen > 0
    return lengths_sum / seqs_seen
Exemple #30
0
def fastaq_to_fake_qual(infile, outfile, q=40):
    '''Writes a qual-style file in which every base of every sequence of infile has the score q.

    For each sequence a ">" + id line is written, then the scores separated
    by spaces. When sequences.Fasta.line_length is 0 all scores go on one
    line; otherwise there are at most that many scores per line.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        print('>' + seq.id, file=fout)
        width = sequences.Fasta.line_length

        if width == 0:
            # no wrapping: a single line with one score per base
            chunk_sizes = [len(seq)]
        else:
            chunk_sizes = [min(width, len(seq) - start) for start in range(0, len(seq), width)]

        for n_scores in chunk_sizes:
            print(' '.join([str(q)] * n_scores), file=fout)

    utils.close(fout)
Exemple #31
0
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    '''Writes one tab-separated CDS line per ORF found in the sequences of infile.

    ORFs come from seq.all_orfs(min_length=min_length). Each line holds the
    sequence id, tool_name, 'CDS', start + 1, end + 1, '.', the strand ('-'
    when the ORF is flagged as reverse complement, '+' otherwise) and '.'.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        for coords, revcomp in record.all_orfs(min_length=min_length):
            strand = '-' if revcomp else '+'
            fields = (record.id, tool_name, 'CDS', coords.start+1, coords.end+1, '.', strand, '.')
            print(*fields, sep='\t', file=out_handle)

    utils.close(out_handle)
Exemple #32
0
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    '''Writes the ORFs of every sequence of infile to outfile, one tab-separated line each.

    The columns are: sequence id, tool_name, 'CDS', ORF start + 1,
    ORF end + 1, '.', strand, '.'. The ORFs are those returned by
    seq.all_orfs(min_length=min_length); the strand is '-' when the ORF is
    flagged as reverse complement and '+' otherwise.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    strand_symbols = {True: '-', False: '+'}

    for seq in seq_reader:
        found_orfs = seq.all_orfs(min_length=min_length)
        for coords, revcomp in found_orfs:
            # bool() so that any truthy flag maps to '-', as a plain if would
            strand = strand_symbols[bool(revcomp)]
            print(seq.id, tool_name, 'CDS', coords.start+1, coords.end+1, '.', strand, '.', sep='\t', file=fout)

    utils.close(fout)
Exemple #33
0
def run(description):
    '''Command-line entry point for "fastaq to_random_subset".

    Parses the command line, then keeps each read of the input file with
    probability percent/100. If --mate_file is given, one mate is read for
    every input read and is written straight after the read whenever the
    read is kept. Prints a message to stderr and exits with status 1 if the
    mates file runs out of reads.

    The description argument is not used by this function.'''
    parser = argparse.ArgumentParser(
        description=
        'Takes a random subset of reads from a sequence file and optionally the corresponding read '
        + 'from a mates file.  Ouptut is interleaved if mates file given',
        usage='fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument(
        'percent',
        type=int,
        help='Per cent probability of keeping any given read (pair) in [0,100]',
        metavar='INT')
    options = parser.parse_args()

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    if options.mate_file:
        mate_seq_reader = sequences.file_reader(options.mate_file)

    for seq in seq_reader:
        if options.mate_file:
            # the mate is consumed even when the read is not kept, to stay in step
            try:
                mate_seq = next(mate_seq_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read',
                      seq.id,
                      file=sys.stderr)
                sys.exit(1)
        # random.random() is in [0, 1), so percent=0 keeps nothing and
        # percent=100 keeps everything. The previous randint(0, 100) <= percent
        # test drew from 101 values and kept reads with probability
        # (percent + 1) / 101.
        if 100 * random.random() < options.percent:
            print(seq, file=fout)
            if options.mate_file:
                print(mate_seq, file=fout)

    utils.close(fout)
Exemple #34
0
    def test_file_reader_phylip(self):
        '''Test read phylip file'''
        data_path = lambda name: os.path.join(data_dir, name)

        test_files = [
            data_path('sequences_test_phylip.interleaved'),
            data_path('sequences_test_phylip.interleaved2'),
            data_path('sequences_test_phylip.sequential'),
        ]

        # the same three records are expected from every layout
        expected_seqs = [
            sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
            sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
            sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA'),
        ]

        for fname in test_files:
            position = 0
            for seq in sequences.file_reader(fname):
                self.assertEqual(expected_seqs[position], seq)
                position += 1

        # files made by seaview are a little different in the first line.
        # Test one of these
        expected_seqs = [
            sequences.Fasta('seq1', 96 * 'G' + 'T'),
            sequences.Fasta('seq2', 94 * 'A' + 'G'),
        ]

        position = 0
        for seq in sequences.file_reader(data_path('sequences_test_phylip.made_by_seaview')):
            print(seq)
            self.assertEqual(expected_seqs[position], seq)
            position += 1
Exemple #35
0
def fastaq_to_mira_xml(infile, outfile):
    '''Writes an XML file with one <trace> element per sequence of infile.

    Each element holds the sequence id as trace_name, the sequence length as
    clip_quality_right and a fixed clip_vector_left of 1. All elements are
    wrapped in a single <trace_volume> element.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    print('<?xml version="1.0"?>', file=fout)
    print('<trace_volume>', file=fout)

    for seq in seq_reader:
        trace_lines = [
            '    <trace>',
            '        <trace_name>' + seq.id + '</trace_name>',
            '        <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
            '        <clip_vector_left>1</clip_vector_left>',
            '    </trace>',
        ]
        print('\n'.join(trace_lines), file=fout)

    print('</trace_volume>', file=fout)
    utils.close(fout)
Exemple #36
0
def fastaq_to_mira_xml(infile, outfile):
    '''Writes one <trace> XML element per sequence of infile, inside a <trace_volume> element.

    For every sequence the element records its id (trace_name), its length
    (clip_quality_right) and the constant 1 (clip_vector_left).'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)
    print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=out_handle)

    for record in records:
        name_line = '        <trace_name>' + record.id + '</trace_name>'
        clip_right_line = '        <clip_quality_right>' + str(len(record)) + '</clip_quality_right>'
        for xml_line in (
            '    <trace>',
            name_line,
            clip_right_line,
            '        <clip_vector_left>1</clip_vector_left>',
            '    </trace>',
        ):
            print(xml_line, file=out_handle)

    print('</trace_volume>', file=out_handle)
    utils.close(out_handle)
Exemple #37
0
    def test_file_reader_embl(self):
        '''Test read embl file'''
        embl_path = os.path.join(data_dir, 'sequences_test.embl')

        # record number n is expected to be called seq<n> and to carry expected_embl[n - 1]
        for counter, seq in enumerate(sequences.file_reader(embl_path), start=1):
            wanted = sequences.Fasta('seq' + str(counter), expected_embl[counter - 1])
            self.assertEqual(seq, wanted)

        # reading either of these files to the end must raise sequences.Error
        for bad_name in ('sequences_test.embl.bad', 'sequences_test.embl.bad2'):
            filename = os.path.join(data_dir, bad_name)
            with self.assertRaises(sequences.Error):
                for seq in sequences.file_reader(filename):
                    pass
Exemple #38
0
    def test_print_line_length(self):
        '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
        infile = os.path.join(data_dir, 'sequences_test_one-per-line.fa')
        # (value of Fasta.line_length, file the output must be identical to)
        cases = [
            (0, os.path.join(data_dir, 'sequences_test_one-per-line.fa')),
            (3, os.path.join(data_dir, 'sequences_test_3-per-line.fa')),
        ]
        tmp_out = 'tmp.line_length_test.fa'

        try:
            for line_length, correct_file in cases:
                seq_reader = sequences.file_reader(infile)
                sequences.Fasta.line_length = line_length
                try:
                    f = utils.open_file_write(tmp_out)
                    for s in seq_reader:
                        print(s, file=f)
                    utils.close(f)
                    self.assertTrue(filecmp.cmp(correct_file, tmp_out))
                finally:
                    # remove the temporary file even when the comparison fails
                    if os.path.exists(tmp_out):
                        os.unlink(tmp_out)
        finally:
            # line_length is a class attribute shared with every other test,
            # so it is reset even when this test fails
            sequences.Fasta.line_length = 60
Exemple #39
0
def capillary_to_pairs(infile, outprefix):
    '''Sorts capillary reads into paired and unpaired output files.

    Each sequence id is split with seq.split_capillary_id(). Forward reads
    are renamed prefix/1, reverse reads prefix/2 and all others prefix. When
    the same prefix is seen more than once for one direction, only the
    longest read is kept (the first one seen on a tie).

    Prefixes with both a forward and a reverse read are written, forward
    first, to outprefix.paired.gz. Everything else is written to
    outprefix.unpaired.gz.'''
    seq_reader = sequences.file_reader(infile)
    fwd_seqs = {}
    rev_seqs = {}
    unpaired_seqs = {}
    # read direction -> (suffix added to the prefix, dict collecting those reads)
    by_direction = {
        'fwd': ('/1', fwd_seqs),
        'rev': ('/2', rev_seqs),
    }

    # hash the sequences, only taking longest where an end has been sequenced more than once
    for seq in seq_reader:
        id_info = seq.split_capillary_id()
        direction = id_info['dir']
        if direction in by_direction:
            suffix, store = by_direction[direction]
            seq.id = id_info['prefix'] + suffix
        else:
            store = unpaired_seqs
            seq.id = id_info['prefix']

        key = id_info['prefix']
        is_new = key not in store
        if is_new or len(store[key]) < len(seq):
            store[key] = copy.copy(seq)

    # write the output files
    f_pe = utils.open_file_write(outprefix + '.paired.gz')
    f_up = utils.open_file_write(outprefix + '.unpaired.gz')

    for prefix, fwd_read in fwd_seqs.items():
        if prefix not in rev_seqs:
            print(fwd_read, file=f_up)
            continue
        print(fwd_read, file=f_pe)
        # pop so that only mateless reverse reads are left for the unpaired file
        print(rev_seqs.pop(prefix), file=f_pe)

    for leftovers in (rev_seqs, unpaired_seqs):
        for read in leftovers.values():
            print(read, file=f_up)

    utils.close(f_pe)
    utils.close(f_up)
Exemple #40
0
def scaffolds_to_contigs(infile, outfile, number_contigs=False):
    '''Writes the contigs of every scaffold in infile to outfile as FASTA.

    Contig positions come from each sequence's contig_coords() method.
    By default a contig is named <scaffold id>.<start>.<end> using 1-based
    coordinates. With number_contigs=True it is named <scaffold id>.<n>
    instead, where n counts the contigs of that scaffold from 1.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for scaffold in seq_reader:
        for index, coords in enumerate(scaffold.contig_coords(), start=1):
            if number_contigs:
                name_parts = [scaffold.id, str(index)]
            else:
                name_parts = [scaffold.id, str(coords.start + 1), str(coords.end + 1)]

            contig = sequences.Fasta('.'.join(name_parts),
                                     scaffold[coords.start:coords.end+1])
            print(contig, file=fout)

    utils.close(fout)
Exemple #41
0
def get_fast5(fastq, fast5, subset, batch_size, outdir, extension, threads):
    """ Extract the Fast5 reads that correspond to basecalled Fastq files.

    For every file in the `fastq` directory whose name ends with `extension`,
    the read ids are written to <outdir>/<stem>/<stem>.txt and `fast5_subset`
    is run on the `fast5` directory with that id list.

    If `subset` is given it is read as a tab-separated table and only Fastq
    files whose stem appears in its first column are processed. `fastq` and
    `outdir` must support the path operations used here (`glob`, `mkdir`, `/`).
    """

    outdir.mkdir(parents=True, exist_ok=True)
    fq_files = fastq.glob(f"*{extension}")

    names = []
    if subset:
        names = pandas.read_csv(subset, sep='\t').iloc[:, 0].tolist()
        print(f"Subset: {names}")

    for fq in fq_files:
        name = fq.stem

        # An empty subset list means "process everything"
        if names and name not in names:
            print(f"{name} not in subset, ignoring")
            continue

        sample_dir = outdir / name
        read_ids = [seq.id for seq in sequences.file_reader(str(fq))]
        sample_dir.mkdir(parents=True, exist_ok=True)

        read_id_list = sample_dir / f"{name}.txt"
        with open(read_id_list, 'w') as outfile:
            outfile.writelines(read_id + '\n' for read_id in read_ids)

        print(f"Fetching Fast5 for: {fq}")

        # Hand the id list over to the ONT Fast5 API command line tool
        command = (
            f"fast5_subset --input {fast5} --save_path {sample_dir} --read_id_list {read_id_list} "
            f"--batch_size {batch_size} --recursive --filename_base {name}_ --threads {threads}"
        )
        run_cmd(command)
Exemple #42
0
def get_seqs_flanking_gaps(infile, outfile, left, right):
    '''Writes a tab-separated table of the bases on either side of each gap.

    Gaps are taken from each sequence's gaps() method. For every gap one
    line is written with the sequence id, the 1-based gap start and end,
    up to `left` bases before the gap and up to `right` bases after it.
    Flanks are truncated at the ends of the sequence.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    header = ['#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases']
    print('\t'.join(header), file=fout)

    for record in seq_reader:
        for gap in record.gaps():
            # clamp the flanking windows so they stay inside the sequence
            left_flank = record.seq[max(gap.start - left, 0):gap.start]
            right_flank = record.seq[gap.end + 1:min(gap.end + right + 1, len(record))]
            fields = (record.id, gap.start + 1, gap.end + 1, left_flank, right_flank)
            print('\t'.join(str(field) for field in fields), file=fout)

    utils.close(fout)
Exemple #43
0
def to_fasta(infile,
             outfile,
             line_length=60,
             strip_after_first_whitespace=False):
    '''Converts a sequence file to FASTA.

    infile: name of the input sequence file
    outfile: name of the FASTA file to write
    line_length: value assigned to sequences.Fasta.line_length while writing
    strip_after_first_whitespace: if True, call strip_after_first_whitespace()
        on every sequence before it is written

    sequences.Fasta.line_length is a class-wide setting, so it is restored
    to its previous value (and the output file is closed) even if reading
    or writing fails part way through.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if isinstance(seq, sequences.Fastq):
                # drop the qualities by rebuilding the record as FASTA
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # never leave the shared class attribute modified for other callers
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)
Exemple #44
0
def enumerate_names(infile,
                    outfile,
                    start_index=1,
                    keep_illumina_suffix=False,
                    rename_file=None,
                    suffix=None):
    '''Renames the sequences in infile to consecutive numbers.

    The first sequence is named str(start_index) and the number goes up by
    one per sequence. With keep_illumina_suffix=True, a trailing /1 or /2
    on the old name is carried over to the new name. If rename_file is
    given, a tab-separated table of old and new names is written to it;
    the new name recorded there does not include `suffix`. If suffix is
    given it is appended to every name written to outfile.'''
    seq_reader = sequences.file_reader(infile)
    fout_seqs = utils.open_file_write(outfile)
    kept_suffixes = ('/1', '/2') if keep_illumina_suffix else ()

    fout_rename = None
    if rename_file is not None:
        fout_rename = utils.open_file_write(rename_file)
        print('#old\tnew', file=fout_rename)

    number = start_index
    for record in seq_reader:
        previous_name = record.id
        # first matching illumina suffix, or nothing at all
        carried = next((s for s in kept_suffixes if previous_name.endswith(s)), '')
        record.id = str(number) + carried

        if fout_rename is not None:
            print(previous_name, record.id, sep='\t', file=fout_rename)

        if suffix is not None:
            record.id += suffix

        print(record, file=fout_seqs)
        number += 1

    utils.close(fout_seqs)

    if fout_rename is not None:
        utils.close(fout_rename)
Exemple #45
0
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Concatenates all sequences of a multi fasta or fastq file, in file order, into a single sequence called seqname and writes it to outfile. The output is FASTQ if the first input record is a Fastq, otherwise FASTA'''
    records = [copy.copy(record) for record in sequences.file_reader(infile)]
    joined_bases = ''.join(record.seq for record in records)

    # the type of the first record decides the output format
    if type(records[0]) == sequences.Fastq:
        joined_quals = ''.join(record.qual for record in records)
        merged = sequences.Fastq(seqname, joined_bases, joined_quals)
    else:
        merged = sequences.Fasta(seqname, joined_bases)

    # the individual records are no longer needed once merged
    del records[:]

    out_handle = utils.open_file_write(outfile)
    print(merged, file=out_handle)
    utils.close(out_handle)
Exemple #46
0
def split_by_fixed_size_onefile(infile, outfile, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Cuts every sequence of infile into consecutive pieces of chunk_size
       bases and writes all pieces to outfile. The final piece of a sequence
       may be up to (chunk_size + tolerance) long. Each piece is named
       <id>.<start>_<end> with a 1-based start. With skip_if_all_Ns=True,
       pieces for which is_all_Ns() is true are not written.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for record in seq_reader:
        total_length = len(record)

        for start in range(0, total_length, chunk_size):
            # let this piece run to the end if the remainder fits in the tolerance
            remainder_fits = start + chunk_size + tolerance >= total_length
            end = total_length if remainder_fits else start + chunk_size

            piece = record.subseq(start, end)
            if not skip_if_all_Ns or not piece.is_all_Ns():
                piece.id += '.' + str(start + 1) + '_' + str(end)
                print(piece, file=f_out)

            if end == total_length:
                break

    utils.close(f_out)
Exemple #47
0
def to_unique_by_id(infile, outfile):
    '''Writes one sequence per id to outfile. When an id occurs more than
    once the longest sequence is kept (the earliest one on ties). Zero-length
    sequences are dropped. Output order follows the first appearance of
    each id in infile.'''
    longest = {}
    first_seen_order = []

    for record in sequences.file_reader(infile):
        if not len(record):
            continue

        current_best = longest.get(record.id)
        if current_best is None:
            first_seen_order.append(record.id)
        elif len(current_best) >= len(record):
            # an equally long or longer sequence with this id is already stored
            continue

        longest[record.id] = copy.copy(record)

    out_handle = utils.open_file_write(outfile)
    for seq_id in first_seen_order:
        print(longest[seq_id], file=out_handle)
    utils.close(out_handle)
Exemple #48
0
def to_fasta(infile,
             outfile,
             line_length=60,
             strip_after_first_whitespace=False,
             check_unique=False):
    '''Converts a sequence file to FASTA.

    infile: name of the input sequence file
    outfile: name of the FASTA file to write
    line_length: value assigned to sequences.Fasta.line_length while writing
    strip_after_first_whitespace: if True, call strip_after_first_whitespace()
        on every sequence before it is written
    check_unique: if True, count how often each (possibly stripped) name
        occurs. After the whole file has been written, every repeated name
        is reported on stderr and Error is raised.

    sequences.Fasta.line_length is a class-wide setting, so it is restored
    to its previous value (and the output file is closed) even if reading
    or writing fails part way through.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    used_names = {}

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if check_unique:
                used_names[seq.id] = used_names.get(seq.id, 0) + 1

            if isinstance(seq, sequences.Fastq):
                # drop the qualities by rebuilding the record as FASTA
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # never leave the shared class attribute modified for other callers
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found',
                      count,
                      'times',
                      file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
Exemple #49
0
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Splits an interleaved sequence file into two files.

    Records are taken in pairs: the first of each pair is written to
    outfile_1 and the second to outfile_2. With fasta_out=True every record
    is written as FASTA (id and sequence only).

    Raises Error if the input holds an odd number of records, i.e. the
    last record has no mate. Both output files are closed first.'''
    def write_record(record, handle):
        if fasta_out:
            print(sequences.Fasta(record.id, record.seq), file=handle)
        else:
            print(record, file=handle)

    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)

    for seq in seq_reader:
        write_record(seq, f_1)

        try:
            # Use the record handed back by the reader rather than assuming
            # that advancing the reader updated `seq` in place.
            mate = next(seq_reader)
        except StopIteration:
            utils.close(f_1)
            utils.close(f_2)
            raise Error('Error getting mate for sequence. Cannot continue')

        write_record(mate, f_2)

    utils.close(f_1)
    utils.close(f_2)
Exemple #50
0
def to_fastg(infile, outfile, circular=None):
    '''Writes infile as a FASTG file in SPAdes format. The only supported feature is marking sequences as circular: pass circular=<set of ids>, or circular=<name of a file with one id per line>. Every sequence is written twice, forwards and reverse complemented, and gets cov=1 in its name'''
    if circular is None:
        to_circularise = set()
    elif type(circular) is set:
        to_circularise = circular
    else:
        # anything that is not a set is treated as a filename of ids
        ids_handle = utils.open_file_read(circular)
        to_circularise = {line.rstrip() for line in ids_handle.readlines()}
        utils.close(ids_handle)

    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for node_number, record in enumerate(seq_reader, start=1):
        node_name = '_'.join([
            'NODE', str(node_number),
            'length', str(len(record)),
            'cov', '1',
            'ID', record.id
        ])

        # a circular sequence is written as a node linked to itself
        if record.id in to_circularise:
            forward_name = node_name + ':' + node_name + ';'
            reverse_name = node_name + "':" + node_name + "';"
        else:
            forward_name = node_name + ';'
            reverse_name = node_name + "';"

        record.id = forward_name
        print(record, file=fout)
        record.revcomp()
        record.id = reverse_name
        print(record, file=fout)

    utils.close(fout)
Exemple #51
0
def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
    '''Splits a fasta/q file into separate files, file size determined by number of bases.

    Puts <= max_bases in each split file. The exception is a single sequence
    longer than max_bases, which is put in its own file. This does not split
    sequences. If max_seqs is given, each file also holds at most max_seqs
    sequences (but always at least one).

    Output files are named <outfiles_prefix>.1, <outfiles_prefix>.2, ...
    No files are written for an empty input.
    '''
    seq_reader = sequences.file_reader(infile)
    base_count = 0
    seq_count = 0
    file_count = 1
    fout = None
    if max_seqs is None:
        max_seqs = float('inf')

    for seq in seq_reader:
        # A file that already holds a sequence is full when adding this one
        # would break either limit. An empty file always takes the sequence,
        # which is how an oversized sequence ends up alone in its own file.
        current_file_full = seq_count > 0 and (
            base_count + len(seq) > max_bases or seq_count >= max_seqs)

        if fout is not None and current_file_full:
            utils.close(fout)
            fout = None

        # Track the handle itself rather than base_count == 0: a zero-length
        # sequence leaves base_count at 0 while a file is still open.
        if fout is None:
            fout = utils.open_file_write(outfiles_prefix + '.' +
                                         str(file_count))
            file_count += 1
            base_count = 0
            seq_count = 0

        print(seq, file=fout)
        base_count += len(seq)
        seq_count += 1

    if fout is not None:
        utils.close(fout)
Exemple #52
0
def filter(
    infile,
    outfile,
    minlength=0,
    maxlength=float('inf'),
    regex=None,
    ids_file=None,
    invert=False,
    mate_in=None,
    mate_out=None,
    both_mates_pass=True,
):
    '''Writes the sequences of infile that pass all of the given tests.

    A sequence passes when its length is between minlength and maxlength
    (inclusive), its name matches regex (if given) and its name is listed
    in ids_file (if given, one name per line). Names are compared after
    removing everything from the first whitespace onwards.

    invert: write the sequences that would otherwise be dropped
    mate_in, mate_out: optional second input file read in step with infile,
        and the file its kept sequences are written to. Both reads of a
        pair are always kept or dropped together.
    both_mates_pass: if True a pair is wanted only when both reads pass,
        if False one passing read is enough

    Raises Error if mate_in is given without mate_out, or if a mate cannot
    be read for a sequence of infile. Output files are closed in all cases.'''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error(
                'Error in filter! mate_in provided. Must also provide mate_out'
            )

        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    r = re.compile(regex) if regex is not None else None
    name_regex = re.compile(r'^([^\s]+).*?$')

    def passes(seq, name_regex):
        # remove trailing comments from FASTQ readname lines
        matches = name_regex.match(seq.id)
        if matches is not None:
            clean_seq_id = matches.group(1)
        else:
            clean_seq_id = seq.id

        return minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(clean_seq_id) is not None) \
              and (ids_file is None or clean_seq_id in ids_from_file)

    try:
        for seq in seq_reader:
            seq_passes = passes(seq, name_regex)
            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except Exception as err:
                    # Covers a mate file that ran out (StopIteration) as well
                    # as one that could not be parsed. The cause is chained
                    # so it is not lost, and KeyboardInterrupt/SystemExit
                    # are no longer swallowed.
                    raise Error('Error getting mate for sequence', seq.id,
                                ' ... cannot continue') from err

                mate_passes = passes(seq_mate, name_regex)
                want_the_pair = (seq_passes and mate_passes) \
                                or (( seq_passes or mate_passes) and not both_mates_pass)
                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        utils.close(f_out)
        if mate_in:
            utils.close(f_out_mate)
Exemple #53
0
def split_by_fixed_size(infile,
                        outfiles_prefix,
                        chunk_size,
                        tolerance,
                        skip_if_all_Ns=False):
    '''Splits  fasta/q file into separate files, with up to (chunk_size + tolerance) bases in each file.

    Output files are named <outfiles_prefix>.1, <outfiles_prefix>.2, ...
    - A sequence of chunk_size to (chunk_size + tolerance) bases gets a file
      of its own.
    - A longer sequence is cut into chunks of chunk_size bases, one file per
      chunk, named <id>:<start>-<end> (1-based). A final chunk of at most
      tolerance bases is merged into the chunk before it. Each written chunk
      is also listed in <outfiles_prefix>.coords as chunk id, original id and
      0-based offset, tab separated.
    - Sequences shorter than chunk_size are collected and written last,
      packed into files of up to (chunk_size + tolerance) bases.

    With skip_if_all_Ns=True, sequences and chunks for which is_all_Ns() is
    true are not written.'''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    try:
        for seq in seq_reader:
            if skip_if_all_Ns and seq.is_all_Ns():
                continue
            if len(seq) < chunk_size:
                small_sequences.append(copy.copy(seq))
            elif len(seq) <= chunk_size + tolerance:
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                print(seq, file=f)
                utils.close(f)
                file_count += 1
            else:
                # make list of (start, end) chunk coords, end exclusive.
                # Clamping every end to len(seq) keeps the last chunk inside
                # the sequence, so its id and length are always correct.
                chunks = [(x, min(x + chunk_size, len(seq)))
                          for x in range(0, len(seq), chunk_size)]
                if len(chunks) > 1 and (chunks[-1][1] -
                                        chunks[-1][0]) <= tolerance:
                    chunks[-2] = (chunks[-2][0], chunks[-1][1])
                    chunks.pop()

                # write one output file per chunk
                offset = 0
                for chunk in chunks:
                    if not (skip_if_all_Ns
                            and seq.is_all_Ns(start=chunk[0], end=chunk[1] - 1)):
                        f = utils.open_file_write(outfiles_prefix + '.' +
                                                  str(file_count))
                        chunk_id = seq.id + ':' + str(chunk[0] + 1) + '-' + str(
                            chunk[1])
                        print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]),
                              file=f)
                        print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
                        utils.close(f)
                        file_count += 1

                    offset += chunk[1] - chunk[0]
    finally:
        # the coords file is only written in the loop above
        utils.close(f_coords)

    # write files of small sequences
    if len(small_sequences):
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0
        for seq in small_sequences:
            if base_count > 0 and base_count + len(
                    seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' +
                                          str(file_count))
                file_count += 1
                base_count = 0

            print(seq, file=f)
            base_count += len(seq)

        utils.close(f)
Exemple #54
0
 def test_file_reader_bad_format(self):
     '''Reading a file that is neither fasta nor fastq must raise sequences.Error'''
     bad_file = os.path.join(data_dir, 'sequences_test_not_a_fastaq_file')
     with self.assertRaises(sequences.Error):
         # the reader has to be consumed for the error to surface
         for _ in sequences.file_reader(bad_file):
             pass
Exemple #55
0
 def test_file_reader_fastq(self):
     '''Every record read from the good fastq test file should equal the expected Fastq'''
     expected = sequences.Fastq('ID', 'ACGTA', 'IIIII')
     fastq_file = os.path.join(data_dir, 'sequences_test_good_file.fq')
     for record in sequences.file_reader(fastq_file):
         self.assertEqual(record, expected)
Exemple #56
0
def file_to_dict(infile, d):
    '''Stores a copy of every sequence in infile in the dictionary d, keyed
    by sequence id. A later sequence replaces an earlier one with the same
    id. Modifies d in place and returns nothing.'''
    for record in sequences.file_reader(infile):
        # store a copy, presumably because the reader reuses its record object
        d[record.id] = copy.copy(record)
Exemple #57
0
def get_ids(infile, outfile):
    '''Writes the id of every sequence in infile to outfile, one per line,
    in file order.'''
    seq_reader = sequences.file_reader(infile)
    ids_out = utils.open_file_write(outfile)

    for record in seq_reader:
        print(record.id, file=ids_out)

    utils.close(ids_out)
	def _get_max_length(self, filename):
		'''Return the length of the longest sequence in filename, plus 20'''
		promer_allowance = 20  # extra allowance when running promer
		longest = max(len(contig) for contig in sequences.file_reader(filename))
		return longest + promer_allowance
def run(description):
    '''Command line entry point for "fastaq to_perfect_reads".

    Parses the command line, then simulates error-free read pairs from
    every sequence of the input file and writes them, interleaved, to the
    output file as FASTQ records whose quality string is all "I".

    For each reference sequence the number of pairs is
    int(0.5 * coverage * len(ref) / readlength). Insert sizes are drawn from
    a normal distribution and fragment positions from a uniform one, using
    the random module (seeded with --seed). The second read of each pair has
    revcomp() called on it before it is written. Reads are named
    <ref id>:<pair number>:<read 1 start>:<read 2 start> followed by /1 or
    /2, with 1-based starts.

    The description argument is not used: the parser description is hard
    coded below.'''
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    # numbers the read pairs across all reference sequences, starting at 1
    pair_counter = 1

    if options.fragments:
        fout_frags = utils.open_file_write(options.fragments)

    for ref in seq_reader:
        # check if current seq is long enough
        if len(ref) < options.mean_insert + 4 * options.insert_std:
            print('Warning, sequence ', ref.id, ' too short.  Skipping it...', file=sys.stderr)
            continue

        # work out how many reads to simulate
        read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

        # it's possible that we pick the same fragment twice, in which case the
        # reads would get the same name. So remember the frag coords
        used_fragments = {}  # (middle_position, length) => count

        # do the simulation:  pick insert size from normal distribution, and
        # position in genome from uniform distribution
        x = 0
        while x < read_pairs:
            isize = int(random.normalvariate(options.mean_insert, options.insert_std))
            # redraw until the fragment fits in the reference and can hold a read
            while isize > len(ref) or isize < options.readlength:
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
            middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize))
            # 0-based start of each read: read 1 opens the fragment, read 2 closes it
            read_start1 = int(middle_pos - ceil(0.5 * isize))
            read_start2 = read_start1 + isize - options.readlength

            readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])

            fragment = (middle_pos, isize)
            if fragment in used_fragments:
                used_fragments[fragment] += 1
                readname += '.dup.' + str(used_fragments[fragment])
            else:
                used_fragments[fragment] = 1

            read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
            read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)


            # x is not incremented here, so a rejected pair is simply redrawn
            if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
                continue

            read2.revcomp()

            print(read1, file=fout)
            print(read2, file=fout)

            if options.fragments:
                # the fragment runs from the start of read 1 to the end of read 2
                frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                print(frag, file=fout_frags)

            pair_counter += 1
            x += 1

    utils.close(fout)
    if options.fragments:
        utils.close(fout_frags)
Exemple #60
0
def run(description):
    '''Command line entry point for "fastaq to_tiling_bam".

    Parses the command line, then writes a BAM file of error-free unpaired
    reads that tile every sequence of the input file. Reads are read_length
    long and start every read_step bases; the last read of a sequence is
    moved back so that it ends on the final base. A sequence no longer than
    read_length gives a single read covering all of it.

    The records are written as text to a "samtools view" process, which
    produces the BAM file, so samtools must be in the path.

    The description argument is not used: the parser description is hard
    coded below.'''
    parser = argparse.ArgumentParser(
        description=
        'Takes a sequence file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
        usage=
        'fastaq to_tiling_bam [options] <infile> <read_length> <read_step> <read_prefix> <outfile>',
        epilog='Important: assumes that samtools is in your path')
    parser.add_argument('infile', help='Name of input fasta/q file')
    parser.add_argument('read_length', type=int, help='Length of reads')
    parser.add_argument('read_step',
                        type=int,
                        help='Distance between start of each read')
    parser.add_argument('read_prefix', help='Prefix of read names')
    parser.add_argument('outfile', help='Name of output BAM file')
    parser.add_argument(
        '--read_group',
        help='Add the given read group ID to all reads [%(default)s]',
        default='42')
    options = parser.parse_args()

    # make a header first  - we need to add the @RG line to the default header made by samtools
    tmp_empty_file = options.outfile + '.tmp.empty'
    f = utils.open_file_write(tmp_empty_file)
    utils.close(f)
    # NOTE(review): the samtools commands here and below are built by string
    # concatenation and run through os.popen, so file names containing
    # spaces or shell metacharacters will not be handled safely.
    try:
        f = os.popen('samtools view -H -T ' + options.infile + ' ' +
                     tmp_empty_file)
    except IOError:
        print('Error making tmp header file', file=sys.stderr)
        sys.exit(1)

    header_lines = f.readlines()
    # no trailing newline needed: print() below adds one after the joined header
    header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')
    f.close()
    os.unlink(tmp_empty_file)

    seq_reader = sequences.file_reader(options.infile)
    # from here on f is the stdin of the samtools process that writes the BAM
    try:
        f = os.popen('samtools view -hbS - > ' + options.outfile, 'w')
    except IOError:
        print("Error opening for writing BAM file '" + options.outfile + "'",
              file=sys.stderr)
        sys.exit(1)

    print(''.join(header_lines), file=f)

    for seq in seq_reader:
        end_range = len(seq)
        # a sequence shorter than one read still gets exactly one loop pass
        if len(seq) < options.read_length:
            end_range = 1
        for i in range(0, end_range, options.read_step):
            # start and end are 0-based, end inclusive
            if len(seq) <= options.read_length:
                start = 0
                end = len(seq) - 1
            else:
                start = i
                end = start + options.read_length - 1

                # read would run off the end: shift it back to finish on the last base
                if end > len(seq) - 1:
                    end = len(seq) - 1
                    start = end - options.read_length + 1

            read = sequences.Fastq(
                options.read_prefix + ':' + seq.id + ':' + str(start + 1) +
                ':' + str(end + 1), seq[start:end + 1],
                'I' * (end - start + 1))

            # one tab-separated alignment line per read, 1-based position
            print('\t'.join([
                read.id, '0', seq.id,
                str(start + 1), '60',
                str(len(read)) + 'M', '*', '*', '*', read.seq, read.qual,
                'RG:Z:' + options.read_group
            ]),
                  file=f)

            # the sequence is fully covered once a read reaches its last base
            if end == len(seq) - 1:
                break

    f.close()