Beispiel #1
0
def get_assembly_stats(options):
    f = utils.open_file_read(options.infile)
    csv_headers = []
    stats = {}
    float_headers = set([
        'Avg Contig Length',
        'Average Quality',
        'Insert Size Average',
        'Insert Size Std Dev'
    ])

    for line in f:
        if len(csv_headers) == 0:
            csv_headers = line.rstrip().split('\t')[2:]
            stats = {k:[] for k in csv_headers}
        else:
            data = line.rstrip().split('\t')[2:]
            assert len(data) == len(csv_headers) == len(stats)
            for i in range(len(data)):
                if csv_headers[i] in float_headers:
                    stats[csv_headers[i]].append(float(data[i]))
                else:
                    stats[csv_headers[i]].append(int(data[i]))

    utils.close(f)
    return stats
Beispiel #2
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
        bad_files = [
            'sequences_test_fail_no_AT.fq', 'sequences_test_fail_no_seq.fq',
            'sequences_test_fail_no_plus.fq', 'sequences_test_fail_no_qual.fq'
        ]

        bad_files = [os.path.join(data_dir, x) for x in bad_files]

        for fname in bad_files:
            f_in = utils.open_file_read(fname)
            fq = sequences.Fastq()
            with self.assertRaises(sequences.Error):
                while fq.get_next_from_file(f_in):
                    pass

            utils.close(f_in)

        fname = os.path.join(data_dir, 'sequences_test_good_file.fq')
        try:
            f_in = open(fname)
        except IOError:
            print("Error opening '" + fname + "'", file=sys.stderr)
            sys.exit(1)

        fq = sequences.Fastq()
        while fq.get_next_from_file(f_in):
            self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
        utils.close(f_in)
def nucmer_file_reader(fname):
    f = utils.open_file_read(fname)
    in_header = True

    for line in f:
        if in_header:
            if line.startswith('['):
                in_header = False
            continue
        yield NucmerHit(line)

    utils.close(f)
Beispiel #4
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, including weirdness in file'''
        f_in = utils.open_file_read(os.path.join(data_dir,
                                                 'sequences_test.fa'))
        fa = sequences.Fasta()
        counter = 1

        while fa.get_next_from_file(f_in):
            self.assertEqual(fa, sequences.Fasta(str(counter), 'ACGTA'))
            counter += 1

        utils.close(f_in)
Beispiel #5
0
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
        for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
            f = utils.open_file_write(filename)
            for i in range(3):
                print(i, file=f)
            utils.close(f)

            counter = 0

            f = utils.open_file_read(filename)
            for line in f:
                self.assertEqual(counter, int(line.strip()))
                counter += 1
            utils.close(f)

            os.unlink(filename)

        f = utils.open_file_read('-')
        self.assertEqual(sys.stdin, f)
        f = utils.open_file_write('-')
        self.assertEqual(sys.stdout, f)
Beispiel #6
0
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
        for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
            f = utils.open_file_write(filename)
            for i in range(3):
                print(i, file=f)
            utils.close(f)

            counter = 0

            f = utils.open_file_read(filename)
            for line in f:
                self.assertEqual(counter, int(line.strip()))
                counter += 1
            utils.close(f)

            os.unlink(filename)

        f = utils.open_file_read('-')
        self.assertEqual(sys.stdin, f)
        f = utils.open_file_write('-')
        self.assertEqual(sys.stdout, f)
Beispiel #7
0
    def test_get_next_from_embl_file(self):
        f_in = utils.open_file_read(
            os.path.join(data_dir, 'sequences_test.embl'))
        embl = sequences.Embl()
        counter = 1

        while embl.get_next_from_file(f_in):
            self.assertEqual(
                embl,
                sequences.Fasta('seq' + str(counter),
                                expected_embl[counter - 1]))
            counter += 1

        utils.close(f_in)
Beispiel #8
0
    def test_get_next_from_gbk_file(self):
        f_in = utils.open_file_read(
            os.path.join(data_dir, 'sequences_test.gbk'))
        embl = sequences.Embl()
        counter = 1
        expected = [
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa'
        ]

        while embl.get_next_from_file(f_in):
            self.assertEqual(
                embl,
                sequences.Fasta('NAME' + str(counter), expected[counter - 1]))
            counter += 1

        utils.close(f_in)
Beispiel #9
0
    def test_raise_exception(self):
        '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
        with self.assertRaises(utils.Error):
            utils.open_file_read('this_file_is_not_here_so_throw_error')
        with self.assertRaises(utils.Error):
            utils.open_file_read('this_file_is_not_here_so_throw_error.gz')
        with self.assertRaises(utils.Error):
            utils.open_file_read(os.path.join(data_dir, 'utils_test_not_really_zipped.gz'))

        with self.assertRaises(utils.Error):
            utils.open_file_write(os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'))
        with self.assertRaises(utils.Error):
            utils.open_file_write(os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'))
Beispiel #10
0
def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids_file=None, invert=False):
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)

    for seq in seq_reader:
        hit = minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(seq.id) is not None) \
              and (ids_file is None or seq.id in ids_from_file)

        if hit != invert:
            print(seq, file=f_out)
    utils.close(f_out)
Beispiel #11
0
    def test_raise_exception(self):
        '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
        with self.assertRaises(utils.Error):
            utils.open_file_read('this_file_is_not_here_so_throw_error')
        with self.assertRaises(utils.Error):
            utils.open_file_read('this_file_is_not_here_so_throw_error.gz')
        with self.assertRaises(utils.Error):
            utils.open_file_read(
                os.path.join(data_dir, 'utils_test_not_really_zipped.gz'))

        with self.assertRaises(utils.Error):
            utils.open_file_write(
                os.path.join('not_a_directory',
                             'this_file_is_not_here_so_throw_error'))
        with self.assertRaises(utils.Error):
            utils.open_file_write(
                os.path.join('not_a_directory',
                             'this_file_is_not_here_so_throw_error.gz'))
Beispiel #12
0
def lengths_from_fai(fai_file, d):
    f = utils.open_file_read(fai_file)
    for line in f:
        (id, length) = line.rstrip().split()[:2]
        d[id] = int(length)
    utils.close(f)
Beispiel #13
0
def file_reader(fname, read_quals=False):
    '''Iterates over a FASTA or FASTQ file, yielding the next sequence in the file until there are no more sequences'''
    f = utils.open_file_read(fname)
    line = f.readline()
    phylip_regex = re.compile('^\s*[0-9]+\s+[0-9]+$')
    gbk_regex = re.compile('^LOCUS\s+\S')

    if line.startswith('>'):
        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('##gff-version 3'):
        seq = Fasta()
        # if a GFF file, need to skip past all the annotation
        # and get to the fasta sequences at the end of the file
        while not line.startswith('>'):
            line = f.readline()
            if not line:
                utils.close(f)
                raise Error('No sequences found in GFF file "' + fname + '"')

        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('ID   ') and line[5] != ' ':
        seq = Embl()
        previous_lines[f] = line
    elif gbk_regex.search(line):
        seq = Embl()
        previous_lines[f] = line
    elif line.startswith('@'):
        seq = Fastq()
        previous_lines[f] = line
    elif phylip_regex.search(line):
        # phylip format could be interleaved or not, need to look at next
        # couple of lines to figure that out. Don't expect these files to
        # be too huge, so just store all the sequences in memory
        number_of_seqs, bases_per_seq = line.strip().split()
        number_of_seqs = int(number_of_seqs)
        bases_per_seq = int(bases_per_seq)
        got_blank_line = False

        first_line = line
        seq_lines = []
        while 1:
            line = f.readline()
            if line == '':
                break
            elif line == '\n':
                got_blank_line = True
            else:
                seq_lines.append(line.rstrip())
        utils.close(f)

        if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs:
            sequential = True
        elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ':
            sequential = True
        else:
            sequential = False

        # if the 11th char of second sequence line is a space,  then the file is sequential, e.g.:
        # GAGCCCGGGC AATACAGGGT AT
        # as opposed to:
        # Salmo gairAAGCCTTGGC AGTGCAGGGT
        if sequential:
            current_id = None
            current_seq = ''
            for line in seq_lines:
                if len(current_seq) == bases_per_seq or len(current_seq) == 0:
                    if current_id is not None:
                        yield Fasta(current_id, current_seq.replace('-', ''))
                    current_seq = ''
                    current_id, new_bases = line[0:10].rstrip(), line.rstrip(
                    )[10:]
                else:
                    new_bases = line.rstrip()

                current_seq += new_bases.replace(' ', '')

            yield Fasta(current_id, current_seq.replace('-', ''))
        else:
            # seaview files start all seqs at pos >=12. Other files start
            # their sequence at the start of the line
            if seq_lines[number_of_seqs + 1][0] == ' ':
                first_gap_pos = seq_lines[0].find(' ')
                end_of_gap = first_gap_pos
                while seq_lines[0][end_of_gap] == ' ':
                    end_of_gap += 1
                first_seq_base = end_of_gap
            else:
                first_seq_base = 10

            seqs = []
            for i in range(number_of_seqs):
                name, bases = seq_lines[i][0:first_seq_base].rstrip(
                ), seq_lines[i][first_seq_base:]
                seqs.append(Fasta(name, bases))

            for i in range(number_of_seqs, len(seq_lines)):
                seqs[i % number_of_seqs].seq += seq_lines[i]

            for fa in seqs:
                fa.seq = fa.seq.replace(' ', '').replace('-', '')
                yield fa

        return
    elif line == '':
        utils.close(f)
        return
    else:
        utils.close(f)
        raise Error('Error determining file type from file "' + fname +
                    '". First line is:\n' + line.rstrip())

    try:
        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)