Esempio n. 1
0
def test_check_is_pair_4b():
    """Expect ValueError when pairing a FakeFastaRead with a FakeFQRead."""
    fasta_read = FakeFastaRead(name="seq/1", sequence="AAA")
    fastq_read = FakeFQRead(name="seq/2", quality="###", sequence="AAA")

    try:
        check_is_pair(fasta_read, fastq_read)
    except ValueError:
        return  # the expected rejection happened
    assert False  # check_is_pair should fail here.
Esempio n. 2
0
def test_check_is_pair_4():
    """Expect ValueError when pairing a FakeFQRead with a FakeFastaRead."""
    with_quality = FakeFQRead(name='seq/1', quality='###', sequence='AAA')
    without_quality = FakeFastaRead(name='seq/2', sequence='AAA')

    try:
        check_is_pair(with_quality, without_quality)
    except ValueError:
        pass
    else:
        assert False                    # check_is_pair should fail here.
Esempio n. 3
0
def test_check_is_pair_4b():
    """Expect ValueError when only the second record carries a quality."""
    no_quality = screed.Record(name='seq/1', sequence='AAA')
    has_quality = screed.Record(name='seq/2', quality='###', sequence='AAA')

    try:
        check_is_pair(no_quality, has_quality)
    except ValueError:
        return  # the mismatch was rejected, as the test requires
    assert False                        # check_is_pair should fail here.
Esempio n. 4
0
def test_check_is_pair_4():
    """Expect ValueError when only the first record carries a quality."""
    has_quality = screed.Record(name='seq/1', quality='###', sequence='AAA')
    no_quality = screed.Record(name='seq/2', sequence='AAA')

    try:
        check_is_pair(has_quality, no_quality)
    except ValueError:
        pass
    else:
        assert False                    # check_is_pair should fail here.
Esempio n. 5
0
def test_check_is_pair_4b():
    """A FakeFastaRead followed by a FakeFQRead must raise ValueError."""
    first = FakeFastaRead(name='seq/1', sequence='AAA')
    second = FakeFQRead(name='seq/2', quality='###', sequence='AAA')

    try:
        check_is_pair(first, second)
    except ValueError:
        pass
    else:
        # No exception means the mixed records were accepted.
        assert False                    # check_is_pair should fail here.
Esempio n. 6
0
    def push_sequences(self, inputiter):
        """Queue the records of ``inputiter`` as ``SequenceGroup`` objects.

        Records are collected in a pending list.  Once the running count
        reaches ``self.group_size`` the pending list is put on
        ``self.inqueue``; if the incoming record pairs with the previous
        one (per ``check_is_pair``) it is added to that group first so the
        pair is not split, otherwise it starts the next group.  Whatever is
        still pending when the input ends is queued as a final group.
        """
        pending = []
        previous = None
        count = 0
        for current in inputiter:
            if count < self.group_size:
                pending.append(current)
            else:
                # keep pairs together in batches, to retain the interleaving.
                if check_is_pair(previous, current):
                    pending.append(current)
                    carry_over = []
                else:
                    carry_over = [current]

                self.inqueue.put(SequenceGroup(0, pending))
                pending = carry_over
                count = 0

            previous = current
            count += 1

        # submit last set of sequences
        if pending:
            self.inqueue.put(SequenceGroup(0, pending))
Esempio n. 7
0
    def push_sequences(self, inputiter):
        """Split ``inputiter`` into groups and put them on ``self.inqueue``.

        A group is emitted as ``SequenceGroup(0, records)`` whenever the
        per-group counter has reached ``self.group_size``.  At that point
        the new record joins the outgoing group when ``check_is_pair`` says
        it pairs with the preceding record; otherwise it becomes the first
        record of the following group.  Leftover records are emitted after
        the input is exhausted.
        """
        chunk = []
        prev_record = None
        n_in_chunk = 0
        for rec in inputiter:
            chunk_full = n_in_chunk >= self.group_size
            if not chunk_full:
                chunk.append(rec)
            elif check_is_pair(prev_record, rec):
                # keep pairs together in batches, to retain the interleaving.
                chunk.append(rec)
                outgoing = SequenceGroup(0, chunk)
                self.inqueue.put(outgoing)
                chunk = []
                n_in_chunk = 0
            else:
                outgoing = SequenceGroup(0, chunk)
                self.inqueue.put(outgoing)
                chunk = [rec]
                n_in_chunk = 0

            prev_record = rec
            n_in_chunk += 1

        # submit last set of sequences
        if chunk:
            outgoing = SequenceGroup(0, chunk)
            self.inqueue.put(outgoing)
Esempio n. 8
0
def main():
    """Interleave the left and right read files into one output stream.

    Parses the command line, runs the input-file and space checks on
    ``args.left`` / ``args.right``, then iterates over both files in
    lockstep and writes every pair with ``write_record_pair``.

    Unless ``args.no_reformat`` is set, a ``/1`` or ``/2`` suffix is
    appended to names that ``check_is_left`` / ``check_is_right`` do not
    accept, and the renamed pair is validated with ``check_is_pair``.

    Exits with status 1 when the two files hold different numbers of
    records or when a pair fails validation.  Progress and error messages
    are printed to stderr.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    # zip_longest pads the shorter input with None, which is how a record
    # count mismatch between the two files is detected.
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        # Progress report every 100000 pairs (including the very first).
        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            # Validation only happens in reformat mode; with no_reformat
            # the records are written exactly as read.
            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
Esempio n. 9
0
def main():
    """Merge two read files into a single interleaved output.

    After parsing arguments and running the input/space checks, the left
    and right files are read side by side.  In reformat mode (the default,
    disabled by ``args.no_reformat``) names are given a ``/1`` / ``/2``
    suffix where ``check_is_left`` / ``check_is_right`` reject them, and
    each pair must satisfy ``check_is_pair``.  The script exits with
    status 1 on a record-count mismatch or a failed pair check; all
    messages go to stderr.
    """
    info('interleave-reads.py')
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    left_path, right_path = args.left, args.right

    print("Interleaving:\n\t%s\n\t%s" % (left_path, right_path),
          file=sys.stderr)

    writer = get_file_writer(args.output, args.gzip, args.bzip)

    pairs_seen = 0
    left_reads = screed.open(left_path)
    right_reads = screed.open(right_path)
    for read1, read2 in zip_longest(left_reads, right_reads):
        # A None on either side means one file ran out before the other.
        if read1 is None or read2 is None:
            print("ERROR: Input files contain different number of records.",
                  file=sys.stderr)
            sys.exit(1)

        if pairs_seen % 100000 == 0:
            print('...', pairs_seen, 'pairs', file=sys.stderr)
        pairs_seen += 1

        left_name = read1.name
        right_name = read2.name

        reformat = not args.no_reformat
        if reformat:
            if not check_is_left(left_name):
                left_name = left_name + '/1'
            if not check_is_right(right_name):
                right_name = right_name + '/2'

            read1.name = left_name
            read2.name = right_name

            if not check_is_pair(read1, read2):
                message = ("ERROR: This doesn't look like paired data! %s %s"
                           % (read1.name, read2.name))
                print(message, file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, writer)

    print('final: interleaved %d pairs' % pairs_seen, file=sys.stderr)
    print('output written to', describe_file_handle(writer), file=sys.stderr)
Esempio n. 10
0
def WithDiagnostics(ifile, batch_size, fp, paired, norm):
    """
    Generator to do boilerplate output of statistics while normalizing
    data. Also checks for properly paired data.

    Yields each batch produced by ``batchwise`` over the records of
    ``ifile``.  ``norm.total`` is increased by ``batch_size`` per batch;
    ``norm.discarded`` is only read here.  Every 100000 batches a progress
    line is printed to stderr and, when ``fp`` is truthy, a line with the
    total, kept count and kept fraction is written to ``fp``.

    Raises IOError when ``paired`` is set and the first two records of a
    batch are not accepted by ``check_is_pair``.
    """
    reader = screed.open(ifile, parse_description=False)

    for index, batch in enumerate(batchwise(reader, batch_size)):

        norm.total += batch_size
        total = norm.total
        discarded = norm.discarded

        if index > 0 and index % 100000 == 0:
            print('... kept {kept} of {total} or {perc:2}%'
                  .format(kept=total - discarded,
                          total=total,
                          perc=int(100. - discarded / float(total) * 100.)),
                  file=sys.stderr)

            print('... in file ' + ifile, file=sys.stderr)

            if fp:
                # Let print() format the numbers; concatenating them with
                # str would raise TypeError.
                print(total, total - discarded,
                      1. - (discarded / float(total)), file=fp)
                fp.flush()

        # If in paired mode, check that the reads are properly interleaved
        # NOTE(review): assumes every batch holds at least two records when
        # paired is set -- confirm against batchwise.
        if paired:
            if not check_is_pair(batch[0], batch[1]):
                raise IOError('Error: Improperly interleaved pairs \
                    {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name))

        yield batch
Esempio n. 11
0
def test_check_is_pair_3_fa():
    """FakeFastaRead records named 'seq 1::' and 'seq 2::' must pair."""
    left = FakeFastaRead(name="seq 1::", sequence="AAA")
    right = FakeFastaRead(name="seq 2::", sequence="AAA")

    paired = check_is_pair(left, right)
    assert paired
Esempio n. 12
0
def test_check_is_pair_3_broken_fq_2():
    """'seq 1::' followed by a bare 'seq' name must not count as a pair."""
    tagged = screed.Record(name='seq 1::', quality='###', sequence='AAA')
    untagged = screed.Record(name='seq', quality='###', sequence='AAA')

    outcome = check_is_pair(tagged, untagged)
    assert not outcome
Esempio n. 13
0
def test_check_is_pair_3_fa():
    """Quality-less records named 'seq 1::' and 'seq 2::' must pair."""
    first = screed.Record(name='seq 1::', sequence='AAA')
    second = screed.Record(name='seq 2::', sequence='AAA')

    is_pair = check_is_pair(first, second)
    assert is_pair
Esempio n. 14
0
def test_check_is_pair_7():
    """Reads given in the order 'seq/2', 'seq/1' must not count as a pair."""
    second_mate = FakeFastaRead(name="seq/2", sequence="AAA")
    first_mate = FakeFastaRead(name="seq/1", sequence="AAA")

    outcome = check_is_pair(second_mate, first_mate)
    assert not outcome
Esempio n. 15
0
def test_check_is_pair_2():
    """Records with quality named 'seq/1' and 'seq/2' must pair."""
    left = screed.Record(name='seq/1', quality='###', sequence='AAA')
    right = screed.Record(name='seq/2', quality='###', sequence='AAA')

    paired = check_is_pair(left, right)
    assert paired
Esempio n. 16
0
def test_check_is_pair_2():
    """FakeFQRead records named 'seq/1' and 'seq/2' must pair."""
    mate_one = FakeFQRead(name='seq/1', quality='###', sequence='AAA')
    mate_two = FakeFQRead(name='seq/2', quality='###', sequence='AAA')

    result = check_is_pair(mate_one, mate_two)
    assert result
Esempio n. 17
0
def test_check_is_pair_3_broken_fq_1():
    """A bare 'seq' name followed by 'seq 2::' must not count as a pair."""
    untagged = screed.Record(name='seq', quality='###', sequence='AAA')
    tagged = screed.Record(name='seq 2::', quality='###', sequence='AAA')

    outcome = check_is_pair(untagged, tagged)
    assert not outcome
Esempio n. 18
0
def test_check_is_pair_3_broken_fq_2():
    """FakeFQRead 'seq 1::' followed by plain 'seq' must not pair."""
    with_tag = FakeFQRead(name="seq 1::", quality="###", sequence="AAA")
    without_tag = FakeFQRead(name="seq", quality="###", sequence="AAA")

    result = check_is_pair(with_tag, without_tag)
    assert not result
Esempio n. 19
0
def test_check_is_pair_2():
    """check_is_pair accepts FakeFQRead 'seq/1' together with 'seq/2'."""
    forward = FakeFQRead(name='seq/1', quality='###', sequence='AAA')
    reverse = FakeFQRead(name='seq/2', quality='###', sequence='AAA')

    accepted = check_is_pair(forward, reverse)
    assert accepted
Esempio n. 20
0
def test_check_is_pair_7():
    """check_is_pair rejects FakeFastaRead 'seq/2' followed by 'seq/1'."""
    leading = FakeFastaRead(name='seq/2', sequence='AAA')
    trailing = FakeFastaRead(name='seq/1', sequence='AAA')

    accepted = check_is_pair(leading, trailing)
    assert not accepted
Esempio n. 21
0
def test_check_is_pair_7():
    """Swapped '/2' then '/1' FakeFastaRead names must not be a pair."""
    read_b = FakeFastaRead(name='seq/2', sequence='AAA')
    read_a = FakeFastaRead(name='seq/1', sequence='AAA')

    is_pair = check_is_pair(read_b, read_a)
    assert not is_pair
Esempio n. 22
0
def test_check_is_pair_2():
    """check_is_pair accepts quality-bearing 'seq/1' and 'seq/2' records."""
    forward = screed.Record(name='seq/1', quality='###', sequence='AAA')
    reverse = screed.Record(name='seq/2', quality='###', sequence='AAA')

    accepted = check_is_pair(forward, reverse)
    assert accepted
Esempio n. 23
0
def test_check_is_pair_3_fa():
    """check_is_pair accepts FakeFastaRead 'seq 1::' with 'seq 2::'."""
    mate_one = FakeFastaRead(name='seq 1::', sequence='AAA')
    mate_two = FakeFastaRead(name='seq 2::', sequence='AAA')

    accepted = check_is_pair(mate_one, mate_two)
    assert accepted
Esempio n. 24
0
def test_check_is_pair_3_broken_fq_1():
    """FakeFQRead plain 'seq' followed by 'seq 2::' must not pair."""
    without_tag = FakeFQRead(name='seq', quality='###', sequence='AAA')
    with_tag = FakeFQRead(name='seq 2::', quality='###', sequence='AAA')

    result = check_is_pair(without_tag, with_tag)
    assert not result
Esempio n. 25
0
def test_check_is_pair_7():
    """Records ordered 'seq/2' then 'seq/1' must not count as a pair."""
    second_mate = screed.Record(name='seq/2', sequence='AAA')
    first_mate = screed.Record(name='seq/1', sequence='AAA')

    outcome = check_is_pair(second_mate, first_mate)
    assert not outcome
Esempio n. 26
0
def test_check_is_pair_3_fa():
    """'seq 1::' and 'seq 2::' FakeFastaRead names are treated as a pair."""
    read_a = FakeFastaRead(name='seq 1::', sequence='AAA')
    read_b = FakeFastaRead(name='seq 2::', sequence='AAA')

    is_pair = check_is_pair(read_a, read_b)
    assert is_pair
Esempio n. 27
0
def main():
    """Interleave an R1/R2 pair of read files into ``args.output``.

    With two input names they are used as given; with any other count the
    second name is derived from the first by replacing ``_R1_`` with
    ``_R2_`` (exiting with status 1 if that changes nothing).  Missing
    files abort the run unless ``args.force`` is set.  Each pair of
    records is renamed with ``/1`` / ``/2`` where needed, validated with
    ``check_is_pair`` and written with ``write_record_pair``.  Messages go
    to stderr.
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for infile in args.infiles:
        check_file_status(infile, args.force)

    check_space(args.infiles, args.force)

    left_path = args.infiles[0]
    if len(args.infiles) != 2:
        # Guess the partner file from the first file's name.
        right_path = left_path.replace('_R1_', '_R2_')
        if right_path == left_path:
            print >>sys.stderr, ("ERROR: given only one filename, that "
                                 "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % right_path)
    else:
        right_path = args.infiles[1]

    missing = False
    if not os.path.exists(left_path):
        print >> sys.stderr, "Error! R1 file %s does not exist" % left_path
        missing = True

    if not os.path.exists(right_path):
        print >> sys.stderr, "Error! R2 file %s does not exist" % right_path
        missing = True

    if missing and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (left_path, right_path)

    pairs_seen = 0
    left_reads = screed.open(left_path, parse_description=False)
    right_reads = screed.open(right_path, parse_description=False)
    for read1, read2 in itertools.izip_longest(left_reads, right_reads):
        # izip_longest pads with None once one of the files is exhausted.
        if read1 is None or read2 is None:
            print >>sys.stderr, ("ERROR: Input files contain different number"
                                 " of records.")
            sys.exit(1)

        if pairs_seen % 100000 == 0:
            print >> sys.stderr, '...', pairs_seen, 'pairs'
        pairs_seen += 1

        left_name = read1.name
        if not check_is_left(left_name):
            left_name = left_name + '/1'
        right_name = read2.name
        if not check_is_right(right_name):
            right_name = right_name + '/2'

        read1.name = left_name
        read2.name = right_name

        if not check_is_pair(read1, read2):
            print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
                "%s %s" % (read1.name, read2.name)
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % pairs_seen
    print >> sys.stderr, 'output written to', args.output.name
Esempio n. 28
0
def test_check_is_pair_7():
    """check_is_pair rejects records given as 'seq/2' then 'seq/1'."""
    leading = screed.Record(name='seq/2', sequence='AAA')
    trailing = screed.Record(name='seq/1', sequence='AAA')

    accepted = check_is_pair(leading, trailing)
    assert not accepted
Esempio n. 29
0
def test_check_is_pair_3_broken_fq_2():
    """check_is_pair rejects FakeFQRead 'seq 1::' followed by 'seq'."""
    tagged = FakeFQRead(name='seq 1::', quality='###', sequence='AAA')
    untagged = FakeFQRead(name='seq', quality='###', sequence='AAA')

    accepted = check_is_pair(tagged, untagged)
    assert not accepted
Esempio n. 30
0
def normalize_by_median(input_filename, outfp, htable, args, report_fp=None):
    """Write the reads of ``input_filename`` that pass the median filter.

    Records are read in batches of one, or of two when ``args.paired`` is
    set.  A batch is written to ``outfp`` when every record is at least
    ``htable.ksize()`` long and at least one record has a median count
    (from ``htable.get_median_count``) below ``args.cutoff``; such records
    are also consumed into ``htable``.  Other batches are counted as
    discarded.

    When ``report_fp`` is given, a line with the total, kept count and
    kept fraction is written to it every 100000 batches and once at the
    end; the final line is skipped if no records were read.

    Returns a ``(total, discarded)`` tuple of record counts.

    Raises IOError in paired mode when the two records of a batch are not
    accepted by ``check_is_pair``.
    """

    desired_coverage = args.cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    total = 0
    discarded = 0
    for index, batch in enumerate(
            batchwise(screed.open(input_filename, parse_description=False),
                      batch_size)):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '... kept {kept} of {total} or'\
                ' {perc:2}%'.format(kept=total - discarded, total=total,
                                    perc=int(100. - discarded /
                                             float(total) * 100.))
            print >> sys.stderr, '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        # NOTE(review): assumes batchwise always yields two records per
        # batch in paired mode -- confirm.

        if args.paired:
            if not check_is_pair(batch[0], batch[1]):
                raise IOError('Error: Improperly interleaved pairs \
                    {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are at least K long
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                write_record(record, outfp)
        else:
            discarded += batch_size

    # An empty input leaves total at 0; skip the summary line instead of
    # dividing by zero.
    if report_fp and total:
        print >> report_fp, total, total - discarded, \
            1. - (discarded / float(total))
        report_fp.flush()

    return total, discarded
Esempio n. 31
0
def main():
    """Combine two read files into one interleaved output.

    The first entry of ``args.infiles`` is the R1 file.  The R2 file is
    the second entry when exactly two are given; otherwise it is guessed
    by substituting ``_R2_`` for ``_R1_`` in the R1 name, and the script
    exits with status 1 if the name has no ``_R1_``.  A missing file is
    fatal unless ``args.force`` is set.  Records are read in lockstep,
    suffixed with ``/1`` / ``/2`` when ``check_is_left`` /
    ``check_is_right`` reject the name, checked with ``check_is_pair``
    and written to ``args.output``.
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for filename in args.infiles:
        check_file_status(filename, args.force)

    check_space(args.infiles, args.force)

    r1_path = args.infiles[0]
    two_files_given = len(args.infiles) == 2
    if two_files_given:
        r2_path = args.infiles[1]
    else:
        r2_path = r1_path.replace('_R1_', '_R2_')
        if r1_path == r2_path:
            # The substitution found nothing to replace.
            print >> sys.stderr, ("ERROR: given only one filename, that "
                                  "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % r2_path)

    any_missing = False
    if not os.path.exists(r1_path):
        print >> sys.stderr, "Error! R1 file %s does not exist" % r1_path
        any_missing = True

    if not os.path.exists(r2_path):
        print >> sys.stderr, "Error! R2 file %s does not exist" % r2_path
        any_missing = True

    if any_missing and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (r1_path, r2_path)

    n_pairs = 0
    r1_records = screed.open(r1_path, parse_description=False)
    r2_records = screed.open(r2_path, parse_description=False)
    for read1, read2 in itertools.izip_longest(r1_records, r2_records):
        if read1 is None or read2 is None:
            print >> sys.stderr, ("ERROR: Input files contain different number"
                                  " of records.")
            sys.exit(1)

        if n_pairs % 100000 == 0:
            print >> sys.stderr, '...', n_pairs, 'pairs'
        n_pairs += 1

        r1_name = read1.name
        if not check_is_left(r1_name):
            r1_name = r1_name + '/1'
        r2_name = read2.name
        if not check_is_right(r2_name):
            r2_name = r2_name + '/2'

        read1.name = r1_name
        read2.name = r2_name

        if not check_is_pair(read1, read2):
            print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
                "%s %s" % (read1.name, read2.name)
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % n_pairs
    print >> sys.stderr, 'output written to', args.output.name
Esempio n. 32
0
def test_check_is_pair_3_fa():
    """check_is_pair accepts records named 'seq 1::' and 'seq 2::'."""
    mate_one = screed.Record(name='seq 1::', sequence='AAA')
    mate_two = screed.Record(name='seq 2::', sequence='AAA')

    accepted = check_is_pair(mate_one, mate_two)
    assert accepted
Esempio n. 33
0
def normalize_by_median(input_filename, outfp, htable, paired, cutoff,
                        report_fp=None):
    """Write the reads of ``input_filename`` that pass the median filter.

    Records are read in batches of one, or of two when ``paired`` is set.
    A batch is written to ``outfp`` when every record is at least
    ``htable.ksize()`` long and at least one record has a median count
    (from ``htable.get_median_count``) below ``cutoff``; such records are
    also consumed into ``htable``.  Other batches are counted as
    discarded.

    When ``report_fp`` is given, a line with the total, kept count and
    kept fraction is written to it every 100000 batches and once at the
    end; the final line is skipped if no records were read.

    Returns a ``(total, discarded)`` tuple of record counts.

    Raises IOError in paired mode when the two records of a batch are not
    accepted by ``check_is_pair``.
    """

    desired_coverage = cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 1
    if paired:
        batch_size = 2

    total = 0
    discarded = 0
    for index, batch in enumerate(batchwise(screed.open(
            input_filename, parse_description=False), batch_size)):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '... kept {kept} of {total} or'\
                ' {perc:2}%'.format(kept=total - discarded, total=total,
                                    perc=int(100. - discarded /
                                             float(total) * 100.))
            print >>sys.stderr, '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        # NOTE(review): assumes batchwise always yields two records per
        # batch in paired mode -- confirm.

        if paired:
            if not check_is_pair(batch[0], batch[1]):
                raise IOError('Error: Improperly interleaved pairs \
                    {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are at least K long
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                write_record(record, outfp)
        else:
            discarded += batch_size

    # An empty input leaves total at 0; skip the summary line instead of
    # dividing by zero.
    if report_fp and total:
        print >> report_fp, total, total - discarded, \
            1. - (discarded / float(total))
        report_fp.flush()

    return total, discarded
Esempio n. 34
0
def test_check_is_pair_2():
    """'seq/1' and 'seq/2' FakeFQRead records are treated as a pair."""
    read_a = FakeFQRead(name="seq/1", quality="###", sequence="AAA")
    read_b = FakeFQRead(name="seq/2", quality="###", sequence="AAA")

    is_pair = check_is_pair(read_a, read_b)
    assert is_pair