Example #1
0
 def test_paired(self):
     """Verify that FASTQ parsing agrees with paired FASTA+QUAL parsing."""
     # Build one record list from the two-file FASTA/QUAL representation...
     with open("Quality/example.fasta") as fasta_handle:
         with open("Quality/example.qual") as qual_handle:
             from_pair = list(
                 QualityIO.PairedFastaQualIterator(fasta_handle, qual_handle))
     # ...and another from the single FASTQ file, then compare them.
     from_fastq = list(SeqIO.parse("Quality/example.fastq", "fastq"))
     self.assertTrue(compare_records(from_pair, from_fastq))
 def test_paired(self):
     """Check FASTQ parsing matches FASTA+QUAL parsing."""
     # Use context managers so all three handles are closed deterministically
     # (the original opened them inline and leaked them), and assertTrue
     # instead of the deprecated assert_ alias (removed in Python 3.12).
     with open("Quality/example.fasta") as f:
         with open("Quality/example.qual") as q:
             records1 = list(QualityIO.PairedFastaQualIterator(f, q))
     with open("Quality/example.fastq") as h:
         records2 = list(SeqIO.parse(h, "fastq"))
     self.assertTrue(compare_records(records1, records2))
def not_trimmed(cur, conf, options, sequence, qual):
    """Write untrimmed FASTA and QUAL records for one species cluster.

    Looks up the sequence names belonging to ``options.species`` in the
    database, then streams the paired FASTA/QUAL input named in ``conf``
    and writes every matching record to the ``sequence`` and ``qual``
    output handles (both are closed on completion).
    """
    # BUG FIX: DB-API parameters must be a sequence; `(options.species)` is
    # just a parenthesized scalar, so the driver receives a bare string.
    # A one-element tuple requires the trailing comma.
    cur.execute('SELECT name FROM sequence WHERE cluster = %s',
                (options.species,))
    # Set membership gives O(1) lookups per record below.
    dataset = set(row[0] for row in cur.fetchall())
    seqs = QualityIO.PairedFastaQualIterator(
        open(conf.get('Input', 'sequence'), "rU"),
        open(conf.get('Input', 'qual'), "rU"))
    # Iterate the generator directly instead of the old
    # while/.next()/StopIteration dance; works on Python 2 and 3.
    for record in seqs:
        if record.name in dataset:
            sequence.write('%s' % record.format('fasta'))
            qual.write('%s' % record.format('qual'))
    qual.close()
    sequence.close()
Example #4
0
def main():
    '''Main loop'''
    start_time = time.time()
    options, arg = interface()
    motd()
    print 'Started: ', time.strftime("%a %b %d, %Y  %H:%M:%S",
                                     time.localtime(start_time))
    conf = ConfigParser.ConfigParser()
    conf.read(options.conf)
    # build our configuration
    params = Parameters(conf)
    conn = MySQLdb.connect(user=params.user, passwd=params.pwd, db=params.db)
    cur = conn.cursor()
    # crank out a new table for the data
    createSeqTable(cur)
    conn.commit()
    seqcount = sequenceCount(conf.get('Input', 'sequence'))
    sequence = QualityIO.PairedFastaQualIterator(
        open(conf.get('Input', 'sequence'), "rU"),
        open(conf.get('Input', 'qual'), "rU"))
    #pdb.set_trace()
    if conf.getboolean('Multiprocessing', 'MULTIPROCESSING'):
        # get num processors
        n_procs = conf.get('Multiprocessing', 'processors')
        if n_procs == 'Auto':
            # we'll use x-1 cores (where x = avail. cores)
            n_procs = multiprocessing.cpu_count() - 1
        else:
            n_procs = int(n_procs)
        print 'Multiprocessing.  Number of processors = ', n_procs
        # to test with fewer sequences
        #count = 0
        try:
            threads = []
            pb = progress.bar(0, seqcount, 60)
            pb_inc = 0
            while sequence:
                if len(threads) < n_procs:
                    p = multiprocessing.Process(target=linkerWorker,
                                                args=(
                                                    sequence.next(),
                                                    params,
                                                ))
                    p.start()
                    threads.append(p)
                    if (pb_inc + 1) % 1000 == 0:
                        pb.__call__(pb_inc + 1)
                    elif pb_inc + 1 == seqcount:
                        pb.__call__(pb_inc + 1)
                    pb_inc += 1
                else:
                    for t in threads:
                        if not t.is_alive():
                            threads.remove(t)
        except StopIteration:
            pass
    else:
        print 'Not using multiprocessing'
        count = 0
        try:
            pb = progress.bar(0, seqcount, 60)
            pb_inc = 0
            #while count < 1000:
            while sequence:
                #count +=1
                linkerWorker(sequence.next(), params)
                if (pb_inc + 1) % 1000 == 0:
                    pb.__call__(pb_inc + 1)
                elif pb_inc + 1 == seqcount:
                    pb.__call__(pb_inc + 1)
                pb_inc += 1
        except StopIteration:
            pass
    print '\n'
    cur.close()
    conn.close()
    end_time = time.time()
    print 'Ended: ', time.strftime("%a %b %d, %Y  %H:%M:%S",
                                   time.localtime(end_time))
    print '\nTime for execution: ', (end_time - start_time) / 60, 'minutes'
Example #5
0
def action(arguments):
    """
    Given parsed arguments, filter input files.

    Builds a chain of sequence filters from the parsed options, streams
    records from the input file(s) through the chain, writes survivors to
    the output file, and finally emits a per-filter tab-separated report.

    :param arguments: parsed argparse namespace; file-valued attributes
        (sequence_file, output_file, input_qual, report_out, ...) are
        already-open handles.
    :raises ValueError: if --quality-window-mean-qual is given without
        --quality-window, or the Bio.trie/triefind modules are missing.
    """
    # A windowed mean-quality threshold is meaningless without a window size.
    if arguments.quality_window_mean_qual and not arguments.quality_window:
        raise ValueError("--quality-window-mean-qual specified without "
                         "--quality-window")

    # Barcode/primer matching below relies on Biopython's optional C trie
    # modules; fail fast if they were not importable.
    if trie is None or triefind is None:
        raise ValueError(
            'Missing Bio.trie and/or Bio.triefind modules. Cannot continue')

    filters = []
    # Formats are inferred from the open handles (presumably by filename;
    # see fileformat.from_handle).
    input_type = fileformat.from_handle(arguments.sequence_file)
    output_type = fileformat.from_handle(arguments.output_file)
    with arguments.sequence_file as fp:
        # Paired FASTA+QUAL input when a qual file was supplied; otherwise
        # parse the sequence file directly in its detected format.
        if arguments.input_qual:
            sequences = QualityIO.PairedFastaQualIterator(
                fp, arguments.input_qual)
        else:
            sequences = SeqIO.parse(fp, input_type)

        listener = RecordEventListener()
        # Optional per-record details report.
        if arguments.details_out:
            rh = RecordReportHandler(arguments.details_out, arguments.argv,
                                     arguments.details_comment)
            rh.register_with(listener)

        # Track read sequences
        sequences = listener.iterable_hook('read', sequences)

        # Add filters: each enabled option contributes one filter.
        # Mean-quality filtering only applies to FASTQ input (FASTA alone
        # carries no quality scores).
        if arguments.min_mean_quality and input_type == 'fastq':
            qfilter = QualityScoreFilter(arguments.min_mean_quality)
            filters.append(qfilter)
        if arguments.max_length:
            max_length_filter = MaxLengthFilter(arguments.max_length)
            filters.append(max_length_filter)
        if arguments.min_length:
            min_length_filter = MinLengthFilter(arguments.min_length)
            filters.append(min_length_filter)
        # `is not None` so an explicit 0 still enables these two filters.
        if arguments.max_ambiguous is not None:
            max_ambig_filter = MaxAmbiguousFilter(arguments.max_ambiguous)
            filters.append(max_ambig_filter)
        if arguments.pct_ambiguous is not None:
            pct_ambig_filter = PctAmbiguousFilter(arguments.pct_ambiguous)
            filters.append(pct_ambig_filter)
        if arguments.ambiguous_action:
            ambiguous_filter = AmbiguousBaseFilter(arguments.ambiguous_action)
            filters.append(ambiguous_filter)
        if arguments.quality_window:
            min_qual = (arguments.quality_window_mean_qual or
                        arguments.min_mean_quality)
            # insert(0, ...) so window trimming runs before all other
            # filters in the chain.
            window_filter = WindowQualityScoreFilter(arguments.quality_window,
                                                     min_qual)
            filters.insert(0, window_filter)

        if arguments.barcode_file:
            with arguments.barcode_file:
                tr = parse_barcode_file(arguments.barcode_file,
                                        arguments.primer,
                                        arguments.barcode_header)
            f = PrimerBarcodeFilter(tr)
            filters.append(f)

            # Optionally record each (read id, sample) barcode assignment.
            if arguments.map_out:
                barcode_writer = csv.writer(
                    arguments.map_out,
                    quoting=getattr(csv, arguments.quoting),
                    lineterminator='\n')

                def barcode_handler(record, sample, barcode=None):
                    barcode_writer.writerow((record.id, sample))

                listener.register_handler('found_barcode', barcode_handler)
        # Chain the filters: each wraps the previous iterator lazily, so
        # records stream through one at a time.
        for f in filters:
            f.listener = listener
            sequences = f.filter_records(sequences)

        # Track sequences which passed all filters
        sequences = listener.iterable_hook('write', sequences)

        # Writing drives the whole lazy pipeline.
        with arguments.output_file:
            SeqIO.write(sequences, arguments.output_file, output_type)

    rpt_rows = (f.report_dict() for f in filters)

    # Write report
    with arguments.report_out as fp:
        writer = csv.DictWriter(
            fp, BaseFilter.report_fields, lineterminator='\n', delimiter='\t')
        writer.writeheader()
        writer.writerows(rpt_rows)
    # NOTE(review): the lines below look like an orphaned fragment of a
    # separate worker function pasted after action() (a scraping artifact):
    # `record` and `count` are not defined in this scope, `seq` and `tag`
    # are assigned the same value, and the Python 2 print statement does
    # not parse under Python 3. Confirm against the original source before
    # relying on any of it.
    seq, tag = str(record.seq), str(record.seq)
    seq_match, tag_match, score, start, end = pairwise2.align.localms(
        seq, tag, 5.0, -4.0, -9.0, -0.5, one_alignment_only=True)[0]
    #name = multiprocessing.current_process().name
    #print 'Worker', name, str(record.seq)
    print "Parent: ", os.getppid(), "Child: ", os.getpid(), "Count: ", count
    return


if __name__ == '__main__':
    start_time = time.time()
    conf = ConfigParser.ConfigParser()
    conf.read('mc454.conf')
    #jobs = []
    record = QualityIO.PairedFastaQualIterator(
        open(conf.get('Input', 'sequence'), "rU"),
        open(conf.get('Input', 'qual'), "rU"))
    mproc = True
    if mproc == True:
        count = 0
        try:
            while count < 500:
                #pdb.set_trace()
                jobs = []
                for i in range(multiprocessing.cpu_count()):
                    count += 1
                    p = multiprocessing.Process(target=worker,
                                                args=(record.next(), count))
                    jobs.append(p)
                    p.start()
                #p.join()