Example 1
def get_kmer_index(barcodes_unzipped):
    """
	Args:
		barcodes_unzipped (str): filename for unzipped barcodes fq
	
	Returns:
		kmer_idx (dict): map of kmer to list of line offsets for reads 
			that contain that kmer
		kmer_counts (dict): map of kmer to absolute counts
	
	This method returns a kmer index and counts dict for a random
	subset of the dataset. The subset is grown until it is roughly the
	minimal number of reads whose kmer spectrum is representative
	of the full dataset.
	
	General approach:
		initialize:
			get a random chunk of reads based on line offsets
			compute kmer counts
		loop:
			get a new chunk of reads and combine with previous chunks
			compute kmer counts for the new chunk
			compare kmer counts with previous iteration
		terminate when:
			pearsonR >= some cutoff value
	
	"""
    PEARSONR_CUTOFF = 0.999
    MIN_ITERS = 10
    BUFFER_SIZE = 10000

    length = args['barcode_end'] - args['barcode_start']
    pool = Pool(processes=args['threads'])

    read_count = 0
    kmer_idx = {}
    counts_corr_coefs = []
    num_reads = []

    bc_file = open(barcodes_unzipped, 'rb')
    read_chunks_iter = IO_utils.get_read_chunks(bc_file,
                                                random=True,
                                                BUFFER_SIZE=BUFFER_SIZE)
    chunk_num = 0
    while True:
        try:
            reads_chunk = next(read_chunks_iter)
            chunk_num += 1
        except StopIteration:
            break

        read_count += len(reads_chunk)
        num_reads.append(read_count)
        chunk_kmer_indices = pool.map(index_read, reads_chunk)
        #chunk_kmer_indices is a list of dicts
        old_kmer_counts = get_kmer_counts(kmer_idx)
        #kmer counts before updating with chunk_kmer_indices

        for element in chunk_kmer_indices:
            for (key, read_offsets) in element.items():
                #read_offsets: [offset1, offset2, offset3 ...]
                if key not in kmer_idx:
                    kmer_idx[key] = []
                kmer_idx[key].extend(read_offsets)

        del chunk_kmer_indices
        _ = gc.collect()

        new_kmer_counts = get_kmer_counts(kmer_idx)
        #check kmer count correlation
        counts_corr_coef = get_kmer_count_correlation(old_kmer_counts,
                                                      new_kmer_counts)
        counts_corr_coefs.append(counts_corr_coef)
        print('\t%i reads indexed. Running pearsonr is %f' %
              (read_count, counts_corr_coef))

        if (len(counts_corr_coefs) >= MIN_ITERS
                and counts_corr_coef > PEARSONR_CUTOFF):
            break

    bc_file.close()
    pool.close()

    return (kmer_idx, new_kmer_counts,
            Plot_utils.plot_kmer_subsamp_pearson(output_dir, counts_corr_coefs,
                                                 num_reads))
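The convergence test above relies on two helpers that are not shown: get_kmer_counts, which collapses the kmer index into absolute counts, and get_kmer_count_correlation, which compares counts between consecutive iterations. A minimal sketch of what they could look like (hypothetical implementations, not necessarily the project's actual helpers):

from scipy.stats import pearsonr

def get_kmer_counts(kmer_idx):
    # Collapse the index (kmer -> list of read offsets) into absolute counts.
    return {kmer: len(offsets) for kmer, offsets in kmer_idx.items()}

def get_kmer_count_correlation(old_counts, new_counts):
    # Correlate counts only over kmers observed in both iterations;
    # with fewer than two shared kmers the correlation is undefined.
    shared = sorted(set(old_counts) & set(new_counts))
    if len(shared) < 2:
        return 0.0
    corr, _pval = pearsonr([old_counts[k] for k in shared],
                           [new_counts[k] for k in shared])
    return corr

Once at least MIN_ITERS chunks have been indexed and the current coefficient exceeds PEARSONR_CUTOFF, the subsample is treated as representative and indexing stops.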
Example 2
def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params

    BUFFER_SIZE = 100000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 6

    reads_assigned_db, reads_assigned_pipe = IO_utils.initialize_redis_pipeline()
    pool = Pool(processes=args['threads'])

    #print('\tMapping kmers to consensus barcodes')
    if args['split_levenshtein']:
        print(
            '\tAssigning reads to consensus barcodes using Levenshtein distance'
        )
    else:
        print(
            '\tAssigning reads to consensus barcodes using kmer compatibility')
        kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE,
                                    MAX_KMER_SIZE)

    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')

    # Serialize integer offsets as bytes; encode_tup joins a pair as
    # b'offset1,offset2' for storage in the Redis list.
    encode = lambda i: str(i).encode('utf-8')
    encode_tup = lambda i, j: encode(i) + b',' + encode(j)

    for reads_chunk, barcodes_chunk in zip(
            IO_utils.get_read_chunks(reads_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE),
            IO_utils.get_read_chunks(barcodes_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)

        if args['split_levenshtein']:
            assignments = pool.map(
                assign_read_levenshtein,
                zip(repeat(args), repeat(consensus_bcs), reads_chunk,
                    barcodes_chunk))

        else:
            assignments = pool.map(
                assign_read_kmers,
                zip(repeat(kmer_map), repeat(MIN_KMER_SIZE),
                    repeat(MAX_KMER_SIZE), reads_chunk, barcodes_chunk))

        for (assignment, offset1, offset2) in assignments:
            if (assignment == 'unassigned'):
                num_unassigned += 1
            #reads_assigned[assignment].append((offset1, offset2))
            reads_assigned_pipe.rpush(assignment.encode('utf-8'),
                                      encode_tup(offset1, offset2))

        reads_assigned_pipe.execute()
        print('\tProcessed %i reads' % read_count)

    reads_f.close()
    barcodes_f.close()
    pool.close()

    print('\t%i reads could not be assigned' % num_unassigned)
    #return pickle_files
    return reads_assigned_db, reads_assigned_pipe
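This variant stores each assignment in Redis as a list keyed by the consensus barcode, where every entry is the byte string 'offset1,offset2' pointing back into the reads and barcodes fastq files. Assuming reads_assigned_db behaves like a redis-py client (an assumption; initialize_redis_pipeline is not shown here), a downstream consumer could recover the offset pairs for one barcode like this:

def get_assigned_offsets(reads_assigned_db, consensus_bc):
    # Hypothetical reader: each list entry was pushed as b'offset1,offset2',
    # where offset1 indexes the reads fastq and offset2 the barcodes fastq.
    entries = reads_assigned_db.lrange(consensus_bc.encode('utf-8'), 0, -1)
    return [tuple(int(field) for field in entry.split(b','))
            for entry in entries]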
Example 3
def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params

    BUFFER_SIZE = 10000
    PICKLE_SIZE = 1000000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 7

    pool = Pool(processes=args['threads'])

    print('\tMapping kmers to consensus barcodes')
    kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE, MAX_KMER_SIZE)
    reads_assigned = initialize_reads_assigned(consensus_bcs)

    print('\tAssigning reads to consensus barcodes')
    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')
    pickle_files = []

    for reads_chunk, barcodes_chunk in zip(
            IO_utils.get_read_chunks(reads_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE),
            IO_utils.get_read_chunks(barcodes_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)

        if not args['split_levenshtein']:
            assignments = pool.map(
                assign_read_kmers,
                zip(repeat(kmer_map), repeat(MIN_KMER_SIZE),
                    repeat(MAX_KMER_SIZE), reads_chunk, barcodes_chunk))
        else:
            #this is a pipeline for reviewer experiments only
            #works quite poorly, see simulation results
            assignments = pool.map(
                assign_read_levenshtein,
                zip(repeat(consensus_bcs), reads_chunk, barcodes_chunk))

        for (assignment, offset1, offset2) in assignments:
            if (assignment == 'unassigned'):
                num_unassigned += 1
            reads_assigned[assignment].append((offset1, offset2))
        print('\tProcessed %i reads' % read_count)

        #pickle-dump read assignments every PICKLE_SIZE (1M) reads
        if read_count % PICKLE_SIZE == 0:
            pickle_files.append(IO_utils.write_to_pickle(reads_assigned))
            reads_assigned = initialize_reads_assigned(consensus_bcs)

    pickle_files.append(IO_utils.write_to_pickle(reads_assigned))

    reads_f.close()
    barcodes_f.close()
    pool.close()

    print('\t%i reads could not be assigned' % num_unassigned)
    return pickle_files
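Here the assignments are buffered in memory and flushed to pickle files every PICKLE_SIZE reads, so downstream code has to merge the partial dicts. Assuming IO_utils.write_to_pickle returns a filename and pickles the barcode-to-offset-pairs dict directly (an assumption; that helper is not shown), the merge could look like:

import pickle

def merge_assignment_pickles(pickle_files):
    # Combine per-chunk dicts (consensus barcode -> list of (offset1, offset2))
    # into a single mapping covering the whole dataset.
    merged = {}
    for fname in pickle_files:
        with open(fname, 'rb') as f:
            chunk = pickle.load(f)
        for bc, offset_pairs in chunk.items():
            merged.setdefault(bc, []).extend(offset_pairs)
    return merged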