Code example #1
File: Split_reads.py Project: modash/sircel
def build_subgraph(reads_in_subgraph, barcodes_unzipped):
	bc_file = open(barcodes_unzipped, 'rb')
	barcodes_iter = IO_utils.read_fastq_random(
		bc_file, offsets = reads_in_subgraph)
	subgraph_kmer_counts = Counter()
	while True:
		try:
			barcode_data, _ = next(barcodes_iter)
		except StopIteration:
			break	
		read_kmers = IO_utils.get_cyclic_kmers(
			barcode_data, 
			int(args['kmer_size']),
			int(args['barcode_start']), 
			int(args['barcode_end']))		
		for (kmer, _ ) in read_kmers:
			subgraph_kmer_counts[kmer] += 1
	bc_file.close()
	
	edges = []
	for (kmer, count) in subgraph_kmer_counts.items():
		edge = Edge(kmer[0:-1], kmer[1:], count)
		edges.append(edge)
	subgraph = Graph(edges)
	return subgraph
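
The Edge and Graph classes are defined elsewhere in the sircel project and are not shown in this listing. Below is a minimal hypothetical sketch of the interface build_subgraph appears to rely on, where nodes are (k-1)-mers and edge weights are kmer counts; the real classes may differ.

# Hypothetical stand-ins for the Edge and Graph classes used by
# build_subgraph above; sircel's real classes live elsewhere in the
# project and may carry more functionality.
class Edge:
    def __init__(self, source, dest, weight):
        self.source = source  # (k-1)-mer prefix of the kmer
        self.dest = dest      # (k-1)-mer suffix of the kmer
        self.weight = weight  # number of reads containing the kmer

class Graph:
    def __init__(self, edges):
        # adjacency map: node -> list of outgoing edges
        self.adjacency = {}
        for edge in edges:
            self.adjacency.setdefault(edge.source, []).append(edge)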
Code example #2
def run_all(cmdline_args):
    print('Splitting reads by barcodes')

    global args
    global output_files
    global output_dir

    args = cmdline_args
    output_dir = args['output_dir']
    output_files['log'] = '%s/run_log.txt' % output_dir

    Logger.start(output_files['log'])
    start_time = time.time()

    reads_unzipped = args['reads']
    barcodes_unzipped = args['barcodes']
    print('Building kmer index')
    kmer_index, kmer_counts, subsamp_pearson = get_kmer_index(
        barcodes_unzipped)
    output_files['subsamp_pearson_plot'] = subsamp_pearson
    print('\t%i unique kmers indexed' % len(kmer_counts))

    print('Finding cyclic paths in the barcode de Bruijn graph')
    cyclic_paths = find_paths((kmer_index, kmer_counts, barcodes_unzipped,
                               reads_unzipped, output_dir))
    print('\t%i cyclic paths found' % len(cyclic_paths))
    output_files['all_paths'] = IO_utils.save_paths_text(output_dir,
                                                         cyclic_paths,
                                                         prefix='all')

    print('Thresholding paths')
    (top_paths, fit_out) = threshold_paths(output_dir, cyclic_paths,
                                           args['num_cells'])
    output_files.update(fit_out)
    consensus_bcs = {tup[0] for tup in top_paths}

    print('Assigning reads by kmer compatibility')
    reads_assigned_pickled = assign_all_reads(
        (consensus_bcs, reads_unzipped, barcodes_unzipped))

    print('Splitting reads by cell')
    output_files['split'], reads_per_cell = write_split_fastqs(
        (consensus_bcs, reads_assigned_pickled, output_dir, reads_unzipped,
         barcodes_unzipped))

    #delete temp pickle files
    for pickled in reads_assigned_pickled:
        os.unlink(pickled)

    #update paths list
    top_paths = update_paths_list(top_paths, reads_per_cell)
    output_files['thresholded_paths'] = IO_utils.save_paths_text(
        output_dir, top_paths, prefix='threshold')

    current_time = time.time()
    elapsed_time = current_time - start_time

    Logger.stop()
    return (output_files, elapsed_time)
Code example #3
def find_paths(params, starting_kmers=None):
    (kmer_index, kmer_counts, barcodes_unzipped, reads_unzipped,
     output_dir) = params
    barcode_length = args['barcode_end'] - args['barcode_start']
    kmers_sorted = [
        tup[0] for tup in sorted(
            list(kmer_counts.items()), key=lambda tup: tup[1], reverse=True)
    ]

    if starting_kmers is None:
        starting_kmers = []
        for kmer in kmers_sorted:
            if (kmer[0] == '$'):
                starting_kmers.append((kmer, kmer_index[kmer]))
            if (len(starting_kmers) >= args['breadth']):
                break
    else:
        starting_kmers_tmp = []
        for kmer in starting_kmers:
            #list.append takes one argument; append a (kmer, offsets) tuple
            starting_kmers_tmp.append((kmer, kmer_index[kmer]))
        starting_kmers = starting_kmers_tmp

    pool = Pool(processes=args['threads'])
    paths = []
    for group in IO_utils.grouper(starting_kmers, args['threads']):

        kmers_group = [tup[0] for tup in group]
        offsets_group = [tup[1] for tup in group]
        paths_group = pool.map(
            find_path_from_kmer,
            zip(kmers_group, offsets_group, repeat(barcodes_unzipped),
                repeat(barcode_length)))
        paths += [item for sublist in paths_group for item in sublist]
    pool.close()
    return paths
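
IO_utils.grouper is not shown in this listing. If it follows the classic itertools grouper recipe, it batches an iterable into groups of at most args['threads'] items; the sketch below is an assumed filtering variant (it drops the None padding so the tuple unpacking in the loop above stays safe), not sircel's actual helper.

# Assumed sketch of IO_utils.grouper: a filtering variant of the classic
# itertools "grouper" recipe. The padding is dropped so callers can
# unpack (kmer, offsets) tuples without hitting None.
from itertools import zip_longest

def grouper(iterable, n):
    iterators = [iter(iterable)] * n
    for group in zip_longest(*iterators, fillvalue=None):
        yield [item for item in group if item is not None]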
Code example #4
File: Split_reads.py Project: modash/sircel
def assign_read_kmers(params):
	"""
	Assigns a single read to a cell barcode by kmer compatibility
	args (tuple)
		kmers_to_paths: dict of kmer -> list of paths that contain it
		min_kmer_size
		max_kmer_size
		read: list of fastq entry lines
	"""
	(kmer_map,
		min_kmer_size,
		max_kmer_size,
		(reads_data, reads_offset),
		(barcodes_data, barcodes_offset)) = params
		
	for kmer_size in range(max_kmer_size, min_kmer_size, -1):
		read_kmers = IO_utils.get_cyclic_kmers(
			barcodes_data, 
			kmer_size,
			args['barcode_start'], 
			args['barcode_end'],
			indel = True)
		bcs, is_assigned, is_unique = get_most_common_bc(
			kmer_map, read_kmers)
		if is_assigned and is_unique:
			return (bcs[0], reads_offset, barcodes_offset)
		#otherwise decrement kmer size and try again
	return ('unassigned', reads_offset, barcodes_offset)
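
get_most_common_bc is not included in this listing. Inferred from the call site above, it votes for every consensus barcode that shares a kmer with the read and reports whether a single barcode wins outright; this is a hypothetical sketch, not sircel's implementation.

# Hypothetical sketch of get_most_common_bc, inferred from its call site:
# kmer_map maps kmer -> list of consensus barcodes containing that kmer.
from collections import Counter

def get_most_common_bc(kmer_map, read_kmers):
    votes = Counter()
    for (kmer, _) in read_kmers:
        for bc in kmer_map.get(kmer, []):
            votes[bc] += 1
    if not votes:
        return ([], False, False)  # no kmer matched any barcode
    ranked = votes.most_common()
    top_count = ranked[0][1]
    bcs = [bc for (bc, count) in ranked if count == top_count]
    is_assigned = True
    is_unique = (len(bcs) == 1)  # ties stay ambiguous at this kmer size
    return (bcs, is_assigned, is_unique)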
Code example #5
def get_num_unassigned(simulation_output_dir):
    fq_fname = '%s/reads_split/cell_unassigned_barcodes.fastq.gz' % \
     (simulation_output_dir)

    num_unassigned = 0
    fq_file = gzip.open(fq_fname, 'rb')
    fq_iter = IO_utils.read_fastq_sequential(fq_file)
    for (lines, _) in fq_iter:
        num_unassigned += 1
    fq_file.close()
    return num_unassigned
Code example #6
def map_kmers_to_bcs_fixed_k(consensus_bcs, kmer_size):
    kmers_to_paths = {}
    for cell_barcode in consensus_bcs:
        kmers = IO_utils.get_cyclic_kmers(
            ['na', cell_barcode, 'na', cell_barcode],
            kmer_size,
            0,
            len(cell_barcode),
            indel=True)
        for (kmer, _) in kmers:
            kmers_to_paths.setdefault(kmer, []).append(cell_barcode)
    return kmers_to_paths
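
For instance, with two consensus barcodes and kmer_size=3 the resulting map pairs each cyclic kmer with every barcode it occurs in (illustrative only; the exact keys depend on how get_cyclic_kmers handles wrap-around and indels):

# Illustrative usage (output shape assumed, not taken from a real run):
consensus_bcs = {'AACGTT', 'AAGGTT'}
kmers_to_paths = map_kmers_to_bcs_fixed_k(consensus_bcs, 3)
# shared kmers map to both barcodes, e.g. roughly:
#   kmers_to_paths['GTT'] -> ['AACGTT', 'AAGGTT']
#   kmers_to_paths['ACG'] -> ['AACGTT']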
Code example #7
def index_read(params):
    """
	Args
		params (tuple):
			barcodes_data (str): sequence of read_1 (barcode)
			barcodes_offset (int): line offset for this read
	Returns
		kmer_index (dict): 
	"""
    (barcodes_data, barcodes_offset) = params

    kmer_index = {}
    read_kmers = IO_utils.get_cyclic_kmers(barcodes_data, args['kmer_size'],
                                           args['barcode_start'],
                                           args['barcode_end'])
    for (kmer, _) in read_kmers:
        kmer_index.setdefault(kmer, []).append(barcodes_offset)
    return kmer_index
Code example #8
def get_fraction_consistent(pred_bc, simulation_output_dir):
    fq_fname = '%s/reads_split/cell_%s_barcodes.fastq.gz' % \
     (simulation_output_dir, pred_bc)
    if not os.path.exists(fq_fname):
        return 0

    fq_file = gzip.open(fq_fname, 'rb')
    fq_iter = IO_utils.read_fastq_sequential(fq_file)

    assignments = Counter()
    total_reads = 0
    for (lines, _) in fq_iter:
        read_name = lines[0]
        assigned_bc = read_name.split(':')[-2].split('_')[0]
        assignments.update([assigned_bc])
        total_reads += 1
    fq_file.close()
    try:
        common_bc, count = assignments.most_common()[0]
        return count / total_reads
    except IndexError:
        return 0
Code example #9
def get_fraction_correct_reads(pred_bc, simulation_output_dir):
    fq_fname = '%s/reads_split/cell_%s_barcodes.fastq.gz' % \
     (simulation_output_dir, pred_bc)
    if not os.path.exists(fq_fname):
        return (0, 0)
    fq_file = gzip.open(fq_fname, 'rb')
    fq_iter = IO_utils.read_fastq_sequential(fq_file)

    tpr = 0.
    fpr = 0.
    for (lines, _) in fq_iter:
        read_name = lines[0]
        assigned_bc = read_name.split(':')[-1]
        true_bc = read_name.split(':')[-2].split('_')[0]

        if (assigned_bc == true_bc):
            tpr += 1.
        else:
            fpr += 1.
    fq_file.close()
    total_reads = tpr + fpr
    if total_reads == 0:
        #an existing but empty fastq would otherwise divide by zero
        return (0, 0)
    tpr /= total_reads
    fpr /= total_reads

    return (tpr, fpr)
Code example #10
File: Sircel_master.py Project: wxb263stu/sircel
def run_all(args):
    print('\nInspecting and pre-processing inputs')

    if (args['output_dir'][-1] == '/'):
        args['output_dir'] = args['output_dir'][0:-1]
    if not os.path.exists(args['output_dir']):
        os.makedirs(args['output_dir'])
    if not os.path.exists(args['output_dir'] + '/plots'):
        os.makedirs(args['output_dir'] + '/plots')
    with (Path(__file__).parent / 'params.json').open() as r:
        kallisto = json.load(r)['kallisto']
    assert kallisto

    split_args = {}
    check_pipeline_input(args, kallisto)

    if args['dropseq']:
        args['barcode_start'] = 0
        args['barcode_end'] = 12
        args['umi_start'] = 12
        args['umi_end'] = 20
        if args['kmer_size'] is None:
            args['kmer_size'] = 8
        print('Unzipping files (temporary)')
        reads_unzipped = \
         IO_utils.unzip(args['reads'].split(','))
        barcodes_unzipped = \
         IO_utils.unzip(args['barcodes'].split(','))
        args['reads'] = reads_unzipped
        args['barcodes'] = barcodes_unzipped
    else:
        if args['kmer_size'] is None:
            args['kmer_size'] = 8
        print('Unzipping (temporary)')
        reads_unzipped = \
         IO_utils.unzip(args['reads'].split(','))
        barcodes_unzipped = \
         IO_utils.unzip(args['barcodes'].split(','))
        args['reads'] = reads_unzipped
        args['barcodes'] = barcodes_unzipped
    """
	elif args['10xgenomics']:
		args['barcode_start']	= 0
		args['barcode_end'] 		= 26
		args['umi_start'] 		= 26
		args['umi_end'] 			= 34
		if args['kmer_size'] == None:
			args['kmer_size'] = 20	
		print('Unzipping and merging files (temporary)')
		reads_unzipped = \
			IO_utils.unzip(args['reads'].split(','))
		barcodes_unzipped = IO_utils.merge_barcodefiles_10x(
			args['barcodes'].split(','),
			args['umis'].split(','))
				
		args['reads'] = reads_unzipped
		args['barcodes'] = barcodes_unzipped	
	"""

    check_split_input(args)
    output_files, elapsed_time = Split_reads.run_all(args)
    output_files['args'] = args
    print('Done identifying barcodes and splitting reads.\n' + \
     '\tTime elapsed: %0.2f seconds\n' % elapsed_time)

    #print(args['kallisto_idx'])
    if args['kallisto_idx'] is not None:
        print('Running kallisto')
        kallisto_dir = '%s/kallisto_outputs' % args['output_dir']
        if not os.path.exists(kallisto_dir):
            os.makedirs(kallisto_dir)
        output_files['kallisto'] = run_kallisto(args, kallisto, kallisto_dir,
                                                output_files)
        print('Getting transcript compatibility counts')
        output_files['tcc'] = write_transcript_compatability_counts(
            args, output_files, kallisto_dir)

    print('Removing temp files')
    os.unlink(reads_unzipped)
    os.unlink(barcodes_unzipped)
    output_files['run_outputs'] = \
     '%s/run_outputs.json' % args['output_dir']
    with open(output_files['run_outputs'], 'w') as writer:
        writer.write(json.dumps(output_files, indent=3))

    print('Done.')
    return output_files
Code example #11
def write_split_fastqs(params):
    import gzip
    (consensus_bcs, reads_assigned_db, reads_assigned_pipe, output_dir,
     reads_unzipped, barcodes_unzipped) = params

    split_dir = '%s/reads_split' % output_dir
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    output_files = {'batch': '%s/batch.txt' % (split_dir)}
    batch_file = open(output_files['batch'], 'w')

    reads_per_cell = {}
    consensus_bcs.add('unassigned')

    for cell in consensus_bcs:

        try:
            cell_offsets = IO_utils.get_from_db(reads_assigned_pipe, [cell])
        except IndexError:
            cell_offsets = []  #no reads were assigned to this barcode

        #cell_offsets = IO_utils.read_from_pickle(reads_assigned_pickled, cell)
        cell_name = 'cell_%s' % cell

        #initialize all readers and writers
        output_files[cell_name] = {
            'reads': '%s/%s_reads.fastq.gz' % (split_dir, cell_name),
            'barcodes': '%s/%s_barcodes.fastq.gz' % (split_dir, cell_name),
            'umi': '%s/%s.umi.txt' % (split_dir, cell_name)
        }
        batch_file.write('%s\t%s\t%s\n' % \
         (cell_name,
         output_files[cell_name]['umi'],
         output_files[cell_name]['reads']))
        reads_writer = gzip.open(output_files[cell_name]['reads'], 'wb')
        barcodes_writer = gzip.open(output_files[cell_name]['barcodes'], 'wb')
        umi_writer = open(output_files[cell_name]['umi'], 'wb')
        reads_f = open(reads_unzipped, 'rb')
        barcodes_f = open(barcodes_unzipped, 'rb')

        #cell_offsets alternates read offset, barcode offset, read offset, ...
        reads_iter = IO_utils.read_fastq_random(
            reads_f, offsets=cell_offsets[0::2])
        barcodes_iter = IO_utils.read_fastq_random(
            barcodes_f, offsets=cell_offsets[1::2])
        reads_in_cell = 0
        while (True):
            try:
                reads_data, _ = next(reads_iter)
                barcodes_data, _ = next(barcodes_iter)
                reads_in_cell += 1
            except StopIteration:
                break

            reads_data[0] += ' %s' % cell_name.replace('_', ':')
            reads_data[0] = reads_data[0].replace(' ', '_')
            barcodes_data[0] += ' %s' % cell_name.replace('_', ':')
            barcodes_data[0] = barcodes_data[0].replace(' ', '_')

            umi = barcodes_data[1][int(args['umi_start']):int(args['umi_end'])]
            reads_writer.write(('\n'.join(reads_data) + '\n').encode('utf-8'))
            barcodes_writer.write(
                ('\n'.join(barcodes_data) + '\n').encode('utf-8'))
            umi_writer.write((umi + '\n').encode('utf-8'))

        reads_writer.close()
        umi_writer.close()
        barcodes_writer.close()
        reads_f.close()
        barcodes_f.close()

        print('\tWrote %i reads to file:\t%s' % \
         (reads_in_cell, cell_name))
        reads_per_cell[cell] = reads_in_cell
    batch_file.close()
    return output_files, reads_per_cell
Code example #12
def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params

    BUFFER_SIZE = 100000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 6

    reads_assigned_db, reads_assigned_pipe = \
        IO_utils.initialize_redis_pipeline()
    pool = Pool(processes=args['threads'])

    #print('\tMapping kmers to consensus barcodes')
    if args['split_levenshtein']:
        print(
            '\tAssigning reads to consensus barcodes using Levenshtein distance'
        )
    else:
        print(
            '\tAssigning reads to consensus barcodes using kmer compatibility')
        kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE,
                                    MAX_KMER_SIZE)

    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')

    encode = lambda i: str(i).encode('utf-8')
    encode_tup = lambda i, j: encode(i) + b',' + encode(j)

    for reads_chunk, barcodes_chunk in zip(
            IO_utils.get_read_chunks(reads_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE),
            IO_utils.get_read_chunks(barcodes_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)

        if args['split_levenshtein']:
            assignments = pool.map(
                assign_read_levenshtein,
                zip(repeat(args), repeat(consensus_bcs), reads_chunk,
                    barcodes_chunk))

        else:
            assignments = pool.map(
                assign_read_kmers,
                zip(repeat(kmer_map), repeat(MIN_KMER_SIZE),
                    repeat(MAX_KMER_SIZE), reads_chunk, barcodes_chunk))

        for (assignment, offset1, offset2) in assignments:
            if (assignment == 'unassigned'):
                num_unassigned += 1
            #reads_assigned[assignment].append((offset1, offset2))
            reads_assigned_pipe.rpush(assignment.encode('utf-8'),
                                      encode_tup(offset1, offset2))

        reads_assigned_pipe.execute()
        print('\tProcessed %i reads' % read_count)

    reads_f.close()
    barcodes_f.close()
    pool.close()

    print('\t%i reads could not be assigned' % num_unassigned)
    #return pickle_files
    return reads_assigned_db, reads_assigned_pipe
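
IO_utils.get_from_db is not shown in this listing. Inferred from the encoding above (each Redis list entry is b'read_offset,barcode_offset') and from write_split_fastqs, which consumes a flat list alternating read and barcode offsets, a hypothetical sketch might look like this; the real helper may differ:

# Hypothetical sketch of IO_utils.get_from_db, assuming redis-py pipeline
# semantics: queue one LRANGE per cell, then flatten the decoded offsets.
def get_from_db(pipe, cells):
    for cell in cells:
        pipe.lrange(cell.encode('utf-8'), 0, -1)
    offsets = []
    for entries in pipe.execute():
        for entry in entries:
            read_offset, barcode_offset = entry.split(b',')
            # flat list alternating read offset, barcode offset
            offsets.append(int(read_offset))
            offsets.append(int(barcode_offset))
    return offsets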
Code example #13
def get_kmer_index(barcodes_unzipped):
    """
	Args:
		barcodes_unzipped (str): filename for unzipped barcodes fq
	
	Returns
		kmer_idx (dict): map of kmer to list of line offsets for reads 
			that contain that kmer
		kmer_counts (dict): map of kmer to absolute counts
	
	This method returns a kmer index and counts dict for a random
	subset of the dataset. The size of the subset attempts to be the
	minimal number of reads whose kmer spectrum is representative
	of the data
	
	General approach:
		initialize:
			get a random chunk of reads based on line offsets
			compute kmer counts
		loop:
			get a new chunk of reads and combine with previous chunks
			compute kmer counts for the new chunk
			compare kmer counts with previous iteration
		terminate when:
			pearsonR >= some cutoff value
	
	"""
    PEARSONR_CUTOFF = 0.999
    MIN_ITERS = 10
    BUFFER_SIZE = 10000

    length = args['barcode_end'] - args['barcode_start']
    pool = Pool(processes=args['threads'])

    read_count = 0
    kmer_idx = {}
    new_kmer_counts = {}  #guard: referenced after the loop even for empty input
    counts_corr_coefs = []
    num_reads = []

    bc_file = open(barcodes_unzipped, 'rb')
    read_chunks_iter = IO_utils.get_read_chunks(bc_file,
                                                random=True,
                                                BUFFER_SIZE=BUFFER_SIZE)
    chunk_num = 0
    while True:
        try:
            reads_chunk = next(read_chunks_iter)
            chunk_num += 1
        except StopIteration:
            break

        read_count += len(reads_chunk)
        num_reads.append(read_count)
        chunk_kmer_indices = pool.map(index_read, reads_chunk)
        #chunk_kmer_indices is a list of dicts
        old_kmer_counts = get_kmer_counts(kmer_idx)
        #kmer counts before updating with chunk_kmer_indexes

        for element in chunk_kmer_indices:
            for (key, read_offsets) in element.items():
                #read_offsets: [offset1, offset2, offset3 ...]
                kmer_idx.setdefault(key, []).extend(read_offsets)

        del chunk_kmer_indices
        _ = gc.collect()

        new_kmer_counts = get_kmer_counts(kmer_idx)
        #check kmer count correlation
        counts_corr_coef = get_kmer_count_correlation(old_kmer_counts,
                                                      new_kmer_counts)
        counts_corr_coefs.append(counts_corr_coef)
        print('\t%i reads indexed. Running pearsonr is %f' % \
         (read_count, counts_corr_coef))

        if (len(counts_corr_coefs) >= MIN_ITERS) and \
         (counts_corr_coef > PEARSONR_CUTOFF):
            break

    bc_file.close()
    pool.close()

    return (kmer_idx, new_kmer_counts,
            Plot_utils.plot_kmer_subsamp_pearson(output_dir, counts_corr_coefs,
                                                 num_reads))
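
get_kmer_counts and get_kmer_count_correlation are not part of this listing. Plausible sketches, assuming the index maps kmer to a list of read offsets and the convergence check is Pearson's r over kmers present in both count dicts; sircel's actual helpers may differ in detail:

# Assumed helper sketches for the convergence check in get_kmer_index.
from scipy.stats import pearsonr

def get_kmer_counts(kmer_idx):
    # collapse kmer -> [offset, offset, ...] into kmer -> count
    return {kmer: len(offsets) for (kmer, offsets) in kmer_idx.items()}

def get_kmer_count_correlation(old_counts, new_counts):
    shared = [k for k in old_counts if k in new_counts]
    if len(shared) < 2:
        return 0.0  # pearsonr needs at least two points
    x = [old_counts[k] for k in shared]
    y = [new_counts[k] for k in shared]
    corr, _ = pearsonr(x, y)
    return corr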
Code example #14
def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params

    BUFFER_SIZE = 10000
    PICKLE_SIZE = 1000000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 7

    pool = Pool(processes=args['threads'])

    print('\tMapping kmers to consensus barcodes')
    kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE, MAX_KMER_SIZE)
    reads_assigned = initialize_reads_assigned(consensus_bcs)

    print('\tAssigning reads to consensus barcodes')
    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')
    pickle_files = []

    for reads_chunk, barcodes_chunk in zip(
            IO_utils.get_read_chunks(reads_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE),
            IO_utils.get_read_chunks(barcodes_f,
                                     random=False,
                                     BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)

        if not args['split_levenshtein']:
            assignments = pool.map(
                assign_read_kmers,
                zip(repeat(kmer_map), repeat(MIN_KMER_SIZE),
                    repeat(MAX_KMER_SIZE), reads_chunk, barcodes_chunk))
        else:
            #this is a pipeline for reviewer experiments only
            #works quite poorly, see simulation results
            assignments = pool.map(
                assign_read_levenshtein,
                zip(repeat(consensus_bcs), reads_chunk, barcodes_chunk))

        for (assignment, offset1, offset2) in assignments:
            if (assignment == 'unassigned'):
                num_unassigned += 1
            reads_assigned[assignment].append((offset1, offset2))
        print('\tProcessed %i reads' % read_count)

        #pickle dump read assignments every 1M reads (PICKLE_SIZE)
        if read_count % PICKLE_SIZE == 0:
            pickle_files.append(IO_utils.write_to_pickle(reads_assigned))
            reads_assigned = initialize_reads_assigned(consensus_bcs)

    pickle_files.append(IO_utils.write_to_pickle(reads_assigned))

    reads_f.close()
    barcodes_f.close()
    pool.close()

    print('\t%i reads could not be assigned' % num_unassigned)
    return pickle_files