Ejemplo n.º 1
0
def build_subgraph(reads_in_subgraph, barcodes_unzipped):
    """Build a k-mer graph from the barcode records at the given offsets.

    Each selected record is read from ``barcodes_unzipped`` through
    ``IO_utils.read_fastq_random`` and its k-mers are obtained from
    ``IO_utils.get_cyclic_kmers``, configured by the module-level ``args``
    entries ``kmer_size``, ``barcode_start`` and ``barcode_end``. The k-mers
    are tallied across all selected records. Every distinct k-mer then
    becomes one ``Edge`` whose first argument is the k-mer without its last
    character, whose second argument is the k-mer without its first
    character, and whose third argument is the k-mer's count.

    Args:
        reads_in_subgraph: Value passed to ``read_fastq_random`` as
            ``offsets``; presumably positions of records inside the barcode
            file -- confirm against ``IO_utils``.
        barcodes_unzipped: Path of the barcode file; opened in binary mode.

    Returns:
        The ``Graph`` constructed from the list of edges.
    """
    subgraph_kmer_counts = Counter()

    # The context manager guarantees the handle is released even when
    # reading a record or extracting k-mers raises part-way through.
    with open(barcodes_unzipped, 'rb') as bc_file:
        barcodes_iter = IO_utils.read_fastq_random(
            bc_file, offsets=reads_in_subgraph)
        for barcode_data, _ in barcodes_iter:
            read_kmers = IO_utils.get_cyclic_kmers(
                barcode_data,
                int(args['kmer_size']),
                int(args['barcode_start']),
                int(args['barcode_end']))
            # Only the k-mer itself is counted; the second item of each
            # pair is ignored.
            for kmer, _ in read_kmers:
                subgraph_kmer_counts[kmer] += 1

    edges = [
        Edge(kmer[0:-1], kmer[1:], count)
        for kmer, count in subgraph_kmer_counts.items()
    ]
    return Graph(edges)
Ejemplo n.º 2
0
def write_split_fastqs(params):
    """Split reads and barcodes into one set of output files per cell.

    For every entry of ``consensus_bcs`` (plus the extra ``'unassigned'``
    entry added here) three files are written under
    ``<output_dir>/reads_split``: a gzipped reads FASTQ, a gzipped barcodes
    FASTQ and a plain UMI text file. A tab-separated ``batch.txt`` in the
    same directory lists, per cell, the cell name, the UMI file path and the
    reads file path.

    The first line of every record gets `` cell:<cell>`` appended (the cell
    name with ``_`` replaced by ``:``), after which all spaces in that line
    are replaced by underscores. The UMI is the slice
    ``[args['umi_start']:args['umi_end']]`` of the second element of the
    barcode record.

    Args:
        params: A 6-tuple ``(consensus_bcs, reads_assigned_db,
            reads_assigned_pipe, output_dir, reads_unzipped,
            barcodes_unzipped)``. ``consensus_bcs`` must support ``add`` and
            iteration and IS MUTATED: ``'unassigned'`` is added to it.
            ``reads_assigned_db`` is unpacked but not used.
            ``reads_assigned_pipe`` is handed to ``IO_utils.get_from_db``.
            ``reads_unzipped`` and ``barcodes_unzipped`` are paths opened in
            binary mode.

    Returns:
        A tuple ``(output_files, reads_per_cell)``. ``output_files`` maps
        ``'batch'`` to the batch file path and each ``'cell_<cell>'`` to a
        dict with the keys ``'reads'``, ``'barcodes'`` and ``'umi'``.
        ``reads_per_cell`` maps each cell to the number of records written.
    """
    import gzip
    (consensus_bcs, reads_assigned_db, reads_assigned_pipe, output_dir,
     reads_unzipped, barcodes_unzipped) = params

    split_dir = '%s/reads_split' % output_dir
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    output_files = {'batch': '%s/batch.txt' % (split_dir)}

    reads_per_cell = {}
    # NOTE: this mutates the caller's collection; kept because callers may
    # rely on seeing the 'unassigned' entry afterwards.
    consensus_bcs.add('unassigned')

    with open(output_files['batch'], 'w') as batch_file:
        for cell in consensus_bcs:
            try:
                cell_offsets = IO_utils.get_from_db(
                    reads_assigned_pipe, [cell])
            except IndexError:
                # Previously the error was swallowed, leaving cell_offsets
                # unbound (first cell) or still holding the PREVIOUS cell's
                # offsets, which wrote the wrong reads under this cell's
                # name. Treat a failed lookup as "no reads for this cell".
                # NOTE(review): assumes read_fastq_random yields nothing for
                # an empty offsets list -- confirm against IO_utils.
                cell_offsets = []

            cell_name = 'cell_%s' % cell
            cell_files = {
                'reads': '%s/%s_reads.fastq.gz' % (split_dir, cell_name),
                'barcodes': '%s/%s_barcodes.fastq.gz' % (split_dir, cell_name),
                'umi': '%s/%s.umi.txt' % (split_dir, cell_name)
            }
            output_files[cell_name] = cell_files
            batch_file.write('%s\t%s\t%s\n' % (
                cell_name, cell_files['umi'], cell_files['reads']))

            # Suffix appended to every record header of this cell.
            cell_tag = ' %s' % cell_name.replace('_', ':')
            reads_in_cell = 0

            # All five handles are closed on exit, including on error.
            with gzip.open(cell_files['reads'], 'wb') as reads_writer, \
                    gzip.open(cell_files['barcodes'], 'wb') as barcodes_writer, \
                    open(cell_files['umi'], 'wb') as umi_writer, \
                    open(reads_unzipped, 'rb') as reads_f, \
                    open(barcodes_unzipped, 'rb') as barcodes_f:
                # Offsets are interleaved: even positions are used for the
                # reads file, odd positions for the barcodes file.
                reads_iter = IO_utils.read_fastq_random(
                    reads_f, offsets=list(cell_offsets[0::2]))
                barcodes_iter = IO_utils.read_fastq_random(
                    barcodes_f, offsets=list(cell_offsets[1::2]))

                # zip advances the reads iterator first and stops as soon
                # as either iterator is exhausted, matching the paired
                # next() calls it replaces.
                for (reads_data, _), (barcodes_data, _) in zip(
                        reads_iter, barcodes_iter):
                    reads_in_cell += 1

                    reads_data[0] = (
                        reads_data[0] + cell_tag).replace(' ', '_')
                    barcodes_data[0] = (
                        barcodes_data[0] + cell_tag).replace(' ', '_')

                    umi = barcodes_data[1][
                        int(args['umi_start']):int(args['umi_end'])]
                    reads_writer.write(
                        ('\n'.join(reads_data) + '\n').encode('utf-8'))
                    barcodes_writer.write(
                        ('\n'.join(barcodes_data) + '\n').encode('utf-8'))
                    umi_writer.write((umi + '\n').encode('utf-8'))

            print('\tWrote %i reads to file:\t%s' % \
             (reads_in_cell, cell_name))
            reads_per_cell[cell] = reads_in_cell

    return output_files, reads_per_cell