def generate_sample_reads():
    """Build paired reads by sliding a window over a fixed sample genome.

    The genome string "TAATGCCATGGGATGTT" is converted with
    ``common.convert_genome_str_to_int`` (presumably one integer per base --
    confirm against that helper). For every start position where both reads
    still fit, a pair is emitted: a first read of 3 elements, a gap of 1
    element, then a second read of 3 elements.

    Returns:
        numpy.ndarray: uint8 array of shape ``(num_reads, 2, 3)`` where
        ``[i, 0]`` is the first read and ``[i, 1]`` the second read of the
        pair starting at genome position ``i``.
    """
    sample_genome = np.array(common.convert_genome_str_to_int("TAATGCCATGGGATGTT"))
    sample_read_length = 3
    sample_read_pair_gap = 1

    # Distance from the start of the first read to the start of the second.
    second_read_offset = sample_read_length + sample_read_pair_gap
    # Count of start positions for which the whole pair lies inside the genome.
    num_reads = len(sample_genome) - sample_read_length * 2 - sample_read_pair_gap + 1

    sample_reads = np.zeros((num_reads, 2, sample_read_length), dtype=np.uint8)
    # `range` instead of `xrange` so this runs on both Python 2 and Python 3.
    for start in range(num_reads):
        second_start = start + second_read_offset
        sample_reads[start, 0] = sample_genome[start:start + sample_read_length]
        sample_reads[start, 1] = sample_genome[second_start:second_start + sample_read_length]
    return sample_reads
def get_reads(dataset_name, read_length=50):
    """Load the read pairs of a dataset into a zero-initialised uint8 array.

    Reads ``dataset/<dataset_name>/reads.txt``. The first line of the file is
    skipped; each following line is split on ``,`` and every field is
    converted with ``common.convert_genome_str_to_int``. The first two fields
    of a line form one read pair.

    Args:
        dataset_name: Name of the directory under ``dataset/`` that contains
            ``reads.txt``.
        read_length: Read length used only when the file holds no read lines.
            Otherwise the length is taken from the first field of the first
            read line and this argument is ignored.

    Returns:
        numpy.ndarray: uint8 array of shape ``(num_pairs, 2, read_length)``.
        When the second read of a pair is shorter than ``read_length`` its
        trailing positions stay zero. With no read lines the result has
        shape ``(0, 2, read_length)``.
    """
    with open("dataset/%s/reads.txt" % dataset_name, "r") as f:
        # The first line is not a read pair, so it is dropped.
        data_lines = f.readlines()[1:]

    # Header-only or empty file: return an empty array instead of failing
    # with IndexError when looking up the first read line.
    if not data_lines:
        return np.zeros((0, 2, read_length), dtype=np.uint8)

    # The first read of the first pair defines the length of every read.
    read_length = len(data_lines[0].rstrip().split(',')[0])
    all_reads = np.zeros((len(data_lines), 2, read_length), dtype=np.uint8)

    for pair_idx, line in enumerate(data_lines):
        # A list (not `map`) so the pair can be indexed on Python 3 as well.
        int_read_pairs = [
            common.convert_genome_str_to_int(raw_read)
            for raw_read in line.rstrip().split(',')
        ]
        second_read = int_read_pairs[1]
        if len(second_read) < read_length:
            # Short second read: copy what is there, leave the tail as zeros.
            all_reads[pair_idx, 0] = int_read_pairs[0]
            all_reads[pair_idx, 1, :len(second_read)] = second_read
        else:
            all_reads[pair_idx] = int_read_pairs
    return all_reads