def measure_length_in_fastq(fastq, split_length, coord1, coord2):
    """Measure the distance between 2 rDNA coordinates for a set of reads.

    Every 4-line record of the .fastq file is split with make_temp_fastq,
    mapped to the rDNA reference with ``bwa mem`` and, when
    is_rDNAs_healthy accepts the alignment lines, measured with
    measure_length_main. coord1 should be smaller than coord2.

    Args:
        fastq (str): fastq filename
        split_length (int): split length when the read is mapped to rDNA
        coord1 (int): coordinate in rDNA
        coord2 (int): coordinate in rDNA
    Returns:
        result: list of (first whitespace-delimited token of the header,
        list of measured distances between the 2 coords)
    """
    result = []
    with open(fastq) as f:
        # Passing the same iterator four times groups the file into
        # 4-line FASTQ records.
        for data in itertools.zip_longest(*[iter(f)] * 4):
            # zip_longest pads a trailing blank line / truncated record
            # with None; there is nothing to analyse in such a record.
            if None in data:
                break
            header = data[0].strip()
            read = data[1].strip()
            quality = data[3].strip()
            make_temp_fastq(split_length, header, read, quality)
            # NOTE(review): unlike the other mapping functions in this file,
            # this command reads temp_fastq.fastq and writes temp_sam.sam in
            # the working directory rather than under temp_files/ -- confirm
            # this matches where make_temp_fastq writes its output.
            subprocess.run(
                'bwa mem -M -x ont2d -t 5 /home/yutaro/nanopore/'
                'clive/rDNA_index/humRibosomal.fa temp_fastq.fastq'
                ' > temp_sam.sam',
                shell=True, stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT)
            with open('temp_sam.sam') as samf:
                # Drop the first two lines (presumably the SAM header
                # lines -- confirm against the bwa output).
                samdata = samf.readlines()[2:]
            if is_rDNAs_healthy(samdata):
                dist = measure_length_main(header, read, coord1, coord2)
                result.append((header.split()[0], dist))
    return result
def analyze_fastq_by_header(fastq, header_id, split_length):
    """Find a read from a fastq file and analyze its structure.

    The first line containing header_id is taken as the header of the
    record; the following three lines are read as sequence, separator and
    quality. The record is split with make_temp_fastq, mapped to the rDNA
    reference with ``bwa mem`` and the resulting SAM file is passed to
    analyze_split_reads.

    Args:
        fastq (str): filename
        header_id (str): header id
        split_length (int): split length
    Returns:
        TRs: value returned by analyze_split_reads for the mapped read
    Raises:
        ValueError: if no line of the file contains header_id. Without
            this check the function would analyse whatever SAM file a
            previous run left behind.
    """
    with open(fastq) as f:
        for line in f:
            if header_id not in line:
                continue
            # Strip the record fields, as the other callers of
            # make_temp_fastq in this file do.
            header = line.strip()
            read = f.readline().strip()
            f.readline()  # skip the '+' separator line
            quality = f.readline().strip()
            make_temp_fastq(split_length, header, read, quality)
            subprocess.run(
                'bwa mem -M -x ont2d -t 5 /home/yutaro/nanopore/'
                'clive/rDNA_index/humRibosomal.fa '
                'temp_files/temp_fastq.fastq > temp_files/'
                'single_split_mapped.sam',
                shell=True, stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT)
            break
        else:
            raise ValueError(
                'header id {!r} not found in {}'.format(header_id, fastq))
    return analyze_split_reads('temp_files/single_split_mapped.sam',
                               split_length)
def analyze_all_fastq_file(fastq, split_length, length_cutoff):
    """Perform TR analysis for a fastq and output a summarized list.

    The fastq file is read record by record. Each sufficiently long read
    is split with make_temp_fastq, mapped to the rDNA reference with
    ``bwa mem`` and analyzed by analyze_split_reads to make TRs.
    For each element in set_of_TRs, the true TRs is element[1].

    Args:
        fastq (str): fastq filename
        split_length (int): split length
        length_cutoff (int): reads whose quality line is shorter than
            this are omitted
    Returns:
        set_of_TRs: each TRs obtained from analyze_split_reads is paired
        with its header as [header, TRs] and collected in this list
    """
    set_of_TRs = []
    with open(fastq) as f:
        for n, line in enumerate(f):
            position = n % 4
            if position == 0:
                header = line.strip()
            elif position == 1:
                read = line.strip()
            elif position == 3:
                # The quality line closes the record, so the record is
                # processed here.
                quality = line.strip()
                if len(quality) < length_cutoff:
                    continue
                make_temp_fastq(split_length, header, read, quality)
                # subprocess.DEVNULL avoids opening (and leaking) a new
                # os.devnull handle for every read.
                subprocess.run(
                    'bwa mem -M -x ont2d -t 5 /home/yutaro/nanopore/clive/rDNA_index/humRibosomal.fa temp_files/temp_fastq.fastq > temp_files/temp_sam.sam',
                    shell=True, stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
                TRs = analyze_split_reads('temp_files/temp_sam.sam',
                                          split_length)
                set_of_TRs.append([header, TRs])
    return set_of_TRs
def _append_fasta_record(fasta_path, name, sequence):
    """Append one FASTA record, followed by a blank line, to fasta_path."""
    with open(fasta_path, 'a') as fw:
        fw.write('>' + name + '\n')
        fw.write(sequence + '\n\n')


def find_boundaries_from_fastq(fastq, split_length, min_read_length=40000,
                               flank_length=10000):
    """Find boundary containing reads from a .fastq file.

    Every sufficiently long read is split with make_temp_fastq, mapped to
    the rDNA reference with ``bwa mem`` and passed to find_end_reads. For
    each read with a reported boundary, a window of the read next to the
    boundary is appended to boundary_seq1.fa or boundary_seq2.fa and the
    read structure is plotted to end_reads/<header>.png.

    Args:
        fastq (str): filename
        split_length (int): split_length
        min_read_length (int): reads shorter than this are skipped
        flank_length (int): maximum length of the window taken next to
            the boundary
    Returns:
        boundaries: list. NOTE(review): the original code skipped to the
        next read right after plotting, so the find_true_boundary2
        refinement that used to fill this list of (boundary coordinate,
        boundary rDNA coordinate, direction, side of non rDNA, read,
        header) could never add an entry. The list is kept for interface
        compatibility and is currently always empty.
    """
    boundaries = []
    with open(fastq) as f:
        # Passing the same iterator four times groups the file into
        # 4-line FASTQ records.
        for record in itertools.zip_longest(*[iter(f)] * 4):
            # zip_longest pads a trailing blank line / truncated record
            # with None; there is nothing to analyse in such a record.
            if None in record:
                break
            header = record[0].strip()
            read = record[1].strip()
            if len(read) < min_read_length:
                continue
            quality = record[3].strip()
            make_temp_fastq(split_length, header, read, quality)
            subprocess.run(
                'bwa mem -M -x ont2d -t 5 '
                '/home/yutaro/nanopore/clive/rDNA_index/'
                'humRibosomal.fa temp_files/temp_fastq.fastq > '
                'temp_files/temp_sam.sam',
                shell=True, stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT)
            # rDNA_coordinate=1 when using find_true_boundary2
            temp_boundary = find_end_reads('temp_files/temp_sam.sam',
                                           split_length, rDNA_coordinate=1)
            if not temp_boundary:
                continue
            read_id = header.split()[0]
            # temp_boundary is used as (boundary position in the read,
            # direction, side).
            boundary = int(temp_boundary[0])
            direction = temp_boundary[1]
            side = temp_boundary[2]
            # Clamp the start so a boundary closer than flank_length to
            # the read start does not turn into a negative (wrap-around)
            # slice index.
            upstream = read[max(0, boundary - flank_length):boundary]
            downstream = read[boundary:boundary + flank_length]
            if direction == '+':
                if side == 'right':
                    _append_fasta_record('boundary_seq1.fa', read_id,
                                         downstream)
                else:
                    # The original wrote bound_seq here without assigning
                    # it in this branch (unbound or left over from an
                    # earlier read); the window before the boundary is
                    # used, mirroring the side == 'right' case.
                    _append_fasta_record('boundary_seq2.fa', read_id,
                                         upstream)
            else:
                # NOTE(review): as in the original, reads in the other
                # direction use the window before the boundary for both
                # sides; confirm side == 'right' should not use the
                # window after the boundary instead.
                revcom = str(Seq(upstream).reverse_complement())
                if side == 'left':
                    _append_fasta_record('boundary_seq1.fa', read_id,
                                         revcom)
                else:
                    _append_fasta_record('boundary_seq2.fa', read_id,
                                         revcom)
            plot_read_structure(read_id, split_length,
                                savename='end_reads/' + read_id + '.png',
                                title=str(temp_boundary[0]))
    return boundaries
id2scores = {} with h5py.File('megalodon_results/basecalls.modified_base_scores.hdf5') as f: for read_id in rDNA_read_ids: mc_data = np.array(f['Reads'][read_id])[:, 1] ma_data = np.array(f['Reads'][read_id])[:, 0] read = id2read[read_id] cpgs = find_cpg(read) scores_c = [] for i in cpgs: scores_c.append([i, mc_data[i]]) id2scores[read_id] = scores_c split_length = 200 for read_id in rDNA_read_ids: read = id2read[read_id] search_rDNA_reads.make_temp_fastq(split_length, read_id, read, 'J' * len(read)) FNULL = open(os.devnull, 'w') subprocess.run( 'bwa mem -M -x ont2d -t 5 reference/rDNA_for_cas9_no_margin.fasta ' 'temp_files/temp_fastq.fastq > temp_files/' 'single_split_mapped.sam', shell=True, stdout=FNULL, stderr=subprocess.STDOUT) lc = plot_read_structure('test', split_length, 9400) read_len = len(read) fig = plt.figure() plt.subplots_adjust(left=0.2) ax = fig.add_subplot() x = [] y = []