Exemple #1
0
def measure_length_in_fastq(fastq, split_length, coord1, coord2):
    """Measure the distances between 2 coordinates in rDNA for a set of reads.

    Each 4-line record of the .fastq file is passed to ``make_temp_fastq``,
    mapped to the rDNA reference with ``bwa mem`` and, when the mapping passes
    ``is_rDNAs_healthy``, measured with ``measure_length_main``.
    ``coord1`` should be smaller than ``coord2``.

    An incomplete record at the end of the file (fewer than 4 lines) is
    ignored.

    Args:
        fastq (str): fastq filename
        split_length (int): split length when the read is mapped to rDNA
        coord1 (int): coordinate in rDNA
        coord2 (int): coordinate in rDNA
    Returns:
        result: list of (header id, value returned by measure_length_main),
        where header id is the first whitespace-separated token of the header
    """
    result = []
    with open(fastq) as f:
        # Group the file into 4-line FASTQ records.
        for data in itertools.zip_longest(*[iter(f)] * 4):
            if None in data:
                # zip_longest pads a truncated trailing record with None;
                # there is nothing usable in it, so stop here.
                break
            header = data[0].strip()
            read = data[1].strip()
            quality = data[3].strip()
            make_temp_fastq(split_length, header, read, quality)
            # NOTE(review): the other callers in this file use paths under
            # temp_files/ — confirm that make_temp_fastq really writes
            # temp_fastq.fastq into the working directory here.
            subprocess.run(
                'bwa mem -M -x ont2d -t 5 /home/yutaro/nanopore/'
                'clive/rDNA_index/humRibosomal.fa temp_fastq.fastq'
                ' > temp_sam.sam',
                shell=True,
                # bwa's own stdout is redirected to the .sam file by the
                # shell; everything else (stderr) is discarded.
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT)
            with open('temp_sam.sam') as samf:
                # Drop the first two lines (presumably the SAM header lines
                # — confirm against the bwa output).
                samdata = samf.readlines()[2:]
            if is_rDNAs_healthy(samdata):
                dist = measure_length_main(header, read, coord1, coord2)
                result.append((header.split()[0], dist))
    return result
Exemple #2
0
def analyze_fastq_by_header(fastq, header_id, split_length):
    """Find a read from a fastq file and analyze its structure.

    The file is scanned for the first line containing ``header_id``. That line
    and the following three lines are taken as one FASTQ record, passed to
    ``make_temp_fastq`` and mapped to the rDNA reference with ``bwa mem``. The
    resulting SAM file is then analyzed with ``analyze_split_reads``.

    NOTE(review): if no line contains ``header_id``, no mapping is run and
    ``analyze_split_reads`` is applied to whatever
    ``temp_files/single_split_mapped.sam`` already contains.

    Args:
        fastq (str): filename
        header_id (str): header id
        split_length (int): split length
    Returns:
        TRs: value returned by analyze_split_reads for the mapped read
    """
    with open(fastq) as f:
        for line in f:
            if header_id in line:
                header = line
                read = f.readline()
                f.readline()  # skip the separator line of the record
                quality = f.readline()
                make_temp_fastq(split_length, header, read, quality)
                subprocess.run(
                    'bwa mem -M -x ont2d -t 5 /home/yutaro/nanopore/'
                    'clive/rDNA_index/humRibosomal.fa '
                    'temp_files/temp_fastq.fastq > temp_files/'
                    'single_split_mapped.sam',
                    shell=True,
                    # subprocess.DEVNULL avoids opening (and leaking) a
                    # handle on os.devnull for every call.
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
                break
        TRs = analyze_split_reads('temp_files/single_split_mapped.sam',
                                  split_length)
        return TRs
Exemple #3
0
def analyze_all_fastq_file(fastq, split_length, length_cutoff):
    """Perform TR analysis for a fastq and output a summarized list.

    Every read of the fastq file is passed to ``make_temp_fastq``, mapped to
    rDNA with ``bwa mem``, and the mapped result is analyzed by
    ``analyze_split_reads`` to make TRs.

    For each element in set_of_TRs, true TRs is element[1]!

    Args:
        fastq (str): fastq filename
        split_length (int): split length
        length_cutoff (int): reads shorter than this are omitted
    Returns:
        set_of_TRs: each TRs obtained from analyze_split_reads are paired with
        header as [header, TRs] and summarized as set_of_TRs
    """
    set_of_TRs = []
    with open(fastq) as f:
        for n, line in enumerate(f):
            # Position of this line inside its 4-line FASTQ record.
            field = n % 4
            if field == 0:
                header = line.strip()
            elif field == 1:
                read = line.strip()
            elif field == 3:
                quality = line.strip()
                # The length of the quality string is used as the read length.
                if len(quality) < length_cutoff:
                    continue
                make_temp_fastq(split_length, header, read, quality)
                subprocess.run(
                    'bwa mem -M -x ont2d -t 5 '
                    '/home/yutaro/nanopore/clive/rDNA_index/humRibosomal.fa '
                    'temp_files/temp_fastq.fastq > temp_files/temp_sam.sam',
                    shell=True,
                    # subprocess.DEVNULL avoids opening a new, never-closed
                    # handle on os.devnull for every read.
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
                TRs = analyze_split_reads('temp_files/temp_sam.sam',
                                          split_length)
                set_of_TRs.append([header, TRs])
    return set_of_TRs
Exemple #4
0
def find_boundaries_from_fastq(fastq, split_length):
    """Find boundary containing reads from a .fastq file.

    Reads shorter than 40000 bases are skipped. Every other read is passed to
    ``make_temp_fastq``, mapped to the rDNA reference with ``bwa mem`` and
    checked with ``find_end_reads``. For each read with a boundary, up to
    10000 bases next to the boundary are appended to ``boundary_seq1.fa`` or
    ``boundary_seq2.fa`` (reverse-complemented when the direction is not
    '+'), and the read structure is plotted into ``end_reads/``.

    An incomplete record at the end of the file (fewer than 4 lines) is
    ignored.

    NOTE(review): the ``continue`` after ``plot_read_structure`` makes the
    ``find_true_boundary2`` step unreachable, so ``boundaries`` is currently
    always returned empty. The runtime behaviour is kept as found — confirm
    whether the ``continue`` is a leftover from debugging.

    Args:
        fastq (str): filename
        split_length (int): split_length
    Returns:
        boundaries: list of (boundary coordinate, boundary rDNA coordinate,
                             direction, side of non rDNA, read, header)
    """
    boundaries = []
    with open(fastq) as f:
        # Group the file into 4-line FASTQ records.
        for each_fastq in itertools.zip_longest(*[iter(f)] * 4):
            if None in each_fastq:
                # zip_longest pads a truncated trailing record with None.
                break
            header = each_fastq[0].strip()
            read = each_fastq[1].strip()
            if len(read) < 40000:
                continue
            quality = each_fastq[3].strip()
            make_temp_fastq(split_length, header, read, quality)
            subprocess.run(
                'bwa mem -M -x ont2d -t 5 '
                '/home/yutaro/nanopore/clive/rDNA_index/'
                'humRibosomal.fa temp_files/temp_fastq.fastq > '
                'temp_files/temp_sam.sam',
                shell=True,
                # bwa's stdout is redirected to the .sam file by the shell;
                # stderr is discarded.
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT)
            # rDNA_coordinate=1 when using find_true_boundary2
            temp_boundary = find_end_reads('temp_files/temp_sam.sam',
                                           split_length,
                                           rDNA_coordinate=1)
            if not temp_boundary:
                continue

            header = header.split()[0]
            boundary = int(temp_boundary[0])
            direction = temp_boundary[1]
            side = temp_boundary[2]
            if direction == '+':
                # Computed for both sides: previously it was only assigned
                # for side == 'right', so the other side wrote an undefined
                # or stale sequence.
                bound_seq = read[boundary:boundary + 10000]
                if side == 'right':
                    out_name = 'boundary_seq1.fa'
                else:
                    out_name = 'boundary_seq2.fa'
            else:
                # Clamp the start at 0: a negative start would wrap around
                # to the end of the read and produce an empty slice.
                upstream = read[max(0, boundary - 10000):boundary]
                bound_seq = str(Seq(upstream).reverse_complement())
                if side == 'left':
                    out_name = 'boundary_seq1.fa'
                else:
                    out_name = 'boundary_seq2.fa'
            with open(out_name, 'a') as fw:
                fw.write('>' + header + '\n')
                fw.write(bound_seq + '\n\n')

            plot_read_structure(header,
                                split_length,
                                savename='end_reads/' + header + '.png',
                                title=str(temp_boundary[0]))
            # NOTE(review): everything below this continue is unreachable.
            continue
            true_boundary = find_true_boundary2(header, read, quality,
                                                temp_boundary)
            if true_boundary:
                boundaries.append(
                    (true_boundary[0], true_boundary[1], temp_boundary[1],
                     temp_boundary[2], read, header.split()[0]))
    return boundaries
Exemple #5
0
# Map each rDNA read id to a list of [position, score] pairs, one pair per
# position returned by find_cpg for that read.
id2scores = {}
# The file is only read from, so open it explicitly read-only.
with h5py.File('megalodon_results/basecalls.modified_base_scores.hdf5',
               'r') as f:
    for read_id in rDNA_read_ids:
        # Load the per-read score table once and take both columns from it.
        read_scores = np.array(f['Reads'][read_id])
        # Column 1 is used for the CpG scores below; column 0 is kept as
        # ma_data (presumably a second modification type — confirm).
        mc_data = read_scores[:, 1]
        ma_data = read_scores[:, 0]
        read = id2read[read_id]
        cpgs = find_cpg(read)
        scores_c = [[i, mc_data[i]] for i in cpgs]
        id2scores[read_id] = scores_c

split_length = 200
for read_id in rDNA_read_ids:
    read = id2read[read_id]
    search_rDNA_reads.make_temp_fastq(split_length, read_id, read,
                                      'J' * len(read))
    FNULL = open(os.devnull, 'w')
    subprocess.run(
        'bwa mem -M -x ont2d -t 5 reference/rDNA_for_cas9_no_margin.fasta '
        'temp_files/temp_fastq.fastq > temp_files/'
        'single_split_mapped.sam',
        shell=True,
        stdout=FNULL,
        stderr=subprocess.STDOUT)
    lc = plot_read_structure('test', split_length, 9400)
    read_len = len(read)
    fig = plt.figure()
    plt.subplots_adjust(left=0.2)
    ax = fig.add_subplot()
    x = []
    y = []