Example #1
def write_merged_to_stdout():
    # We assume for now that all records pair up; in general this needs
    # to be verified. in_fname1 and in_fname2 are module-level globals.
    log('About to merge ', in_fname1, in_fname2)
    with gene_lib.open_compressed(in_fname1, 'rt') as in_f1_handle:
        records1 = SeqIO.parse(in_f1_handle, format="fastq")
        with gene_lib.open_compressed(in_fname2, 'rt') as in_f2_handle:
            records2 = SeqIO.parse(in_f2_handle, format="fastq")
            merged = merged_paired_ends(records1, records2)
            SeqIO.write(merged, handle=sys.stdout, format='fastq')
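# A minimal usage sketch (hypothetical file names; in_fname1/in_fname2 are
# module-level globals, e.g. unpacked from sys.argv):
#
#   in_fname1, in_fname2 = 'sample_R1.fastq.gz', 'sample_R2.fastq.gz'
#   write_merged_to_stdout()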
Example #2
def fastq_zst_records(filename):
    # https://github.com/indygreg/python-zstandard - pip3 install zstandard
    import zstandard as zstd
    log(f"Reading {filename}...")
    with open(filename, 'rb') as fastq_zst_handle:
        fastq_handle = zstd.ZstdDecompressor().stream_reader(fastq_zst_handle)
        # TextIOWrapper adds text-mode line reading (.readline(),
        # iteration over lines) on top of the binary stream.
        fastq_text = io.TextIOWrapper(fastq_handle, encoding='ascii')
        yield from SeqIO.parse(fastq_text, "fastq")
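# A minimal usage sketch ("reads.fastq.zst" is a hypothetical file name):
#
#   for record in fastq_zst_records("reads.fastq.zst"):
#       print(record.id, len(record.seq))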
Example #3
def write_fastq_to_stdout():
    log('About to transform ', in_fname)
    with gene_lib.open_compressed(in_fname, 'rt') as in_f1_handle:
        records = SeqIO.parse(in_f1_handle, format="fasta")

        for rec in records:
            string = str(rec.seq)
            # FASTA carries no per-base qualities, so fake a constant
            # placeholder quality of 30 for the FASTQ output.
            res_seq = SeqRecord(Seq(string),
                                id=rec.id,
                                name=rec.name,
                                description=rec.description,
                                letter_annotations={
                                    "phred_quality": [30] * len(string)
                                })
            SeqIO.write(res_seq, handle=sys.stdout, format='fastq')
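# Usage sketch (hypothetical name; in_fname is a module-level global):
#
#   in_fname = 'transcripts.fasta.gz'
#   write_fastq_to_stdout()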
Example #4
def split_recompress(in_fname, out_basename, skip=()):
    """
    skip - indices of parts to skip (their input still needs to be
    decompressed, but they are not recompressed or written)
    """
    with gzip.open(in_fname, 'rb') as in_fastq:
        for i, chunk_iter in enumerate(chunks(in_fastq, chunk_size=100_000_000)):
            out_fname = f'{out_basename}.part{i}e8.fastq.gz'
            t0 = time.time()
            if i in skip:
                log('Skipping', out_fname)
                for _ in chunk_iter:  # consume the iterator, discarding values
                    pass
            else:
                log('Writing', out_fname)
                with gzip.open(out_fname + '.tmp', 'wb', compresslevel=2) as out_fastq_gz:
                    out_fastq_gz.writelines(chunk_iter)
                os.rename(out_fname + '.tmp', out_fname)
            t1 = time.time()
            log(t1 - t0, 'sec.')
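# Usage sketch (hypothetical paths; parts 0 and 1 are decompressed but
# not rewritten):
#
#   split_recompress('sample_R1_001.fastq.gz', 'sample_R1_001', skip=[0, 1])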
Example #5
def merged_paired_ends(records1, records2):
    # common_req, init_err, tot_err, step and padding are module-level
    # parameters of this script.
    tot_good = 0   # pairs where the end of read 1 matched read 2
    tot_great = 0  # pairs where the whole overlap matched as well
    tot = 0
    for rec1, rec2 in zip(records1, records2):
        tot += 1
        str1 = str(rec1.seq)
        str2 = str(rec2.seq.reverse_complement())
        # First pass: fuzzy-match only the last common_req bases of read 1
        # against read 2; we expect small errors here.
        end1 = str1[-common_req:]
        pattern = tre.compile(end1, tre.EXTENDED)
        match = pattern.search(str2, tre.Fuzzyness(maxerr=init_err))
        if match:
            tot_good += 1
            match_loc = match.groups()[0][0]
            to_search_len = match_loc + common_req
            fuzzyness = max(tot_err, ceil(0.1 * to_search_len))
            # Second pass: fuzzy-match the whole overlapping prefix of
            # read 1, allowing roughly 10% errors.
            pattern = tre.compile(str1[-to_search_len:], tre.EXTENDED)
            match_tot = pattern.search(str2, tre.Fuzzyness(maxerr=fuzzyness))
            if match_tot:
                tot_great += 1
                # An arbitrary decision: take the common string from r2.
                res_str = str1[:-to_search_len] + str2
                # TODO: preserve qualities
                res_seq = SeqRecord(Seq(res_str),
                                    id=rec1.id,
                                    name=rec1.name,
                                    description=rec1.description,
                                    letter_annotations={
                                        "phred_quality": [30] * len(res_str)
                                    })
                if tot_great % step == 0:
                    log('nicely matched ', str1, '\n', str2, to_search_len,
                        match_tot.group(0), match.group(0), match_tot.cost,
                        match.cost)
                yield res_seq
                continue

        # No reliable overlap: join the two reads with N padding in between.
        res_str = str1 + ('N' * padding) + str2
        res_seq = SeqRecord(Seq(res_str),
                            id=rec1.id,
                            name=rec1.name,
                            description=rec1.description,
                            letter_annotations={
                                "phred_quality": [30] * len(res_str)
                            })
        if tot % step == 0:
            log(tot, tot_good, tot_great)
        yield res_seq
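# A minimal usage sketch (hypothetical values for the module-level
# parameters assumed above; f1 and f2 are already-open FASTQ handles):
#
#   common_req, init_err, tot_err, step, padding = 20, 2, 4, 10000, 10
#   merged = merged_paired_ends(SeqIO.parse(f1, "fastq"),
#                               SeqIO.parse(f2, "fastq"))
#   SeqIO.write(merged, sys.stdout, "fastq")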
Example #6
def filter_potential_sines(in_fname, sine_string, sine_header, maxerr,
                           reverse_complement):
    """
    Finds candidate SINEs: reads that fuzzily match the first sine_header
    bases of sine_string within maxerr errors, optionally after taking the
    reverse complement of each read. (The opening lines of this example
    were truncated; the signature is reconstructed from the call below.)
    """
    with gene_lib.open_any(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        pattern = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)

        for rec in records:
            if reverse_complement:
                cur_seq = rec.seq.reverse_complement()
            else:
                cur_seq = rec.seq

            match = pattern.search(str(cur_seq), fuzziness)
            if match:
                # match.groups() would give the match location as a tuple
                # of tuples, e.g. ((2, 78),); we only need a yes/no here.
                SeqIO.write(rec, sys.stdout, 'fastq')


# Writes to stdout, uncompressed
[sine_fname, header_len, max_error, reverse_complement,
 merged_input_fname] = sys.argv[1:]
if reverse_complement not in {"forward", "rc"}:
    raise ValueError('reverse_complement arg must be "forward" or "rc"')

log(f"About to screen {merged_input_fname} ({reverse_complement}) for the "
    f"first {header_len} bases of {sine_fname}, up to {max_error} errors")
sine = gene_lib.get_sine_forward(sine_fname)  # e.g. "B1.fasta"
filter_potential_sines(in_fname=merged_input_fname,
                       sine_string=sine,
                       sine_header=int(header_len),
                       maxerr=int(max_error),
                       reverse_complement=(reverse_complement == "rc"))
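# Command-line sketch (hypothetical script and file names), matching the
# argv unpacking above:
#
#   python screen_sines.py B1.fasta 67 19 rc merged.fastq > candidates.fastq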
Example #7
import sys

import tre
from Bio import SeqIO

import gene_lib
from gene_lib import log
from gene_lib import get_sine_forward


def filter_potential_sines(in_fname, sine_string, sine_header=67, maxerr=19):
    """
    Finds candidate SINEs: reads that fuzzily match the first sine_header
    bases of sine_string within maxerr errors.
    To be used for preliminary screening (input for later steps).
    """
    with gene_lib.open_any(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        pattern = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)

        for rec in records:
            match = pattern.search(str(rec.seq), fuzziness)
            if match:
                # match.groups() would give the match location as a tuple
                # of tuples, e.g. ((2, 78),); we only need a yes/no here.
                SeqIO.write(rec, sys.stdout, 'fastq')


# Writes to stdout, uncompressed
[sine_fname, merged_input_fname] = sys.argv[1:]
log('About to screen', merged_input_fname, 'for', sine_fname)
sine = gene_lib.get_sine_forward(sine_fname)  # e.g. "B1.fasta"
filter_potential_sines(merged_input_fname, sine)
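# Command-line sketch (hypothetical script and file names):
#
#   python filter_sines.py B1.fasta merged.fastq > candidates.fastq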
Example #8
def chunks(shared_input_iterator, chunk_size):
    # Reconstructed head (this example was truncated here): yield the
    # input as successive iterators of up to chunk_size items each.
    shared_input_iterator = iter(shared_input_iterator)
    for first_item in shared_input_iterator:
        yield itertools.chain((first_item,),
                              itertools.islice(shared_input_iterator, chunk_size - 1))

def split_recompress(in_fname, out_basename, skip=()):
    """
    skip - indices of parts to skip (their input still needs to be
    decompressed, but they are not recompressed or written)
    """
    with gzip.open(in_fname, 'rb') as in_fastq:
        for i, chunk_iter in enumerate(chunks(in_fastq, chunk_size=100_000_000)):
            out_fname = f'{out_basename}.part{i}e8.fastq.gz'
            t0 = time.time()
            if i in skip:
                log('Skipping', out_fname)
                for _ in chunk_iter:  # consume the iterator, discarding values
                    pass
            else:
                log('Writing', out_fname)
                with gzip.open(out_fname + '.tmp', 'wb', compresslevel=2) as out_fastq_gz:
                    out_fastq_gz.writelines(chunk_iter)
                os.rename(out_fname + '.tmp', out_fname)
            t1 = time.time()
            log(t1 - t0, 'sec.')

#split_recompress('Old-lung/old_lung_R1_001.fastq.gz', 'Old-lung/old_lung_R1_001')

if __name__ == '__main__':
    [in_fname, out_basename, *skip] = sys.argv[1:]
    skip = [int(i) for i in skip]
    split_recompress(in_fname, out_basename, skip)
    log('REMOVING', in_fname)
    os.remove(in_fname)  # may fail for /dev/stdin, /dev/fd/... etc. but that's OK
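# Command-line sketch (hypothetical names; parts 0 and 1 are skipped, and
# the input file is removed afterwards):
#
#   python split_recompress.py sample.fastq.gz sample 0 1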