Example #1
def primer_len_filter(path, sample):
    sequence = dinopy.FastqReader(path)
    assembled = dinopy.FastqWriter(
        path.rsplit("_", 1)[0] + "_assembled.fastq")
    filt_out = dinopy.FastqWriter(
        path.rsplit("_", 1)[0] + "_filtered_out.fastq")
    assembled_counter = 0
    filt_out_counter = 0
    assembled.open()
    filt_out.open()
    for read in sequence.reads(quality_values=True):
        name = read.name.decode()
        seq = check_for_match(read.sequence.decode(), sample)
        if seq[0] and snakemake.params.maxlen >= len(
                seq[1]) >= snakemake.params.minlen:
            assembled.write(seq[1].encode(),
                            name.split(" ")[0].encode(), read.quality)
            assembled_counter += 1
        else:
            filt_out.write(read.sequence,
                           name.split(" ")[0].encode(), read.quality)
            filt_out_counter += 1
    logging.info("{}: {} sequences were kept, "
                 "{} sequences were filtered out".format(
                     sample, assembled_counter, filt_out_counter))
    assembled.close()
    filt_out.close()
Example #2
def get_dedup_coverages(fq_file):

    singleton_pattern = r"at_locus:'(\d+)'"
    consensus_pattern = r"Locus_(\d+)"
    locus_coverage_counter = Counter()

    fqr = dp.FastqReader(fq_file)
    for r in fqr.reads():
        name = r.name.decode()
        singleton = re.search(singleton_pattern, name)
        consensus = re.findall(consensus_pattern, name)
        if singleton is not None:
            locus = singleton.groups()[0]
            locus_coverage_counter[int(locus)] += 1
        if consensus:
            merged_loci = [int(s) for s in consensus]
            # print(merged_loci)
            if len(set(merged_loci)) > 1:
                raise ValueError("Overmerge!")
            else:
                locus_coverage_counter[list(set(merged_loci))[0]] += 1
        if singleton is None and not consensus:
            raise ValueError("BAD NAME. No matches")
        if singleton is not None and consensus:
            raise ValueError("BAD NAME. Two matches")

    loci, coverage = zip(*locus_coverage_counter.items())
    df = pd.DataFrame({"locus": loci, "after_dedup": coverage})
    return df
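A minimal usage sketch for the function above; the imports and the input path are assumptions added for illustration and are not part of the original snippet:

import re
from collections import Counter

import dinopy as dp
import pandas as pd

# hypothetical deduplicated FASTQ whose read names carry either an
# "at_locus:'<n>'" tag (singletons) or a "Locus_<n>" tag (consensus reads)
coverage_df = get_dedup_coverages("deduplicated_reads.fastq")
print(coverage_df.sort_values("locus").head())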
Example #3
def split_files(p5_file, p7_file, force):
    fqr_fw = dp.FastqReader(p5_file)
    fqr_rev = dp.FastqReader(p7_file)
    output_files = {}

    for (fw, rev) in zip(fqr_fw.reads(), fqr_rev.reads()):
        # get the name line of the read (forward or reverse doesn't matter)
        nl = rev.name
        items = nl.split()

        # This uses the perfect p7 barcode from the annotation.
        # To use the simulated barcode, which can contain sequencing errors,
        # use items[1].split(b":")[-1]
        if items[5].startswith(b"p7_bc"):
            # extract the barcode sequence
            p7_bc = items[5].split(b":")[1].strip(b"'")

        # check if a file writer for the barcode is already available
        if p7_bc not in output_files:
            filename_fw = f"reads_{p7_bc.decode()}_1.fq.gz"
            filename_rev = f"reads_{p7_bc.decode()}_2.fq.gz"
            fqw_fw = dp.FastqWriter(filename_fw, force_overwrite=force)
            fqw_rev = dp.FastqWriter(filename_rev, force_overwrite=force)
            fqw_fw.open()
            fqw_rev.open()
            print(f"\nFound new barcode: {p7_bc.decode()}")
            print(f"Writing to:")
            print(f"  -> {filename_fw}")
            print(f"  -> {filename_rev}")
            output_files[p7_bc] = (fqw_fw, fqw_rev)
        else:
            fqw_fw, fqw_rev = output_files[p7_bc]

        # write reads back to the writer with the chosen barcode
        fqw_fw.write(*fw)
        fqw_rev.write(*rev)
Example #4
def fasta2dazzdb(args: argparse.Namespace):
    """Fix the FASTA/FASTQ header/id's to a DAZZ_DB compatible format such that
    these reads can be imported."""

    file_format = args.format
    if not file_format:
        if args.input != sys.stdin:
            filename = args.input.name
            file_ext = filename[filename.rfind('.')+1:]

            file_format = 'fastq' if file_ext in ('fq', 'fastq') else 'fasta'

    if not file_format:
        logger.error("Could not determine file format. Please specify using "
                     "the -f option.")
        return

    if file_format == 'fastq':
        seq_iter = iter(dinopy.FastqReader(args.input).reads(
            quality_values=False))
    else:
        seq_iter = iter(dinopy.FastaReader(args.input).reads(read_names=True))

    if args.input == sys.stdin:
        name = args.name if args.name else random_string(10)
    else:
        name = os.path.basename(args.input.name)

    moviename = daligner.generate_moviename_hash(name)
    name_mapping = {}
    seq_iter = iter(daligner.fix_header(seq_iter, moviename, name_mapping))

    logger.info("Converting FASTA/FASTQ entries...")
    with dinopy.FastaWriter(args.output, force_overwrite=True) as fw:
        fw.write_entries(seq_iter)

    if args.translations:
        logger.info("Writing name mappings to file...")
        json.dump(name_mapping, args.translations)

    logger.info("Done.")
Example #5
def parse_fq_file(fq_file):

    pcr_counts = defaultdict(PCRRecord)
    fqr = dp.FastqReader(fq_file)

    for read in fqr.reads():

        line_info = parse_info_line(read.name.decode())
        try:
            locus = line_info["at_locus"]
        except:
            print(read, line_info)
            raise

        # count the number of real and PCR reads for this locus
        # tested against a solution with grep + wc -l
        # grep  ^@ data/ddRAGEdataset_ATCACG_1.fastq | grep "at_locus:'1'" | grep -v PCR | wc -l
        if line_info["pcr_copy"]:
            pcr_counts[locus].pcr += 1
        else:
            pcr_counts[locus].real += 1

    return pcr_counts
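PCRRecord is referenced but not defined in this snippet; a minimal compatible sketch (an assumption, not the original class) could look like this:

from dataclasses import dataclass

@dataclass
class PCRRecord:
    # per-locus read counts: reads from the original template vs. PCR copies
    real: int = 0
    pcr: int = 0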
Example #6
def demultiplexer(file_path_list):
    samples = []
    output_filepaths = []
    for sample in primertable.keys():
        samples.append(sample + '_R1')
        samples.append(sample + '_R2')
        output_filepaths.append('demultiplexed/' + sample + '_R1.fastq.gz')
        output_filepaths.append('demultiplexed/' + sample + '_R2.fastq.gz')

    # Create a dict of writers.
    writers = {
        name: dinopy.FastqWriter(path)
        for name, path in zip(samples, output_filepaths)
    }

    # Open all writers.
    for writer in writers.values():
        writer.open()

    # Start writing.
    for file_path in file_path_list:
        sequence = dinopy.FastqReader(file_path)
        for read in sequence.reads(quality_values=True):
            for sample in primertable.keys():
                if check_for_match_fwd_demulti(read.sequence.decode(), sample):
                    writers[sample + '_R1'].write(read.sequence, read.name,
                                                  read.quality)
                elif check_for_match_rev_demulti(read.sequence.decode(),
                                                 sample):
                    writers[sample + '_R2'].write(read.sequence, read.name,
                                                  read.quality)
                else:
                    pass

    # Close all writers.
    for writer in writers.values():
        writer.close()
Example #7
"""Merge the p5 and p7 reads from the input files into one read,
joined by join bases.
"""
import sys
import dinopy

# redirect stderr to logfile
sys.stderr = open(snakemake.log[0], "w")

# get and open input files from the calling snakemake rule's input directive
p5_file, p7_file, p5_length, p7_length = snakemake.input
print(f"Opening files:\n    {p5_file}\n    {p7_file}", file=sys.stderr)
print(f"Writing to:\n    {snakemake.output.merged}", file=sys.stderr)
p5_reader = dinopy.FastqReader(p5_file)
p7_reader = dinopy.FastqReader(p7_file)

with open(p5_length, "r") as p5_len_file:
    p5_len = int(p5_len_file.readline().strip())

with open(p7_length, "r") as p7_len_file:
    p7_len = int(p7_len_file.readline().strip())

# check if the quality value for the join sequence is valid
join_quality = snakemake.params.join_quality
if len(join_quality) != 1:
    print(
        "Please specify a single Sanger Phred+33 quality value for "
        "join_quality.",
        file=sys.stderr)
    sys.exit(1)
else:
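    # The snippet is truncated here; what follows is a sketch only, not the
    # original continuation. It assumes a hypothetical snakemake.params.join_seq
    # parameter holding the join bases mentioned in the module docstring.
    join_seq = snakemake.params.join_seq.encode()
    join_qual = (join_quality * len(join_seq)).encode()
    with dinopy.FastqWriter(snakemake.output.merged,
                            force_overwrite=True) as writer:
        for p5, p7 in zip(p5_reader.reads(quality_values=True),
                          p7_reader.reads(quality_values=True)):
            # join the length-trimmed p5 and p7 reads with the join bases
            merged_seq = p5.sequence[:p5_len] + join_seq + p7.sequence[:p7_len]
            merged_qual = p5.quality[:p5_len] + join_qual + p7.quality[:p7_len]
            writer.write(merged_seq, p5.name, merged_qual)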
Example #8
"""
"""
import sys
import dinopy
from shutil import copyfile

# redirect stderr to logfile
sys.stderr = open(snakemake.log[0], "w")

p7_reader = dinopy.FastqReader(snakemake.input.fq2)
umi_len = snakemake.params.umi["len"]

# copy p5 file without touching it
copyfile(snakemake.input.fq1, snakemake.output.fq1)

# trim the first UMI-len bases from the p7 read
with dinopy.FastqWriter(snakemake.output.fq2, force_overwrite=True) as writer:
    for seq, name, qual in p7_reader.reads(read_names=True,
                                           quality_values=True):
        writer.write(seq[umi_len:], name, qual[umi_len:])
Example #9
import dinopy
import sys

in_fastq = dinopy.FastqReader(sys.stdin)
with dinopy.FastqWriter(sys.stdout) as out_fastq:
    out_fastq.write_reads(
        (read.sequence, read.name.split(b" ")[0], read.quality)
        for read in in_fastq.reads())
Example #10
                        filemode="a+",
                        format="%(asctime)-15s %(levelname)-8s %(message)s")
    logging.info(run_id)

WORKING_DIR = os.path.abspath(pathname)
logging.info("WORKING_DIR")
logging.info(WORKING_DIR)
logging.info("input: SRA fastq file")
logging.info(input_id)
logging.info("output: trimmed file")
logging.info(output_file)

input_path = input_id + ".fastq"

out_path_cbsu = os.path.join(WORKING_DIR, output_file)
fqr = dinopy.FastqReader(input_path)

good_reads = OrderedDict()
reads_length = []
pass_quality = 0
pass_length = 0
has_adapter = 0

sample2inadapter = {
    "SRR2078285": "GATCAGCAG",
    "SRR2078286": "ACACAGCAG",
    "SRR2078287": "ACTCAGCAG",
    "SRR2078288": "ACGCAGCAG",
    "SRR2078289": "AGACAGCAG",
    "SRR2078290": "ATCCAGCAG",
    "SRR2078291": "ATGCAGCAG",
Example #11
def read_sorter(primertable):
    if not os.path.exists('demultiplexed/not_sorted'):
        os.mkdir('demultiplexed/not_sorted')
    samples = []
    output_filepaths = []
    for sample in primertable.keys():
        samples.append(sample + snakemake.params.name_ext[:-1] + '1')
        samples.append(sample + snakemake.params.name_ext[:-1] + '2')
        samples.append(sample + '_not_sorted')
        output_filepaths.append('demultiplexed/' + sample + '_R1.fastq.gz')
        output_filepaths.append('demultiplexed/' + sample + '_R2.fastq.gz')
        output_filepaths.append('demultiplexed/not_sorted/' + sample +
                                '_not_sorted.fastq.gz')

    # Create a dict of writers.
    writers = {
        name: dinopy.FastqWriter(path)
        for name, path in zip(samples, output_filepaths)
    }

    # Open all writers.
    for writer in writers.values():
        writer.open()

    # Start writing.
    for sample in primertable.keys():
        fwd = dinopy.FastqReader('../' + data_folder + '/' + sample +
                                 str(snakemake.params.name_ext)[:-1] +
                                 '1.fastq.gz')
        rev = dinopy.FastqReader('../' + data_folder + '/' + sample +
                                 str(snakemake.params.name_ext)[:-1] +
                                 '2.fastq.gz')
        for read_f, read_r in zip(fwd.reads(quality_values=True),
                                  rev.reads(quality_values=True)):
            if check_for_match_sort_fwd(
                    read_f.sequence.decode(),
                    sample.split('/')[-1]) and check_for_match_sort_rev(
                        read_r.sequence.decode(),
                        sample.split('/')[-1]):
                writers[sample + '_R1'].write(read_f.sequence, read_f.name,
                                              read_f.quality)
                writers[sample + '_R2'].write(read_r.sequence, read_r.name,
                                              read_r.quality)
            elif check_for_match_sort_rev(
                    read_f.sequence.decode(),
                    sample.split('/')[-1]) and check_for_match_sort_fwd(
                        read_r.sequence.decode(),
                        sample.split('/')[-1]):
                writers[sample + '_R2'].write(read_f.sequence, read_f.name,
                                              read_f.quality)
                writers[sample + '_R1'].write(read_r.sequence, read_r.name,
                                              read_r.quality)
            else:
                writers[sample + '_not_sorted'].write(read_f.sequence,
                                                      read_f.name,
                                                      read_f.quality)
                writers[sample + '_not_sorted'].write(read_r.sequence,
                                                      read_r.name,
                                                      read_r.quality)

    # Close all writers.
    for writer in writers.values():
        writer.close()
Example #12
import csv
import subprocess

import pysam
import dinopy
import numpy as np

log = open(snakemake.log[0], "w")


def parse_clusters(stdout):
    for consensus, size, seqids in csv.reader(stdout, delimiter="\t"):
        # parse seqids and subtract 1 because starcode provides 1-based indices
        yield np.fromiter(map(int, seqids.split(",")), dtype=int) - 1


# load dbr sequences
dbrs = np.array([seq[:snakemake.params.dbr_len].decode()
                 for seq in dinopy.FastqReader(snakemake.input.fq2)
                                  .reads(read_names=False, quality_values=False)])

clusters = dict()

# cluster by read sequences
with subprocess.Popen(f"starcode --dist {snakemake.params.seq_dist} --seq-id "
                      f"-1 <(gzip -d -c {snakemake.input.fq1}) -2 <(seqtk trimfq "
                      f"-b {snakemake.params.dbr_len} {snakemake.input.fq2})",
                      shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      executable="bash", universal_newlines=True) as seqclust:
    cluster_id = 0
    # iterate over clusters
    for seqids in parse_clusters(seqclust.stdout):
        # get DBRs of clustered sequences
        cluster_dbrs = dbrs[seqids]
        # cluster by DBRs