def filter_reads(filter_sequences, min_match_length, input_R1_fp, input_R2_fp,
                 output_R1_fp, output_R2_fp):

    input_R1 = pysam.FastxFile(input_R1_fp)
    input_R2 = pysam.FastxFile(input_R2_fp)

    output_R1 = gzip.open(output_R1_fp, 'w')
    output_R2 = gzip.open(output_R2_fp, 'w')

    reads_filtered = 0
    reads_kept = 0
    for R1, R2 in zip(input_R1, input_R2):

        discard = False
        for f in filter_sequences:
            if (contains_adapter(R1.sequence, f, min_match_length)
                    or contains_adapter(R2.sequence, f, min_match_length)):
                discard = True
                break

        if discard:
            reads_filtered += 1
            continue
        else:
            reads_kept += 1

            fq1 = str(R1) + "\n"
            fq2 = str(R2) + "\n"
            output_R1.write(fq1.encode('utf-8'))
            output_R2.write(fq2.encode('utf-8'))

    print(f"reads kept: {reads_kept}, reads filtered: {reads_filtered}")
Example #2
def make_templ_primer_dic(primerfile,templFasta, type='rna'):
    '''
    Makes dictionary of template-primer pairs, removing pairs if the phred score is low
    '''
    templDic={}
    if type=='rna':
        with ps.FastxFile(templFasta) as templ:
            for entry in templ:
                seq, name = entry.sequence.upper(), entry.name
                templDic[name]=[seq]
        with ps.AlignmentFile(primerfile,"rb") as bam:
            for r in bam.fetch(until_eof=True):
                if r.flag==0:
                    if any(q<32 for q in r.query_qualities[:6]):
                        # Drop template-primer pairs whose primer has a low-quality base.
                        templDic.pop(r.qname, None)
                    else:
                        templDic[r.qname].append(r.seq[0:6])
    if type=='bs':
        with ps.FastxFile(templFasta) as templ:
            for entry in templ:
                seq, name = entry.sequence.upper(), entry.name.split('_')[0]
                templDic[name]=[seq]
        with ps.FastxFile(primerfile) as f:
            for entry in f:
                if entry.name in templDic:
                    templDic[entry.name].append(entry.sequence[0:6])
    return templDic
Example #3
def fasta_fwd_rev_to_columns(file1, file2=None, output_filename=None):
    """From 2 FASTA files (reverse and forward) adapters, returns 2-columns file

    This is useful for some tools related to adapter removal that takes as input
    this kind of format

    :param str filename1: FASTA format
    :param stsr filename2: FASTA format (optional)

    The files must have a one-to-one mapping
    """
    f1 = pysam.FastxFile(file1)
    if output_filename is not None:
        fout = open(output_filename, "w")

    if file2:
        f2 = pysam.FastxFile(file2)
        for read1, read2 in zip(f1, f2):
            txt = "%s %s" % (read1.sequence, read2.sequence)
            if output_filename is None:
                print(txt)
            else:
                fout.write(txt+"\n")
    else:
        for read1 in f1:
            txt = "%s" % read1.sequence
            if output_filename is None:
                print(read1.sequence)
            else:
                fout.write(txt+"\n")
    if output_filename is not None:
        fout.close()
Example #4
def multi_process(read_module, read_model, fastq1_in, fastq1_out, sidecar_in, sidecar_out,
                  fastq2_in=None, fastq2_out=None, processes=2, seed=7):
  """

  :param read_module:
  :param read_model:
  :param fastq1_in:
  :param fastq1_out:
  :param sidecar_in:
  :param sidecar_out:
  :param fastq2_in:
  :param fastq2_out:
  :param processes:
  :param seed:
  :return:
  """

  long_qname_table = load_qname_sidecar(sidecar_in)

  seed_rng = np.random.RandomState(seed)

  logger.debug('Starting {} workers'.format(processes))
  in_queue, out_queue = Queue(10000), Queue(10000)
  p_list = [Process(target=worker,
                    args=(i, read_module, read_model, long_qname_table, in_queue, out_queue, seed_rng.randint(SEED_MAX)))
            for i in range(processes)]
  for p in p_list:
    p.start()

  logger.debug('Starting writer process')
  wr = Process(target=writer, args=(fastq1_out, sidecar_out, fastq2_out, out_queue))
  wr.start()

  t0 = time.time()

  # Burn through file
  logger.debug('Starting to read FASTQ file')
  fastq_l = [pysam.FastxFile(fastq1_in)]
  if fastq2_in is not None: fastq_l += [pysam.FastxFile(fastq2_in)]

  cnt = 0
  for cnt, reads in enumerate(zip(*fastq_l), start=1):
    # [(qname, seq, seq) ... ]
    in_queue.put((reads[0].name,) + tuple(r.sequence for r in reads))
    if cnt % 100000 == 0:
      logger.debug('Read {} templates'.format(cnt))

  logger.debug('Stopping child processes')
  for i in range(processes):
    in_queue.put(__process_stop_code__)
  for p in p_list:
    p.join()

  logger.debug('Stopping writer')
  out_queue.put(__process_stop_code__)
  wr.join()

  t1 = time.time()
  logger.debug('Processed {} templates in {:0.2f}s ({:0.2f} t/s)'.format(cnt, t1 - t0, cnt/(t1 - t0)))
Example #5
def fastq_concat(file):
    '''Convert 2 paired-end fastq files to a single interleaved fastq file'''
    pos_args_len(file, 2)
    with ps.FastxFile(file[0]) as r1_fastq_in, \
         ps.FastxFile(file[1]) as r2_fastq_in, \
         open("output_interleaved_R1_R2.fastq", 'w') as fastq_out:
        for entry1, entry2 in zip(r1_fastq_in, r2_fastq_in):
            fastq_out.write(str(entry1) + "\n" + str(entry2) + "\n")
    print("Finished\nFile <output_interleaved_R1_R2.fastq> was created")
Example #6
def main():

    parser = CommandLineParser()
    barcode_fq = parser.barcode_fastq
    barcode_lib = parser.barcode_library
    outdir = parser.outdir
    barcode_correct_file = os.path.join(outdir, "barcode_correct.txt")

    if barcode_lib == "":
        # Correct barcode
        start_time = time.time()
        print("Start to correct barcode",
              time.strftime("%a %b %d %H:%M:%S %Y", time.localtime()))
        barcode_fq_in = pysam.FastxFile(barcode_fq)
        barcode_correct_file_out = open(barcode_correct_file, "w")
        for reads in barcode_fq_in:
            barcode = reads.sequence
            name = reads.name
            outstr = name + "\t" + barcode + "\t" + barcode + "\n"
            barcode_correct_file_out.write(outstr)
        barcode_correct_file_out.close()
        end_time = time.time()
        print("End", end_time - start_time)

    else:
        # Expand the barcode library
        start_time = time.time()
        print("Start to read barcode library file",
              time.strftime("%a %b %d %H:%M:%S %Y", time.localtime()))
        barcode_lib_dict, barcode_lib_list = GenerateMimatchDict(barcode_lib)
        end_time = time.time()
        print("End", end_time - start_time)

        # Correct barcode
        start_time = time.time()
        print("Start to correct barcode",
              time.strftime("%a %b %d %H:%M:%S %Y", time.localtime()))
        barcode_fq_in = pysam.FastxFile(barcode_fq)
        barcode_correct_file_out = open(barcode_correct_file, "w")
        for reads in barcode_fq_in:
            barcode = reads.sequence
            name = reads.name
            if barcode in barcode_lib_dict:
                for bc_value in list(barcode_lib_dict[barcode]):
                    outstr = name + "\t" + barcode + "\t" + bc_value + "\n"
                    barcode_correct_file_out.write(outstr)
        barcode_correct_file_out.close()
        end_time = time.time()
        print("End", end_time - start_time)
Example #7
def coverage(fastq: Path, asm_length: int) -> int:
    bases_in_reads = 0
    with pysam.FastxFile(fastq) as fq:
        for entry in fq:
            bases_in_reads += len(entry.sequence)

    return int(bases_in_reads / asm_length)
Example #8
def count_reads_fastx(fasta_filename):
    n_read = 0
    logging.info("Counting reads in {}".format(fasta_filename))
    with pysam.FastxFile(fasta_filename) as fh:
        for entry in fh:
            n_read += 1
    return n_read
Example #9
def split_fastx(fname, output, chunksize=10000):
    """Split records in a fasta/q into fixed lengths.

    :param fname: input filename.
    :param output: output filename.
    :param chunksize: (maximum) length of output records.
    """
    with open(output, 'w') as fout:
        with pysam.FastxFile(fname, persist=False) as fin:
            for rec in fin:
                name = rec.name
                seq = rec.sequence
                qual = rec.quality
                if rec.comment is None:
                    comment = 'chunk_length={}'.format(chunksize)
                else:
                    comment = '{} chunk_length={}'.format(
                        rec.comment, chunksize)
                if qual is None:
                    for i, s in enumerate(chunks(seq, chunksize)):
                        chunk_name = '{}_chunk{}'.format(name, i)
                        fout.write(">{} {}\n{}\n".format(
                            chunk_name, comment, ''.join(s)))
                else:
                    for i, (s, q) in enumerate(
                            zip(chunks(seq, chunksize),
                                chunks(qual, chunksize))):
                        chunk_name = '{}_chunk{}'.format(name, i)
                        fout.write('@{} {}\n{}\n+\n{}\n'.format(
                            chunk_name, comment, ''.join(s), ''.join(q)))
Example #10
def main(fastq, output, kind, log_length, bam, downsample):
    """A package for sanity checking (quality control) your long read data.
        Feed it a fastq file and in return you will receive a PDF with four plots:\n
            1. GC content histogram with distribution curve for sample.\n
            2. Jointplot showing the read length vs. phred quality score for each
            read. The interior representation of this plot can be altered with the
            --kind option.\n
            3. Box plot of the phred quality score at positional bins across all reads. The reads are binned into read positions 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11-20, 21-50, 51-100, 101-200, 201-300. Plots from the start of reads.\n
            4. Same as 3, but plots from the end of the read.\n
    Additionally, if you provide a BAM/SAM file a histogram of the read percent
    identity will be added to the report.
    """
    if not any([fastq, bam]):
        raise click.MissingParameter("Either --fastq, --bam or both must be "
                                     "given as arguments.")
    sns.set(style=SEABORN_STYLE)

    # if the specified output is a directory, default pdf name is fastq name.
    if os.path.isdir(output):
        # get the basename of the fastq file and add pdf extension
        basename, ext = os.path.splitext(os.path.basename(fastq or bam))
        # if file is gzipped, need to also strip fastq extension
        if ext == '.gz':
            basename = os.path.splitext(basename)[0]

        filename = basename + REQUIRED_EXT
        save_as = os.path.join(output, filename)
    else:  # if file name is provided in output, make sure it has correct ext.
        extension = os.path.splitext(output)[-1]
        if extension.lower() != REQUIRED_EXT:
            save_as = output + REQUIRED_EXT
        else:
            save_as = output

    plots_for_report = []
    if fastq:
        with pysam.FastxFile(fastq) as fastq_file:
            # collect the data needed for plotting
            (gc_content,
             read_lengths,
             mean_quality_scores,
             bins_from_start,
             bins_from_end) = utils.collect_fastq_data(fastq_file, downsample)

        # generate plots
        plots_for_report.extend([
            plots.gc_plot(gc_content),
            plots.length_vs_qual_plot(read_lengths, mean_quality_scores,
                                      log_length=log_length, kind=kind),
            plots.quality_per_position(bins_from_start, 'start'),
            plots.quality_per_position(bins_from_end, 'end')
        ])
    if bam:
        # generate read percent identity plot
        perc_identities = utils.sam_percent_identity(bam, downsample)
        plots_for_report.append(plots.percent_identity(perc_identities))

    plots.save_plots_to_pdf(plots_for_report, save_as)

    return 0
Example #11
def process_tax_id(bam_path):
    """
    """
    try:
        acc_id_list = subprocess.check_output(
            "samtools view -@ {threads} {bam}|cut -f3|sort|uniq".format(
                threads=FLAGS.threads, bam=bam_path),
            encoding='utf-8',
            shell=True).strip().split("\n")
    except subprocess.CalledProcessError:
        print("No accession ID is mapped with the taxonomy ID.")
        sys.exit()
    with pysam.FastxFile(FLAGS.REFSEQ_PATH) as fin:
        for entry in fin:
            if str(entry.name) in acc_id_list:
                with open("ref_{acc_id}.fa".format(acc_id=entry.name),
                          mode='a') as fout, open(
                              "ref_{acc_id}.bed".format(acc_id=entry.name),
                              mode='a') as bed_out:
                    fout.write(str(entry) + '\n')
                    bed_out.write("{name}\t0\t{end}\n".format(
                        name=entry.name, end=len(entry.sequence)))
    with ThreadPoolExecutor(int(FLAGS.threads)) as executor:
        for acc_id in acc_id_list:
            executor.submit(process_accession_num, acc_id)
Example #12
def parse_fastqs(filename,
                 min_len=0,
                 min_qscore=0,
                 max_start_time=None,
                 min_start_time=None,
                 comments='wrap',
                 channels: Optional[Set[int]] = None):
    with pysam.FastxFile(filename) as fh:
        for entry in fh:
            if min_len and len(entry.sequence) < min_len:
                continue
            if (min_qscore and _compute_mean_qscore(entry.get_quality_array())
                    < min_qscore):
                continue
            if not check_seq_time(entry.comment, max_start_time,
                                  min_start_time):
                continue
            if entry.comment and comments == 'wrap':
                entry.comment = "CO:Z:{}".format(entry.comment)
            elif comments == 'skip':
                entry.comment = None
            if channels is not None and get_channel_from_comment(
                    entry.comment) not in channels:
                continue
            yield entry
Example #13
def main():
    usage = "\n\n\tusage: target.fasta kmer_len\n\n"

    if len(sys.argv) < 3:
        exit(usage)


    target_fasta_filename = sys.argv[1]
    kmer_len = int(sys.argv[2])

    all_seen_kmers = set()

    with pysam.FastxFile(target_fasta_filename) as fh:
        for entry in fh:
            #print(entry.name)
            #print(entry.sequence)
            sequence = entry.sequence
            seen_kmer_pos = evaluate_kmers(sequence, kmer_len, all_seen_kmers)

            if seen_kmer_pos:
                sequence = mask_kmers(sequence, seen_kmer_pos, kmer_len)

            print(">{}\n{}".format(entry.name, "\n".join(textwrap.wrap(sequence, 60)).rstrip()))


    sys.exit(0)
Example #14
def get_fastas_from_clusters(in_fasta, clus_dict, nExons=None):

    os.makedirs(config_fasout_dir, exist_ok=True)

    for clus_name in clus_dict:
        clus_size = len(clus_dict[clus_name])

        # If clusters have been defined by exon composition (get_clusters_from_blast)
        if nExons:
            iso_codes = get_isoform_code(clus_name, nExons)
            fas_out = os.path.join(
                config_fasout_dir, "clus_{0}_{1}seqs.fas".format(
                    "".join(str(x) for x in iso_codes), clus_size))

        # If clusters have been defined by transcripts hit (get_clusters_from_transcript_blast)
        else:
            fas_out = os.path.join(
                config_fasout_dir,
                "clus_{0}_{1}seqs.fas".format(clus_name, clus_size))

        with open(fas_out, 'w') as fout:

            with pysam.FastxFile(in_fasta) as fa:
                for seq in fa:
                    if seq.name in clus_dict[clus_name]:
                        fout.write(">" + seq.name + "\n")
                        fout.write(seq.sequence + "\n")
Example #15
def compress_basecalls(args):
    """Entry point for RLE compression of a fasta/q file."""
    logger = medaka.common.get_named_logger('Compress_basecalls')

    reads = pysam.FastxFile(args.input)
    if args.threads > 1:
        pool = Pool(args.threads)
        compressed = pool.imap(compress_seq, reads)
    else:
        compressed = (compress_seq(r) for r in reads)

    t0 = now()
    if args.output is None:
        fh = sys.stdout
    else:
        fh = open(args.output, 'w')

    for read in compressed:
        fh.write('@{} {}\n{}\n'.format(read.name, read.comment, read.sequence))
        fh.write('{}\n{}\n'.format('+', read.quality))
    t1 = now()
    logger.info('Compressing {} took {:.3f}s.'.format(args.input, t1 - t0))

    if args.output is not None:
        fh.close()
Example #16
def truncate_reads(tmp_dir, infile, unaligned_set, n, min_len):
    """
    Writes the n first nucleotides of each sequence in infile to an auxiliary
    file in the temporary folder.
    Parameters
    ----------
    tmp_dir : str
        Path to the temporary folder.
    infile : str
        Path to the fastq file to truncate.
    unaligned_set : set
        Contains the names of all reads that did not map unambiguously in
        previous rounds.
    n : int
        The number of basepairs to keep in each truncated sequence.
    min_len : int
        Minimum read length; when n equals min_len, all reads are truncated.
    Returns
    -------
    str
        Path to the output fastq file containing truncated reads.
    """

    outfile = "{0}/truncated.fastq".format(tmp_dir)
    with ps.FastxFile(infile) as inf, open(outfile, "w") as outf:
        for entry in inf:
            if entry.name in unaligned_set or n == min_len:
                entry.sequence = entry.sequence[:n]
                entry.quality = entry.quality[:n]
                outf.write(str(entry) + "\n")
    return outfile
Example #17
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-f",
                        "--fasta",
                        dest="input_filename_fasta",
                        type=str,
                        help="filename with fasta sequences. ")

    parser.add_argument("-o",
                        "--output-filename-sequences",
                        dest="output_filename_sequences",
                        type=str,
                        help="output per sequence information to filename")

    parser.set_defaults(input_filename_fasta=None, )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    sequence_pairs = []

    if args.input_filename_fasta != "-" and os.path.exists(
            args.input_filename_fasta + ".fai"):
        has_index = 1
        fastafile = pysam.FastaFile(args.input_filename_fasta)
        sequence_pairs = list(zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        iterator = pysam.FastxFile(args.input_filename_fasta)
        for record in iterator:
            sequence_pairs.append((record.name, len(record.sequence)))

    lengths = numpy.array([x[1] for x in sequence_pairs])

    args.stdout.write("\t".join(("has_index", "nsequences", "total_length",
                                 "min_length", "max_length", "median_length",
                                 "mean_length")) + "\n")

    if len(lengths) > 0:
        args.stdout.write("\t".join(
            map(str, (has_index, len(sequence_pairs), lengths.sum(),
                      lengths.min(), lengths.max(), numpy.median(lengths),
                      lengths.mean()))) + "\n")
    else:
        args.stdout.write("\t".join(
            map(str, (has_index, len(sequence_pairs), 0, "", "", "", ""))) +
                          "\n")

    if args.output_filename_sequences:
        with iotools.open_file(args.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write(
                "\n".join(["\t".join(map(str, x))
                           for x in sequence_pairs]) + "\n")

    E.stop()
Example #18
def truncate_reads(tmp_dir, infile, unaligned_set, trunc_len, first_round):
    """Trim read ends

    Writes the trunc_len first nucleotides of each sequence in infile to an
    auxiliary file in the temporary folder.
    Parameters
    ----------
    tmp_dir : str
        Path to the temporary folder.
    infile : str
        Path to the fastq file to truncate.
    unaligned_set : set
        Contains the names of all reads that did not map unambiguously in
        previous rounds.
    trunc_len : int
        The number of basepairs to keep in each truncated sequence.
    first_round : bool
        If this is the first round, truncate all reads without checking mapping.
    
    Returns
    -------
    str :
        Path to the output fastq file containing truncated reads.
    """

    outfile = "{0}/truncated.fastq".format(tmp_dir)
    with ps.FastxFile(infile) as inf, open(outfile, "w") as outf:
        for entry in inf:
            # If the read did not align in previous round or this is the first round
            if (entry.name in unaligned_set) or first_round:
                entry.sequence = entry.sequence[:trunc_len]
                entry.quality = entry.quality[:trunc_len]
                outf.write(str(entry) + "\n")
    return outfile
Example #19
def get_long_reads(long_reads_fasta_files):
    '''
    Reads long reads from fasta files and creates LongRead objects.

    Parameters:
        long_reads_fasta_files (list of str): Paths of the long-read fasta files.

    Returns:
        dict: Maps each long_read_name to its LongRead object.
    '''

    start_time = time.time()
    print('getting long reads')

    long_reads = {}

    for f in long_reads_fasta_files:
        print('getting long reads from', f, '...')

        with pysam.FastxFile(f) as fastafile:
            for read in fastafile:
                read_name = read.name
                seq = read.sequence

                read_name_sp = read_name.split('/ccs')
                read_name = read_name_sp[0] + '/ccs'

                long_read = LongRead(read_name, seq)
                long_reads[read_name] = long_read

    print('elapsed time =', time.time() - start_time)

    return long_reads
Example #20
 def _gnrt_SQ_from_fa(self, sf_fa, sf_sq):
     s_sq=""
     with pysam.FastxFile(sf_fa) as fh, open(sf_sq, "w") as fout_sq:
         for entry in fh:
             sinfo="@SQ\tSN:{0}\tLN:{1}\n".format(entry.name, len(entry.sequence))
             fout_sq.write(sinfo)
     return s_sq
Example #21
def find_sgRNA_in_polyc_regoin(fasta, db):
    '''
    Search polyC region that can be targeted by spCas9 (PAM is NGG)
    '''

    p = re.compile(r'C{6}[ATGC]{14}[ATGC][G]{2}')
    result = collections.namedtuple(
        'PolycGuideRnaResult',
        ['chr', 'start', 'end', 'guide', 'PAM', 'score', 'is_exon'])
    with pysam.FastxFile(fasta) as fh:
        for entry in fh:
            for m in p.finditer(entry.sequence):
                start = m.start()
                end = m.end()
                score_seq = entry.sequence[start - 4:end + 3]
                score = calc_doench_score(score_seq)
                seed_seq = entry.sequence[start + 6:end - 3]
                sgRNA = entry.sequence[start:end]
                pam = sgRNA[-3:]
                if filter_homopolymer(seed_seq):
                    query_iv = HTSeq.GenomicInterval(entry.name, start, end,
                                                     '+')
                    is_exon_overlapped = find_exon(query_iv, db)
                    yield result(entry.name, start, end, sgRNA, pam, score,
                                 is_exon_overlapped)
Example #22
def parse_fastx_chunk(fn, cs, is_upper=False):
    reads = []
    n_seqs = 0
    n_bases = 0
    size = 0
    with pysam.FastxFile(fn) as f:
        for e in f:
            if e.quality:
                if is_upper:
                    reads.append([e.name, e.sequence.upper(), e.quality])
                else:
                    reads.append([e.name, e.sequence, e.quality])
                size += sys.getsizeof(e.name) + sys.getsizeof(
                    e.sequence) + sys.getsizeof(e.quality)
            else:
                if is_upper:
                    reads.append(
                        [e.name,
                         e.sequence.upper(), "!" * len(e.sequence)])
                else:
                    reads.append([e.name, e.sequence, "!" * len(e.sequence)])
                size += sys.getsizeof(e.name) + sys.getsizeof(
                    e.sequence) + sys.getsizeof("!" * len(e.sequence))
            n_seqs += 1
            n_bases += len(e.sequence)
            if size >= cs:
                yield (reads, n_seqs, n_bases)
                size = 0
                reads = []
    yield (reads, n_seqs, n_bases)
Example #23
def count_kmers_refgen(refgenFasta):
    countDic = {}
    with ps.FastxFile(refgenFasta) as f:
        for entry in f:
            countDic[entry.name] = find_kmers(
                (entry.sequence.upper(), 6, None))
    return countDic
Example #24
    def get_chemistry(self, fq1):
        results = defaultdict(int)

        with pysam.FastxFile(fq1) as fh:
            for _ in range(self.n_read):
                entry = next(fh)
                seq = entry.sequence
                chemistry = self.seq_chemistry(seq)
                if chemistry:
                    results[chemistry] += 1
        # if it is 0, then no other linker types
        if results["scopeV2.2.1"] != 0:
            results["scopeV2.2.1"] += results["scopeV2.1.1"]
        sorted_counts = sorted(results.items(),
                               key=lambda x: x[1],
                               reverse=True)
        self.get_chemistry.logger.info(sorted_counts)

        chemistry, read_counts = sorted_counts[0][0], sorted_counts[0][1]
        percent = float(read_counts) / self.n_read
        if percent < 0.5:
            self.get_chemistry.logger.warning(
                "Valid chemistry read counts percent < 0.5")
        if percent < 0.1:
            self.get_chemistry.logger.error(
                "Valid chemistry read counts percent < 0.1")
            raise Exception(
                'Auto chemistry detection failed! '
                'If the sample is from Singleron, ask the technical staff you are connecting with for the chemistry used. '
                'You need to use `--chemistry scopeV1` for scopeV1, and `--chemistry auto` should be fine for scopeV2 and V3 '
            )
        Chemistry.get_chemistry.logger.info(f'chemistry: {chemistry}')
        return chemistry
Example #25
def transform_read_data(fastq_singleend: str, sample_name: str,
                        fastq_barcode: str, fastq_biological: str,
                        output_path: str) -> Tuple[str, str]:
    """Based on the original Bash script, `transform_read_data.sh`."""
    table = str.maketrans("ACTG", "TGAC")  # {65: 84, 67: 71, 84: 65, 71: 67}
    with pysam.FastxFile(fastq_singleend) as fq, \
     open(fastq_barcode, 'w') as r1, \
     open(fastq_biological, 'w') as tcr:
        for entry in fq:
            qname, trailing = entry.name.split('#')
            qname1 = qname + "#/1\n"
            qname2 = qname + "#/2\n"
            r1.write('@' + qname1)
            tcr.write('@' + qname2)

            ids = re.search(r"[AGCTN]*", trailing)[0]
            r1.write(ids + '\n')
            revcomp = entry.sequence.translate(
                table)[::-1]  # 5'-3' cDNA => 3'->5' RNA
            tcr.write(revcomp + '\n')

            r1.write('+' + qname1)
            tcr.write('+' + qname2)

            r1.write('@' * len(ids) + '\n')
            tcr.write(entry.quality + '\n')

    # TODO: test this with just the line below
    # rev_comp_one_fastq(fastq_singleend, sample_name, fastq_biological, output_path, fastq_barcode)
    return fastq_barcode, fastq_biological
Example #26
def main():

    usage = "\n\n\tusage: {} target.fasta max_frac_masked\n\n".format(
        sys.argv[0])

    if len(sys.argv) < 3:
        exit(usage)

    target_fasta_filename = sys.argv[1]
    max_frac_masked = float(sys.argv[2])

    with pysam.FastxFile(target_fasta_filename) as fh:
        for entry in fh:
            seqname = entry.name
            sequence = entry.sequence
            seqlen = len(sequence)
            num_Ns = count_Ns(sequence)

            frac_masked = "{:.3f}".format(num_Ns / seqlen)

            print("\t".join([seqname, frac_masked]), file=sys.stderr)

            if float(frac_masked) <= max_frac_masked:
                print(">{}\n{}".format(
                    seqname, "\n".join(textwrap.wrap(sequence, 60)).rstrip()))
            else:
                print("\t** excluding {} as frac masked={}".format(
                    seqname, frac_masked),
                      file=sys.stderr)

    sys.exit(0)
Example #27
    def realign_clipped_read_with_polyA(self, sf_ref, sf_reads, sf_out_sam):
        sf_reads1=sf_reads+".non_polyA.fq"
        sf_reads2=sf_reads+".polyA.fq"

        with pysam.FastxFile(sf_reads) as fh, open(sf_reads1,"w") as fout_non, open(sf_reads2, "w") as fout_polya:
            for entry in fh:
                len_seq=len(entry.sequence)
                if len_seq<global_values.BWA_REALIGN_CUTOFF:
                    fout_polya.write(str(entry)+"\n")
                else:
                    fout_non.write(str(entry)+"\n")
        sf_sam1=sf_out_sam+".non_polyA.sam"
        self.realign_clipped_reads(sf_ref, sf_reads1, sf_sam1)
        sf_sam2 = sf_out_sam + ".polyA.sam"
        self.realign_clipped_polyA(sf_ref, sf_reads2, sf_sam2)

        #merge the alignment
        with open(sf_out_sam,"w") as fout_sam:
            with open(sf_sam1) as fin_sam1:
                for line in fin_sam1:
                    fout_sam.write(line)
            with open(sf_sam2) as fin_sam2:
                for line in fin_sam2:
                    if line[0]=="@":
                        continue
                    fout_sam.write(line)
        os.remove(sf_reads1)
        os.remove(sf_reads2)
        os.remove(sf_sam1)
        os.remove(sf_sam2)
Example #28
def fastq_reader(filename, chunk_size, show_progress=True):
    fastq = pysam.FastxFile(filename, persist=False)

    while True:
        ids = []
        seqs = []
        quals = []

        for read in tqdm(fastq, unit = 'reads', unit_scale=True, disable=not show_progress):            
            ids.append(read.name)
            seqs.append(read.sequence)
            quals.append(read.quality)
            if (len(ids) >= chunk_size):
                break
        
        if (not ids):
            break

        ids = numpy.array(ids, dtype=object)
        seqs = numpy.array(seqs)
        seqs = seqs.view('U1').reshape((seqs.size, -1))
        quals = numpy.array(quals, dtype=bytes)
        quals = quals.view(numpy.uint8).reshape((quals.size, -1))

        yield ids, seqs, quals
Example #29
 def _fastxIter(self, fastx, track=True, **kwargs):
     for rec in pysam.FastxFile(fastx):
         kind = self._classifyFq(rec)
         if track:
             self.counter[kind] += 1
         if kind == 'pass':
             yield SimpleRecord(rec.name, rec.sequence)
Example #30
def filter_fastq(logfp, infile, outfile, length, quality):
    try:
        fin = ps.FastxFile(infile)
    except OSError:
        logfp.write('Error in parsing read file %s\n' % (infile))
        sys.exit(IOError)
    try:
        fout = open(outfile, 'w')
    except OSError:
        logfp.write('Error in opening output read file %s\n' % (outfile))
        sys.exit(IOError)

    utils.format_time(logfp, __name__, 'Filtering reads\n')

    cnt = 0
    for entry in fin:
        if len(entry.sequence) >= length:
            if entry.quality is not None:
                nq = [ord(q) - 33 for q in entry.quality]
                if np.average(nq) >= quality:
                    fout.write(str(entry) + '\n')
                    cnt += 1
            else:
                fout.write(str(entry) + '\n')
                cnt += 1
    utils.format_time(logfp, __name__, 'Filtered reads: %s\n' % cnt)

    fin.close()
    fout.close()