Example #1
def main():
    for ex in ["bmftools_db", "bmftools", "bmftools_p"]:
        cstr = "../../%s hashdmp -o hashdmp_test.out hashdmp_test.fq" % ex
        subprocess.check_call(shlex.split(cstr))
        fqh = pysam.FastqFile("hashdmp_test.out")
        r1 = next(fqh)  # next() works on both Python 2 and 3
        tags = get_tags(r1)
        assert tags["FM"] == 7
        assert round(tags["NF"], 2) == 0.14
        assert tags["RV"] == 2
        assert tags["DR"]
        assert len(r1.name) == 16
        r1 = next(fqh)
        tags = get_tags(r1)
        assert tags["FM"] == 1
        assert tags["FP"] == 0
        assert tags["DR"] == 0


    return
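The helper `get_tags` is not shown; a minimal sketch, assuming bmftools stores its annotations in the FASTQ comment as SAM-style `TAG:TYPE:value` tokens (the exact comment format is an assumption here):

def get_tags(read):
    """Hypothetical helper: parse SAM-style 'TAG:TYPE:value' tokens from a
    FASTQ comment into a dict, casting type 'i' to int and 'f' to float."""
    tags = {}
    for token in (read.comment or "").split():
        fields = token.split(":", 2)
        if len(fields) != 3:
            continue  # not a tag token
        tag, tag_type, value = fields
        if tag_type == "i":
            tags[tag] = int(value)
        elif tag_type == "f":
            tags[tag] = float(value)
        else:
            tags[tag] = value
    return tags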
Example #2
def test_mean_qscore(read_fastq_table, read_fastq_file):
    """Check the mean qscore against that produced by seqkit"""
    # .squeeze("columns") replaces the squeeze=True kwarg removed in pandas 2.0
    expected = pd.read_csv(read_fastq_table,
                           sep="\t",
                           usecols=["read_id", "mean_qscore"],
                           index_col="read_id").squeeze("columns")
    for rec in pysam.FastqFile(read_fastq_file):
        qscore = mean_qscore(rec.get_quality_array())
        assert np.around(qscore, 2) == expected[rec.name]
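`mean_qscore` itself is not shown here. To agree with seqkit it cannot be the plain arithmetic mean of the quality values: Phred scores are averaged as error probabilities and then converted back. A minimal sketch under that assumption:

import numpy as np

def mean_qscore(quals):
    """Mean Q-score via the mean error probability:
    Q = -10 * log10(mean(10 ** (-q / 10)))."""
    probs = 10.0 ** (-np.asarray(quals, dtype=float) / 10.0)
    return -10.0 * np.log10(probs.mean())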
Example #3
def main():
    for ex in ["bmftools_db", "bmftools", "bmftools_p"]:
        cstr = (
            "../../%s collapse inline -wn0 -sTGACT -t%i -o marksplit_test_tmp -l 10 "
            "-v 11 marksplit_test.R1.fq marksplit_test.R2.fq" %
            (ex, mm_threshold))
        subprocess.check_call(shlex.split(cstr))
        for read in pysam.FastqFile("marksplit_test_tmp.tmp.0.R1.fastq"):
            check_bc(read)
    return 0
Example #4
    def add_edges_fastq(self):
        with pysam.FastqFile(self.sequence) as file:
            for read in file:
                sequence = read.sequence

                for i in range(len(sequence) - self.k):
                    kmer = sequence[i:i + self.k + 1]
                    rev_kmer = self.get_complementary_sequence(kmer)
                    self.add_edge(kmer)
                    self.add_edge(rev_kmer)
Example #5
    def test_output(self):
        """Compare output fastq to expected sequence and scores."""
        expected = {
            'input1': ('AGTGCTCA', (1, 1, 3, 2, 1, 1, 5, 1)),
            'input2': ('ACTC', (3, 1, 3, 4))}
        got = {}
        with pysam.FastqFile(self.args.output) as output_handle:
            for read in output_handle:
                lengths = tuple(read.get_quality_array())
                got[read.name] = (read.sequence, lengths)
        self.assertEqual(expected, got)
Example #6
    def add_vertices_fastq(self):
        with pysam.FastqFile(self.sequence) as file:
            for read in file:
                sequence = read.sequence

                for i in range(len(sequence) - self.k + 1):
                    kmer = sequence[i:i + self.k]
                    rev_kmer = self.get_complementary_sequence(kmer)
                    if not (kmer in self.vertices):
                        self.add_vertex(kmer)
                        self.add_vertex(rev_kmer)
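Both builders above call `get_complementary_sequence`, which for double-stranded DNA is the reverse complement; a minimal sketch, written here as a free function:

_COMPLEMENT = str.maketrans("ACGTacgt", "TGCAtgca")

def get_complementary_sequence(kmer):
    """Reverse complement of a DNA k-mer."""
    return kmer.translate(_COMPLEMENT)[::-1]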
Example #7
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as inf:

        for read in inf:
            counter.input += 1
            options.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")

            counter.output += 1

    E.info(counter)
    E.stop()
Example #8
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--input-fastq-file",
                        dest="input_fastq_file",
                        type=str,
                        help="input fastq file. ")

    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        action="append",
                        type=str,
                        choices=("length", ),
                        help="methods to apply ")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (args, unknown) = E.start(parser, argv, unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as inf:

        for read in inf:
            counter.input += 1
            args.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")

            counter.output += 1

    E.info(counter)
    E.stop()
Example #9
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--input-fastq",
                        dest="input_fastq_file",
                        type=str,
                        help="input fastq file")

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=["ont2pacbio"],
                        help="methods to apply ")

    parser.set_defaults(
        input_fastq_file=None,
        line_width=80,
        method=None,
    )

    (args, unknown) = E.start(parser,
                              argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file == "-":
        args.input_fastq_file = args.stdin

    outf = args.stdout
    line_width = args.line_width
    well_no = 0
    for record in pysam.FastqFile(args.input_fastq_file):
        well_no += 1
        quals = record.get_quality_array()
        seq = record.sequence
        qv = int(math.floor(sum(quals) / len(quals)))
        outf.write(">{}/{}/{}_{} RQ=0.{}\n".format("test", well_no, 1,
                                                   len(seq) + 1, qv))
        for x in range(0, len(seq), line_width):
            outf.write(seq[x:x + line_width] + "\n")

    E.stop()
Example #10
def base_quality_single_threaded(fastq, pkl):
  """Given a fastq file, read through the file

  :param fastq: name of fastq file
  :param pkl: name of pickle file to write to
  :return:
  """
  #                 bp, phred
  bq_mat = np.zeros((500, 100), dtype=np.uint64)
  idx = list(range(500))
  for r in pysam.FastqFile(filename=fastq, persist=False):
    bqa = r.get_quality_array()
    bq_mat[idx[:len(bqa)], bqa] += 1

  with open(pkl, 'wb') as fp:  # close the file handle deterministically
    pickle.dump(bq_mat, fp)

  return bq_mat
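One way to use the returned count matrix, e.g. the mean Phred score at each read position (a hypothetical helper, not part of the original code):

import numpy as np

def mean_q_by_position(bq_mat):
  """Mean phred score per read position from a (position x phred) count matrix."""
  phred = np.arange(bq_mat.shape[1])
  totals = bq_mat.sum(axis=1)
  with np.errstate(divide='ignore', invalid='ignore'):
    return (bq_mat @ phred) / totals  # NaN where no read reaches that position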
Example #11
    def test_rle(self):
        """Test the conversion of basecalls into fastqrle file."""

        block_size = 3
        with open(self.output_fastqrle, 'w') as f:
            subprocess.call(['medaka', 'fastrle', self.input_fasta, '--block_size', str(block_size)], stdout=f)

        expected_results = (
            [('A', 1), ('C', 3), ('C', 3), ('C', 1), ('G', 1), ('T', 3), ('A', 1)],
            [('C', 3), ('C', 3), ('C', 2)])

        with pysam.FastqFile(self.output_fastqrle) as f:
            for index, entry in enumerate(f):
                bases = entry.sequence
                qualities = entry.get_quality_array()
                got = list(zip(bases, qualities))
                expected = expected_results[index]
                self.assertEqual(expected, got, "Expected and got differ: ({} != {})".format(expected, got))
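The expected tuples above follow the fastqrle convention: each run of a base becomes one base whose quality stores the run length, and runs longer than `--block_size` are split into chunks of at most that size. A rough reference encoder for checking expectations by hand (a sketch, not medaka's implementation):

from itertools import groupby

def fastqrle_encode(seq, block_size):
    """Run-length encode seq, capping each emitted run at block_size."""
    out = []
    for base, run in groupby(seq):
        n = len(list(run))
        while n > 0:
            out.append((base, min(n, block_size)))
            n -= block_size
    return out

# fastqrle_encode("ACCCCCCCGTTTA", 3) reproduces the first expected record above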
Example #12
    def run(self):
        ## do the extraction

        ofh = open(self.output_fq_filename, 'w')

        fq_reader = pysam.FastqFile(self.input_fq_filename)
        for fq_entry in fq_reader:
            read_name = fq_entry.name
            read_name = re.sub("/[12]$", "", read_name)
            if read_name in self.keep_set:
                ofh.write("\n".join([
                    "@" +
                    fq_entry.name, fq_entry.sequence, "+", fq_entry.quality
                ]) + "\n")

        ofh.close()

        self.success = True
Example #13
def process_one_fastq(fastq, threads=2, max_reads_in_queue=int(30e6)):
  """

  :param fastq:
  :param threads:
  :param max_reads_in_queue: The default is about 6GB, considering 200 bytes per qual string
  :return:
  """

  t0 = time.time()
  in_queue, out_queue = Queue(max_reads_in_queue), Queue()

  # Start worker processes
  logger.debug('Starting {} threads'.format(threads))
  p_list = [Process(target=process_worker, args=(i, in_queue, out_queue)) for i in range(threads)]
  for p in p_list:
    p.start()

  # Burn through file
  logger.debug('Starting to read FASTQ file')
  for r in pysam.FastqFile(filename=fastq, persist=False):
    bqa = r.get_quality_array()
    in_queue.put(bqa)  # blocks automatically once the queue holds max_reads_in_queue items

  # Tell child processes to stop
  logger.debug('Telling child processes to stop')
  for i in range(threads):
    in_queue.put(__process_stop_code__)

  # Get results and add them
  logger.debug('Summing up result matrices')
  bq_mat = out_queue.get()
  for i in range(threads - 1):
    bq_mat += out_queue.get()

  # Wait for workers to finish
  logger.debug('Waiting for workers to shutdown')
  for p in p_list:
    p.join()

  t1 = time.time()
  logger.debug('Finished processing FASTQ in {} s'.format(t1 - t0))

  return bq_mat
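`process_worker` and `__process_stop_code__` are assumed by this function. A minimal sketch consistent with the single-threaded version in Example #10: each worker tallies the quality arrays it receives and ships its matrix back when it sees the stop code.

import numpy as np

__process_stop_code__ = 'STOP'  # assumed sentinel; any value that cannot be a quality array

def process_worker(worker_id, in_queue, out_queue):
  """Accumulate a (position x phred) count matrix from in_queue; emit it on stop."""
  bq_mat = np.zeros((500, 100), dtype=np.uint64)
  idx = np.arange(500)
  while True:
    bqa = in_queue.get()
    if bqa == __process_stop_code__:
      break
    bq_mat[idx[:len(bqa)], bqa] += 1
  out_queue.put(bq_mat)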
Example #14
def main():
    subprocess.check_call(
        "../../bmftools_db rsq -ftmp.fq rsq_test.bam rsq_test.out.bam 2> rsq_test.log",
        shell=True)
    # decode() so the comparison works on Python 2 (str) and 3 (bytes) alike
    assert (subprocess.check_output("samtools view -c rsq_test.out.bam",
                                    shell=True).strip().decode() == "0")
    recs = list(pysam.FastqFile("tmp.fq"))
    assert len(recs) == 2
    try:
        assert str(recs[0]) == correct_string
        return 0
    except AssertionError:
        sys.stderr.write("%s found not expected %s. TEST FAILED\n" %
                         (repr(str(recs[0])), repr(correct_string)))
        return 1
Example #15
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-fastq", dest="input_fastq_file", type="string",
        help="input fastq file")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=["ont2pacbio"],
        help="methods to apply [%default]")

    parser.set_defaults(
        input_fastq_file=None,
        line_width=80,
        method=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file == "-":
        options.input_fastq_file = options.stdin

    outf = options.stdout
    line_width = options.line_width
    well_no = 0
    for record in pysam.FastqFile(options.input_fastq_file):
        well_no += 1
        quals = record.get_quality_array()
        seq = record.sequence
        qv = int(math.floor(sum(quals) / len(quals)))
        outf.write(">{}/{}/{}_{} RQ=0.{}\n".format(
            "test", well_no, 1, len(seq) + 1, qv))
        for x in range(0, len(seq), line_width):
            outf.write(seq[x:x + line_width] + "\n")

    E.stop()
Example #16
def main():

    with gzip.open(args.output, 'wb') as w:

        cut_site = re.compile(args.cutsite)
        cutsite_counter = 0
        total_fragments = 0
        fragment_counts = dict()

        for counter, record in enumerate(pysam.FastqFile(args.input_fn)):

            # Locate cut sites explicitly: re.split drops the matched site,
            # so cumulative fragment lengths alone would shift the sequence
            # and quality slices out of register after the first site.
            matches = list(cut_site.finditer(record.sequence))
            spans, prev_end = [], 0
            for match in matches:
                spans.append((prev_end, match.start()))
                prev_end = match.end()
            spans.append((prev_end, len(record.sequence)))
            spans = [(s, e) for s, e in spans if e > s]  # drop empty fragments

            fragment_counts[len(spans)] = fragment_counts.get(len(spans), 0) + 1

            if matches:
                cutsite_counter += 1

                for ii, (start, end) in enumerate(spans):
                    total_fragments += 1

                    w.write(f'@{record.name}:PE1:{ii}\n'.encode())
                    w.write(f'{record.sequence[start:end]}\n'.encode())
                    w.write('+\n'.encode())
                    w.write(f'{record.quality[start:end]}\n'.encode())

    with open(args.logfile, 'w') as w:
        # enumerate starts at 0, so counter + 1 is the record count; each entry
        # needs its own newline or the log collapses into one run-on line
        w.write(f'Records processed: {counter + 1}\n')
        w.write(f'Records with cutsites: {cutsite_counter}\n')
        w.write(f'Fragments output: {total_fragments}\n')
        for k, v in fragment_counts.items():
            w.write(f'Fragments {k}: {v}\n')
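Why the slicing has to skip the matched site, as handled above with `finditer`: `re.split` consumes the site, so cumulative fragment lengths drift out of register with the read. A tiny illustration with a hypothetical GATC (DpnII) site:

import re

cut_site = re.compile("GATC")
seq = "AAAGATCTTTT"
print(cut_site.split(seq))  # ['AAA', 'TTTT'] -- the 4 site bases are dropped
print(seq[3:7])             # 'GATC': naive cumulative-length slicing hits the site
print(seq[7:11])            # 'TTTT': the real second fragment starts at match.end()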
Example #17
    def run(self):
        ## do the extraction

        ofh = open(self.output_fq_filename, 'w')

        fq_reader = pysam.FastqFile(self.input_fq_filename)
        for fq_entry in fq_reader:
            read_name = fq_entry.name
            read_name = re.sub("/[12]$", "", read_name)
            if read_name not in self.keep_set:

                ofh.write(str(fq_entry) + "\n")  # retains original formatting.

                #ofh.write( "\n".join(["@" + fq_entry.name,
                #                      fq_entry.sequence,
                #                      "+",
                #                      fq_entry.quality]
                #                     ) + "\n")

        ofh.close()

        self.success = True
Example #18
def main():
    parser = None
    try:
        parser = optparse.OptionParser(usage=usage, description=description)
        for opt in opts:
            if len(opt) == 4:
                parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
            elif len(opt) == 3:
                parser.add_option(opt[0], help=opt[1], **opt[2])
        (opt, args) = parser.parse_args()

        if not (opt.input and os.path.exists(opt.input)):
            raise Usage("Please provide a fastq file")

        if opt.debug:
            print("""
fastqToFasta.py
i=%s
n=%i
x=%i
""" % (opt.input, n, x))

        fq = pysam.FastqFile(opt.input)
        faFile = opt.output or unique_filename_in()
        rlen = int(opt.length)
        rskip = int(opt.start) - 1
        fa = open(faFile, "w")
        for i, s in enumerate(fq):
            seq = s.sequence[rskip:(rskip + rlen)]
            header = "_".join([s.name, s.sequence, s.quality])
            fa.write(">" + header + "\n" + seq + "\n")
        fq.close()
        fa.close()

    except Usage as err:
        print('\n', err.msg, '\n', file=sys.stderr)
        if parser: parser.print_help()
        return 1
Example #19
import argparse
import pysam

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fastq")
    parser.add_argument("--proportion_of_Ns_allowed", type=float, default=0.5)
    args = parser.parse_args()

    fh = pysam.FastqFile(args.fastq)
    for record in fh:
        if record.sequence.count("N") < args.proportion_of_Ns_allowed * len(
                record.sequence):
            print "@%s" % record.name
            print record.sequence
            print "+"
            print record.quality

    fh.close()
Example #20
#!/usr/bin/env python3

import argparse
import pysam

if __name__ == "__main__":

    # Get arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('fastq')
    parser.add_argument('--max_mask_prop', type=float, default=0.5)

    args = parser.parse_args()

    with pysam.FastqFile(args.fastq) as fh:
        for record in fh:
            if record.sequence.count('N') < args.max_mask_prop * len(
                    record.sequence):
                print('@%s' % record.name)
                print(record.sequence)
                print('+')
                print(record.quality)
Example #21
def main():
    # Read parameters
    config = Config()

    # Parse the inputs args/options
    # argparse.ArgumentParser accepts no version= kwarg
    parser = argparse.ArgumentParser(
        usage="target_fasta query_fastq [options]")

    parser.add_argument("target_fasta",
                        type=str,
                        help="The target genome fasta file.")
    parser.add_argument("query_fastq", type=str, help="The query sequences.")

    parser.add_argument("--w",
                        dest="w",
                        help="Length of minimizer window. Default=%s" %
                        config.w,
                        default=config.w)
    parser.add_argument("--k",
                        dest="k",
                        help="Length of k-mer. Default=%s" % config.k,
                        default=config.k)
    parser.add_argument("--t",
                        dest="t",
                        help="Discard minmers that occur more frequently "
                        "in the target than t. Default=%s" % config.w,
                        default=config.w)
    parser.add_argument(
        "--l",
        dest="l",
        help="Cluster two minmers into the same cluster if within l bases of"
        " each other in both target and query. Default=%s" % config.l,
        default=config.l)
    parser.add_argument(
        "--column",
        dest="column",
        help=
        "Add this many bases to the prefix and suffix of a seed cluster in the"
        " target and query sequence. Default=%s" % config.column,
        default=config.column)
    parser.add_argument("--gapScore",
                        dest="gapScore",
                        help="Smith-Waterman gap-score. Default=%s" %
                        config.gapScore,
                        default=config.gapScore)
    parser.add_argument("--matchScore",
                        dest="matchScore",
                        help="Smith-Waterman match-score. Default=%s" %
                        config.gapScore,
                        default=config.gapScore)
    parser.add_argument("--mismatchScore",
                        dest="mismatchScore",
                        help="Smith-Waterman mismatch-score. Default=%s" %
                        config.mismatchScore,
                        default=config.mismatchScore)
    parser.add_argument("--log",
                        dest="logLevel",
                        help="Logging level. Default=%s" % config.logLevel,
                        default=config.logLevel)

    options = parser.parse_args()

    # Parse the log level
    numeric_level = getattr(logging, options.logLevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.logLevel)

    # Setup a logger
    logger.setLevel(numeric_level)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(numeric_level)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.debug("Established logger")

    startTime = time.time()

    # Parse the target sequence and read the first sequence
    with pysam.FastaFile(options.target_fasta) as targetFasta:
        targetString = targetFasta.fetch(targetFasta.references[0])
    logger.info("Parsed target string. Length: %s" % len(targetString))

    # Build minimizer index
    minimizerIndex = MinimizerIndexer(targetString.upper(),
                                      w=options.w,
                                      k=options.k,
                                      t=options.t)
    minmerInstances = sum(map(len, minimizerIndex.minimizerMap.values()))
    logger.info(
        "Built minimizer index in %s seconds. #minmers: %s, #minmer instances: %s"
        % ((time.time() - startTime), len(
            minimizerIndex.minimizerMap), minmerInstances))

    # Open the query files
    alignmentScores = []  # Array storing the alignment scores found
    with pysam.FastqFile(options.query_fastq) as queryFastq:
        # For each query string build alignment
        for queryIndex, query in enumerate(queryFastq):
            print(queryIndex)
            alignment = simpleMap(targetString, minimizerIndex,
                                  query.sequence.upper(), config)
            alignmentScore = (0 if alignment is None
                              else alignment.getMaxAlignmentScore())
            alignmentScores.append(alignmentScore)
            logger.debug(
                "Mapped query sequence #%i, length: %s alignment_found?: %s "
                "max_alignment_score: %s" % (queryIndex, len(
                    query.sequence), alignment is not None, alignmentScore))
            # Comment this out to test on a subset
            # if queryIndex > 100:
            #    break

    # Print some stats
    logger.critical(
        "Finished alignments in %s total seconds, average alignment score: %s"
        % (time.time() - startTime,
           float(sum(alignmentScores)) / len(alignmentScores)))
Example #22
def main():
    # Read parameters
    config = Config()
    
    #Parse the inputs args/options
    parser = argparse.ArgumentParser(usage="target_fasta query_fastq [options]") # , version="%prog 0.1")

    parser.add_argument("target_fasta", type=str,
                        help="The target genome fasta file.")
    parser.add_argument("query_fastq", type=str,
                        help="The query sequences.")
    parser.add_argument("--g", dest="g", help="Use Numba cuda.jit kernel to parallelize MinimizerIndexer on GPU", action='store_true')
    parser.add_argument("--w", dest="w", type=int, help="Length of minimizer window. Default=%s" % config.w, default=config.w)
    parser.add_argument("--k", dest="k", type=int, help="Length of k-mer. Default=%s" % config.k, default=config.k)
    parser.add_argument("--t", dest="t", type=int, help="Discard minmers that occur more frequently " 
                                            "in the target than t. Default=%s" % config.t, default=config.t)
    parser.add_argument("--l", dest="l", type=int, help="Cluster two minmers into the same cluster if within l bases of"
                                            " each other in both target and query. Default=%s" % config.l, default=config.l)
    parser.add_argument("--c", dest="c", type=int, help="Add this many bases to the prefix and suffix of a seed cluster in the"
                                            " target and query sequence. Default=%s" % config.c, default=config.c)
    parser.add_argument("--gapScore", type=float, dest="gapScore", help="Smith-Waterman gap-score. Default=%s" % 
                      config.gapScore, default=config.gapScore)
    parser.add_argument("--matchScore", type=float, dest="matchScore", help="Smith-Waterman match-score. Default=%s" % 
                      config.gapScore, default=config.gapScore)
    parser.add_argument("--mismatchScore", type=float, dest="mismatchScore", help="Smith-Waterman mismatch-score. Default=%s" % 
                      config.mismatchScore, default=config.mismatchScore)
    parser.add_argument("--log", dest="logLevel", help="Logging level. Default=%s" % 
                      config.logLevel, default=config.logLevel)
    
    options = parser.parse_args()
    
    # Parse the log level
    numeric_level = getattr(logging, options.logLevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.logLevel)
    
    # Setup a logger
    logger.setLevel(numeric_level)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(numeric_level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.debug("Established logger")
    
    startTime = time.time()
    global targetString
    
    # Parse the target sequence and read the first sequence
    with pysam.FastaFile(options.target_fasta) as targetFasta:
        targetString = targetFasta.fetch(targetFasta.references[0])
    logger.info("Parsed target string. Length: %s in %s seconds" % (len(targetString), time.time()-startTime))
    
    # Build minimizer index
    minimizerIndex = MinimizerIndexer(targetString.upper(), w=options.w, k=options.k, t=options.t)
    # print("minimizerIndex attributes", list(minimizerIndex.minimizerMap.items())[:10], 
    #       list(minimizerIndex.minmerOccurrences.items())[:10])
    
    # Only seeing this many minmers for the target DNA sequence:
    print(len(minimizerIndex.minimizerMap.keys()), "minimizerMap keys", list(minimizerIndex.minimizerMap.keys())[:20],
          "\n", len(minimizerIndex.minmerOccurrences.keys()), "minmerOccurrences keys", list(minimizerIndex.minmerOccurrences.keys())[:20])
    
    minmerInstances = sum(map(len, minimizerIndex.minimizerMap.values()))
    logger.info("Built minimizer index in %s seconds. #minmers: %s, #minmer instances: %s" %
                 ((time.time()-startTime), len(minimizerIndex.minimizerMap), minmerInstances))
    
    # Open the query files
    alignmentScores = [] # Array storing the alignment scores found
    threads = []
    with pysam.FastqFile(options.query_fastq) as queryFastq: #, Pool(10) as p:
        # For each query string build alignment
        if options.g:
            for queryIndex, query in enumerate(queryFastq):
                print(queryIndex)
                alignment = simpleMap(minimizerIndex, query.sequence.upper(), config, None, options.g)
                alignmentScore = 0 if alignment is None else alignment.getMaxAlignmentScore()
                alignmentScores.append(alignmentScore)
                logger.info("Mapped query sequence #%i, length: %s alignment_found?: %s "
                            "max_alignment_score: %s" % 
                            (queryIndex, len(query.sequence), alignment is not None, alignmentScore)) 
        else:
            results = list()
            q = Queue()
            for queryIndex, query in enumerate(queryFastq):
                print("Reading query", queryIndex)
                results.append((queryIndex, query.sequence))
                p = Process(target=simpleMap, args=(minimizerIndex, query.sequence.upper(), config, q, options.g))
                p.daemon = True  # lowercase attribute; 'p.Daemon' had no effect
                p.start()
                threads.append(p)  # keep handles so the workers can be joined below
            for r in results:
                queryIndex = r[0]
                querySeq = r[1]
                alignment = q.get()
                try:
                    alignmentScore = alignment.getMaxAlignmentScore()
                except AttributeError:  # alignment is None: no alignment was found
                    print("None type, continue")
                    continue
                # print("Query joined", queryIndex)
                alignmentScores.append(alignmentScore)
                logger.info("Mapped query sequence #%i, length: %s alignment_found?: %s "
                            "max_alignment_score: %s" % 
                            (queryIndex, len(querySeq), alignment is not None, alignmentScore))  

            for t in threads:
                t.join()
                   
    logger.info("Finished alignments in %s total seconds, average alignment score: %s" % 
                    (time.time()-startTime, float(sum(alignmentScores))/len(alignmentScores)))
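Both mapper variants assume a `MinimizerIndexer` built from (w, k) minimizers: within every window of w consecutive k-mers, the lexicographically smallest k-mer is recorded with its position. A minimal sketch of that windowing rule, ignoring the occurrence cap t:

def minimizers(s, w, k):
    """Yield (kmer, position) pairs: the smallest k-mer in each window of
    w consecutive k-mers, with duplicates collapsed."""
    n_kmers = len(s) - k + 1
    if n_kmers <= 0:
        return
    seen = set()
    for start in range(max(n_kmers - w + 1, 1)):
        window = ((s[i:i + k], i) for i in range(start, min(start + w, n_kmers)))
        kmer, pos = min(window)
        if (kmer, pos) not in seen:
            seen.add((kmer, pos))
            yield kmer, pos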
Example #23
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Split reads from FASTQ or BAM into fragments of a given size and output fragments as FASTQ records")
    parser.add_argument("input", help="FASTQ or BAM with sequences to fragment")
    parser.add_argument("window", type=int, help="length to fragment each sequence to")
    parser.add_argument("--slide", type=int, default=0, help="length to slide the given window size across the input sequences")
    parser.add_argument("--full_length_only", action="store_true", help="omit sequences that are shorter than the requested window size")
    parser.add_argument("--read_counts", default=None, help="File to write read counts to")
    parser.add_argument("--clone_name", default="dummy", help="Name of clone for read_counts file")
    args = parser.parse_args()

    if args.input.endswith(".bam"):
        input_file = pysam.AlignmentFile(args.input, check_header=False, check_sq=False)
        is_bam = True
    else:
        input_file = pysam.FastqFile(args.input)
        is_bam = False

    for rcount, record in enumerate(input_file):
        if is_bam:
            record_name = "%s_%s" % (record.qname, (record.is_read1 and "1" or "2"))
            sequence = record.seq
            quality = record.qual
        else:
            record_name = record.name.replace("/", "_")
            sequence = record.sequence
            quality = record.quality

        sequences = fragment_sequence(sequence, args.window, args.slide)
        qualities = fragment_sequence(quality, args.window, args.slide)
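`fragment_sequence` is not shown; a minimal sketch consistent with the CLI options above: step a window of the requested size across the string, where slide=0 means abutting, non-overlapping cuts (the --full_length_only filter would then drop any short trailing piece):

def fragment_sequence(sequence, window, slide=0, full_length_only=False):
    """Split sequence into window-sized pieces; a positive slide yields
    overlapping fragments, slide=0 yields non-overlapping ones."""
    step = slide if slide > 0 else window
    fragments = [sequence[i:i + window] for i in range(0, len(sequence), step)]
    if full_length_only:
        fragments = [f for f in fragments if len(f) == window]
    return fragments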
Example #24
def run(args):
    R1file = args.i  # R1 (sequence read) file
    R2file = args.i.replace('R1', 'R2')  # R2 (BC1) file
    R3file = args.i.replace('R1', 'R3')  # R3 (BC2 + UMI) file
    outdir = args.d  # output directory
    if outdir is None:
        # if not provided, put the output file in the same directory as the input
        outdir = os.path.dirname(R1file)
    outbase = args.o  # output file basename
    outFile = os.path.join(outdir, '%s_R1_valid.fastq' % outbase)
    bcFile = args.b  # valid barcode file
    umi_len = args.u  # umi length

    # load the valid barcode dictionary:
    fIn = open(bcFile, 'r')
    bcSet = {}
    while 1:
        line = fIn.readline()
        if not line:
            break
        # skip the header line:
        if line.startswith('well'):
            continue
        if line.endswith('\n'):
            line = line[:-1]

        fields = line.split('\t')

        # forward barcode (trimmed to BC1_LEN bases:
        bcFwd = fields[0][:BC1_LEN]
        bcSet.setdefault(bcFwd, 0)

    fIn.close()

    ## storage for the output file pointer and statistics counters:
    samp = {}
    samp['name'] = outbase
    oFile = open(outFile, 'w')  # open outut file for this sample
    samp['file'] = oFile
    # initialize counters:
    samp['total'] = 0  # total reads
    samp['SBC'] = 0  # sample barcode corrected
    samp['valid'] = 0  # total valid reads
    samp['BC1v'] = 0  # valid BC1
    samp['BC2v'] = 0  # valid BC2
    samp['UMIv'] = 0  # valid UMI
    samp['BC1c'] = 0  # corrected BC1
    samp['BC2c'] = 0  # corrected BC2

    # open the input files:
    fq1 = pysam.FastqFile(R1file)
    fq2 = pysam.FastqFile(R2file)
    #fq3 = pysam.FastqFile(R3file)
    fq3 = pysam.FastqFile(R3file)

    # counters:
    nBc1Valid = 0
    nBc2Valid = 0
    nBc1Corr = 0
    nBc2Corr = 0
    nUmiValid = 0

    countMod = 100000
    unassigned = 0
    rCount = 0

    unassignedBC = {}  # collect counts on unassigned barcodes

    # loop over all reads:
    while 1:
        try:
            r1 = next(fq1)  # mRNA sequence read
            r2 = next(fq2)  # BC1
            #r3 = next(fq3)     # sample index
            r3 = next(fq3)  # BC2 + UMI
            rCount += 1  # read counter
            if not rCount % countMod:
                print('read %d' % rCount)
        except StopIteration:
            break  # last item
        except Exception:
            print('pysam.FastqFile iterator error.')
            eFlag = True
            break

        # parse out the two halves of the cell barcode, and the UMI:
        bc1 = r2.sequence  # first half of the cell barcode
        bc2 = r3.sequence[:BC2_LEN]  # second half of the cell barcode
        umi = r3.sequence[BC2_LEN:(BC2_LEN + umi_len)]  # UMI sequence

        # check the barcodes and UMI and update counts:
        (bc1, bc2, umi, bc1Valid, bc1Corr, bc2Valid, bc2Corr,
         umiValid) = parseBarcodeAndUmiV3(bc1, bc2, umi, bcSet)
        samp['total'] += 1  # total reads
        samp['BC1v'] += bc1Valid  # valid BC1
        samp['BC2v'] += bc2Valid  # valid BC2
        samp['BC1c'] += bc1Corr  # corrected BC1
        samp['BC2c'] += bc2Corr  # corrected BC2
        samp['UMIv'] += umiValid  # valid UMI

        # write out the sequence read if bc1, bc2 and umi are all valid:
        if bc1 is not None and bc2 is not None and umiValid:
            samp['valid'] += 1  # total valid reads for this sample
            ## create the new read name:
            rName = '%s:%s%s:%s' % (r1.name, bc1, bc2, umi)
            fastqWrite(samp['file'], r1, rName)

    # close the input files:
    fq1.close()
    fq2.close()
    #fq3.close()
    fq3.close()

    # print counts:
    print('Total reads: %d' % rCount)
    # close the output file:
    samp['file'].close()

    # print sample-by-sample stats:
    print('sample\ttotal\tvalid\tBC1valid\tBC1corr\tBC2valid\tBC2corr\tUMIvalid')
    x = samp
    sOut = '%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
        x['name'], x['total'], x['valid'], x['BC1v'], x['BC1c'], x['BC2v'],
        x['BC2c'], x['UMIv'])
    print(sOut)

    return
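`parseBarcodeAndUmiV3` and `fastqWrite` are assumed by this script; the writer side is simple enough to sketch: emit the read as a four-line FASTQ record under its new name.

def fastqWrite(fh, read, name=None):
    """Write one pysam FASTQ record to an open file handle, optionally renamed."""
    fh.write('@%s\n%s\n+\n%s\n' % (name or read.name, read.sequence, read.quality))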
Example #25
def get_first_n(fn, n=3):
    res = collections.Counter()
    fq = pysam.FastqFile(fn)  # FastqFile takes a filename only, not a mode
    for rd in fq:
        res[rd.sequence[:n]] += 1
    return res
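Typical use, e.g. tallying the most common 3-base read prefixes (the filename is hypothetical):

counts = get_first_n("reads.fastq", n=3)
for prefix, count in counts.most_common(5):
    print(prefix, count)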