Code Example #1
File: fastqs2fasta.py Project: yangjl/cgat
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                             usage = globals()["__doc__"] )

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file"  )
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file"  )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in itertools.izip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError("unpaired reads detected. Are files sorted? are files of equal length?")
            except PairedReadError, e:
                raise PairedReadError(e), None, sys.exc_info()[2]
        else:
            assert f1.identifier.endswith("/1") and f2.identifier.endswith("/2"), "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
Code Example #2
File: fastq2solid.py Project: Charlie-George/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="guess quality score format and set quality scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern", dest="pattern", type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.openFile(options.pattern % "qual", "w")

    if options.change_format:
        iter = Fastq.iterate_convert(options.stdin,
                                     format=options.change_format,
                                     guess=options.guess_format)
    else:
        iter = Fastq.iterate(options.stdin)

    for record in iter:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Code Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--first-fastq-file",
                      dest="fastq1",
                      type="string",
                      help="supply read1 fastq file")
    parser.add_option("-b",
                      "--second-fastq-file",
                      dest="fastq2",
                      type="string",
                      help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    fastq1 = IOTools.open_file(options.fastq1)
    fastq2 = IOTools.open_file(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")
            except PairedReadError as e:
                raise PairedReadError(e).with_traceback(sys.exc_info()[2])
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
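
The heart of fastqs2fasta.py is the interleaving loop above: read both FASTQ files in lock-step and emit each pair as two FASTA records, failing if one file runs out first. A minimal, self-contained sketch of the same idea, assuming plain 4-line FASTQ records and none of the CGAT helpers (iterate_fastq and interleave_to_fasta are hypothetical names used only for illustration):

import itertools

def iterate_fastq(handle):
    """Yield (identifier, sequence) tuples from a 4-line FASTQ stream."""
    while True:
        header = handle.readline().rstrip()
        if not header:
            return
        seq = handle.readline().rstrip()
        handle.readline()          # '+' separator line
        handle.readline()          # quality string (unused here)
        yield header[1:], seq      # drop the leading '@'

def interleave_to_fasta(fastq1, fastq2, out):
    """Write read pairs from two FASTQ streams as interleaved FASTA."""
    for r1, r2 in itertools.zip_longest(iterate_fastq(fastq1),
                                        iterate_fastq(fastq2)):
        if r1 is None or r2 is None:
            raise ValueError("unpaired reads detected - are the files "
                             "sorted and of equal length?")
        out.write(">%s\n%s\n>%s\n%s\n" % (r1[0], r1[1], r2[0], r2[1]))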
Code Example #4
File: fastqs2fasta.py Project: CGATOxford/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-fastq-file", dest="fastq1", type="string",
        help="supply read1 fastq file")
    parser.add_option(
        "-b", "--second-fastq-file", dest="fastq2", type="string",
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")
            except PairedReadError as e:
                raise PairedReadError(e).with_traceback(sys.exc_info()[2])
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.Stop()
Code Example #5
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy. This is so that the results are comparable
    to species level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[
            0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.iteritems():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
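
The function above maps each read's gi number to its taxonomy and then normalises the per-taxon counts at every level. The counting and normalisation step reduces to the following toy sketch (the gi numbers, names and the two-level gi2taxa mapping are made-up illustrative data, not the real NCBI taxonomy):

import collections

# toy gi -> (species, genus) mapping and simulated read identifiers
gi2taxa = {"123": ("E_coli", "Escherichia"),
           "456": ("B_subtilis", "Bacillus")}
read_gis = ["123", "123", "456", "123"]

levels = ["species", "genus"]
for i, level in enumerate(levels):
    counts = collections.defaultdict(int)
    for gi in read_gis:
        counts[gi2taxa[gi][i]] += 1
    total = float(len(read_gis))
    for taxon, n in counts.items():
        print("%s\t%s\t%s" % (level, taxon, n / total))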
Code Example #6
    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''flash %(infile1)s %(infile2)s
        -p %(offset)s
        %(processing_options)s
        -o %(track)s
        -d %(outdir)s
        >& %(output_prefix)s-flash.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s;
        ''' % locals()

        return cmd
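
The read-processing builders in these examples all assemble a shell statement by interpolating local variables into a command template with "% locals()"; the string is then handed back to the pipeline runner rather than executed directly. A minimal sketch of just that pattern, with placeholder file names (build_flash_command is a hypothetical helper, not part of the pipeline):

def build_flash_command(infile1, infile2, track, outdir, processing_options=""):
    """Return a flash command string built from the enclosing local variables."""
    cmd = '''flash %(infile1)s %(infile2)s
    %(processing_options)s
    -o %(track)s
    -d %(outdir)s''' % locals()
    return cmd

print(build_flash_command("a_1.fastq.gz", "a_2.fastq.gz", "sampleA", "sampleA.dir"))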
Code Example #7
    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options
        threads = self.threads

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''pandaseq -f %(infile1)s -r %(infile2)s
        %(processing_options)s
        -T %(threads)i
        -U >(gzip > %(outfile)s.unpaired.gz)
        -w >(gzip > %(outfile)s)
        -F
        -G %(output_prefix)s-pandaseq.log.bgz;
        >& %(output_prefix)s-pandaseq.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        ''' % locals()

        return cmd
Code Example #8
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy. This is so that the results are comparable
    to species level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[
            0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.iteritems():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
Code Example #9
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match primer sequence (14bp)'''
    to_cluster = True
    primer = "a"
    if infile.find("_b.") > 0:
        primer = "b"
    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    infile2 = track + ".fastq.2.gz"
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match
    fastq_in = IOTools.openFile(infile)
    fastq_out = IOTools.openFile(tempfile, "w")
    for read in fq.iterate(fastq_in):
        if read.seq[:grep_filter_length] == primer_subseq:
            fastq_out.write("@" + read.id + "\n")
            fastq_out.write(read.seq + "\n")
            fastq_out.write("+\n")
            fastq_out.write(read.qual + "\n")
    fastq_in.close()
    fastq_out.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-filename-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
Code Example #10
    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options
        threads = self.threads

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''pandaseq -f %(infile1)s -r %(infile2)s
        %(processing_options)s
        -T %(threads)i
        -U >(gzip > %(outfile)s.unpaired.gz)
        -w >(gzip > %(outfile)s)
        -F
        -G %(output_prefix)s-pandaseq.log.bgz;
        >& %(output_prefix)s-pandaseq.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        ''' % locals()

        return cmd
Code Example #11
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options
        r = {33: 'sanger', 64: 'illumina', 59: 'solexa'}
        quality = r[offset]

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            cmd = '''sickle se
            -g %(processing_options)s
            --qual-type %(quality)s
            --output-file %(outfile)s
            --fastq-file %(infile)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            cmd = '''sickle pe
            -g -s %(processing_options)s
            --qual-type %(quality)s
            -f %(infile1)s -r %(infile2)s
            -o %(outfile1)s -p %(outfile2)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        return cmd
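
Fastq.getOffset("sanger", raises=False) returns the ASCII offset of the quality encoding, which the builder translates into tool-specific flags (in the map above: 33 for sanger, 64 for illumina, 59 for solexa). The encoding itself is simply chr(quality + offset); a short sketch of decoding, assuming a fixed Sanger offset of 33 (decode_quals is a hypothetical helper used only for illustration):

def decode_quals(qual_string, offset=33):
    """Convert a FASTQ quality string into a list of Phred scores."""
    return [ord(c) - offset for c in qual_string]

# 'I' is ASCII 73, so with the Sanger offset of 33 it decodes to Q40
print(decode_quals("IIII#"))   # -> [40, 40, 40, 40, 2]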
Code Example #12
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match primer sequence (14bp)'''
    to_cluster = True
    primer = "a"
    if infile.find("_b.") > 0:
        primer = "b"
    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    infile2 = track + ".fastq.2.gz"
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match
    fastq_in = IOTools.openFile(infile)
    fastq_out = IOTools.openFile(tempfile, "w")
    for read in fq.iterate(fastq_in):
        if read.seq[:grep_filter_length] == primer_subseq:
            fastq_out.write("@" + read.id + "\n")
            fastq_out.write(read.seq + "\n")
            fastq_out.write("+\n")
            fastq_out.write(read.qual + "\n")
    fastq_in.close()
    fastq_out.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
Code Example #13
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options
        r = {33: 'sanger', 64: 'illumina', 59: 'solexa'}
        quality = r[offset]

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            cmd = '''sickle se
            -g %(processing_options)s
            --qual-type %(quality)s
            --output-file %(outfile)s
            --fastq-file %(infile)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            cmd = '''sickle pe
            -g -s %(processing_options)s
            --qual-type %(quality)s
            -f %(infile1)s -r %(infile2)s
            -o %(outfile1)s -p %(outfile2)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        return cmd
Code Example #14
    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''flash %(infile1)s %(infile2)s
        -p %(offset)s
        %(processing_options)s
        -o %(track)s
        -d %(outdir)s
        >& %(output_prefix)s-flash.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s;
        ''' % locals()

        return cmd
Code Example #15
File: Sra.py Project: mmaarriiee/cgat
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters:

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.
    """
    
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))
        else:
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format
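
The comment block above documents the naming produced by fastq-dump --split-files: one file for single-end data, _1/_2 for read pairs, and _1/_2/_3 when paired reads were stored in a single read with a joining sequence, in which case files 1 and 3 hold the reads of interest. A hypothetical helper expressing just that selection logic, assuming basenames follow exactly that convention (select_read_files is not part of Sra.py):

def select_read_files(filenames):
    """Pick the fastq files that carry the reads, given fastq-dump output."""
    names = sorted(filenames)
    if len(names) <= 2:
        return names
    if len(names) == 3 and names[2].endswith("_3.fastq.gz"):
        # the middle file holds the joining sequence; keep reads 1 and 3
        return [names[0], names[2]]
    return names

print(select_read_files(["x_1.fastq.gz", "x_2.fastq.gz", "x_3.fastq.gz"]))
# -> ['x_1.fastq.gz', 'x_3.fastq.gz']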
Code Example #16
File: fastq2N.py Project: BioinformaticsArchive/cgat
def replace(fastqfile, baseToReplace):
    '''replaces the specified base with N'''

    # use gzip as default to open the fastq file
    outf = gzip.open("replaced_" + fastqfile, "w")
    fastq = gzip.open(fastqfile)
    iterator = Fastq.iterate(fastq)
    for record in iterator:
        x = list(record.seq)
        x[int(baseToReplace)] = "N"
        record.seq = "".join(x)
        outf.write("@" + record.identifier + "\n" + record.seq + "\n" + "+" + record.identifier + "\n" + record.quals + "\n")
Code Example #17
File: fastq2N.py Project: santayana/cgat
def replace(fastqfile, baseToReplace):
    '''replaces the specified base with N'''

    # use gzip as default to open the fastq file
    outf = gzip.open("replaced_" + fastqfile, "w")
    fastq = gzip.open(fastqfile)
    iterator = Fastq.iterate(fastq)
    for record in iterator:
        x = list(record.seq)
        x[int(baseToReplace)] = "N"
        record.seq = "".join(x)
        outf.write("@" + record.identifier + "\n" + record.seq + "\n" + "+" +
                   record.identifier + "\n" + record.quals + "\n")
Code Example #18
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical
    coverage we would get over genomes in the 
    sample i.e. at 1X coverage
    '''

    # if paired end then will have to multiply
    # by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # get the expected genome size
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.iteritems():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
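
The comment above defines theoretical coverage as (read length (L) × number of reads (N)) / genome size (G). As a worked sketch of that formula with illustrative numbers only (expected_coverage is a hypothetical helper, not part of the pipeline):

def expected_coverage(read_length, n_reads, genome_size):
    """Theoretical coverage: (read length * number of reads) / genome size."""
    return (read_length * n_reads) / float(genome_size)

# e.g. 100 bp reads, 50,000 reads, 5 Mb genome -> 1x coverage
print(expected_coverage(100, 50000, 5e6))   # -> 1.0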
Code Example #19
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical
    coverage we would get over genomes in the 
    sample i.e. at 1X coverage
    '''

    # if paired end then will have to multiply
    # by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # get the expected genome size
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.iteritems():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
Code Example #20
    def build(self, infile, outfile, processer_list):
        '''run mapper.'''
        f_format = Fastq.guessFormat(
            IOTools.openFile(infile[0], "r"), raises=False)

        cmd_process, cmd_post, processed_files = self.process(
            infile[0], processer_list, outfile, f_format, save=self.save)
        cmd_clean = self.cleanup(outfile)

        assert cmd_process.strip().endswith(";")
        assert cmd_post.strip().endswith(";")
        assert cmd_clean.strip().endswith(";")

        statement = " checkpoint; ".join((cmd_process,
                                          cmd_post,
                                          cmd_clean))
        return statement
Code Example #21
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-pm",
                      "--profilematrix",
                      dest="matrixfile",
                      type="string",
                      help="name of profile file you want to convert")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)
    #outf = IOTools.openFile("my_output", "w")
    for line in IOTools.openFile(options.matrixfile):
        line = line.strip()
        fields = line.split()
        total = sum([float(col) for col in fields[1:]])
        if total == 0:
            continue
        else:
            for i, col in enumerate(fields):
                if i == 0:
                    continue
                fields[i] = float(col) / total
    options.stdout.write("\t".join(map(str, fields)))

    for fasta_read in FastaIterator.iterate(IOTools.openFile(
            options.fastafile)):
        read_sequence = fasta_read.sequence
        read_name = fasta_read.title
        quals = '.' * len(read_sequence)

        new_fastq = Fastq.Record(identifier=read_name,
                                 seq=read_sequence,
                                 quals=quals)
        new_fastq.fromPhred([30] * len(read_sequence), format='illumina-1.8')
        options.stdout.write(str(new_fastq) + "\n")
    # write footer and output benchmark information.
    E.Stop()
Code Example #22
def preprocessIdba(infile, outfile):
    '''
    preprocess pooled reads for IDBA
    '''
    # check for second read in the pair
    if infile.endswith(".fastq.gz"):
        E.info("converting fastq file to fasta file")
        outf = open(outfile, "w")
        for fastq in Fastq.iterate(IOTools.openFile(infile)):
            outf.write("%s\n%s\n" % (">" + fastq.identifier, fastq.seq))
        outf.close()
    elif infile.endswith(".1.gz"):
        read2 = P.snip(infile, ".1.gz") + ".2.gz"
        assert os.path.exists(read2), "file does not exist %s" % read2

        statement = '''python %(scriptsdir)s/fastqs2fasta.py
                   -a %(infile)s
                   -b %(read2)s
                   --log=%(infile)s.log
                   > %(outfile)s'''
        P.run()
Code Example #23
File: Sra.py Project: BioXiao/cgat
def peek(sra, outdir):
    ''' returns the full file names for all files which will be extracted'''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(outdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))
        else:
            f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    return f, fastq_format
Code Example #24
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        assert len(infiles) == len(outfiles)

        cmds = []
        for infile, outfile in zip(infiles, outfiles):

            cmds.append('''zcat %(infile)s
            | fastx_trimmer
            -Q%(offset)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            | gzip > %(outfile)s
            ;''' % locals())

        return " checkpoint; ".join(cmds)
Code Example #25
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        assert len(infiles) == len(outfiles)

        cmds = []
        for infile, outfile in zip(infiles, outfiles):

            cmds.append('''zcat %(infile)s
            | fastx_trimmer
            -Q%(offset)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            | gzip > %(outfile)s
            ;''' % locals())

        return " checkpoint; ".join(cmds)
Code Example #26
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        threads = self.threads
        processing_options = self.processing_options
        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]

            cmd = '''trimmomatic SE
            -threads %(threads)i
            -phred%(offset)s
            %(infile)s %(outfile)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles

            cmd = '''trimmomatic PE
            -threads %(threads)i
            -phred%(offset)s
            %(infile1)s %(infile2)s
            %(outfile1)s %(output_prefix)s.1.unpaired
            %(outfile2)s %(output_prefix)s.2.unpaired
            %(processing_options)s
            2>> %(output_prefix)s.log;
            checkpoint;
            gzip %(output_prefix)s.*.unpaired;
            ''' % locals()

        return cmd
Code Example #27
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        threads = self.threads
        processing_options = self.processing_options
        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]

            cmd = '''trimmomatic SE
            -threads %(threads)i
            -phred%(offset)s
            %(infile)s %(outfile)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles

            cmd = '''trimmomatic PE
            -threads %(threads)i
            -phred%(offset)s
            %(infile1)s %(infile2)s
            %(outfile1)s %(output_prefix)s.1.unpaired
            %(outfile2)s %(output_prefix)s.2.unpaired
            %(processing_options)s
            2>> %(output_prefix)s.log;
            checkpoint;
            gzip %(output_prefix)s.*.unpaired;
            ''' % locals()

        return cmd
Code Example #28
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options
        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            outdir = os.path.dirname(outfile)
            trim_out = "%s/%s_trimmed.fq.gz" % (
                outdir, infile.replace(".fastq.gz", ""))
            cmd = '''trim_galore %(processing_options)s
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile)s
            2>>%(output_prefix)s.log;
            mv %(trim_out)s %(outfile)s;
            ''' % locals()
            outfiles = (outfile, )

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            outdir = os.path.dirname(outfile1)
            cmd = '''trim_galore %(processing_options)s
            --paired
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile1)s %(infile2)s
            2>>%(output_prefix)s.log;
            mv %(outdir)s/%(infile1)s_val_1.fq.gz %(outfile1)s;
            mv %(outdir)s/%(infile2)s_val_2.fq.gz %(outfile2)s;
            ''' % locals()

        return cmd
Code Example #29
    def build(self, infiles, outfiles, output_prefix):

        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            # the command template below references outdir
            outdir = os.path.dirname(outfile)
            trim_out = "%s_trimmed.fq.gz" % (output_prefix)
            cmd = '''trim_galore %(processing_options)s
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile)s
            2>>%(output_prefix)s.log;
            mv %(trim_out)s %(outfile)s;
            ''' % locals()
            outfiles = (outfile,)

        elif self.num_files == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            # the command template below references outdir
            outdir = os.path.dirname(outfile1)

            cmd = '''trim_galore %(processing_options)s
            --paired
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile1)s %(infile2)s
            2>>%(output_prefix)s.log;
            mv %(infile1)s_val_1.fq.gz %(outfile1)s;
            mv %(infile2)s_val_2.fq.gz %(outfile2)s;
            ''' % locals()

        return cmd
Code Example #30
File: extract_umi.py Project: jdblischak/UMI-tools
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-barcode", dest="split", action="store_true",
                      help="barcode is split across read pair")
    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--supress-stats", dest="stats", action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(split=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = E.Start(parser, argv=argv)

    # check options
    if not options.pattern:
        raise ValueError("must specify a pattern using ``--bc-pattern``")

    if options.split:
        if not options.read2_in:
            raise ValueError("must specify a paired fastq ``--read2-in``")

        if not options.pattern2:
            options.pattern2 = options.pattern

    if options.read2_in:
        if not options.read2_out:
            raise ValueError("must specify an output for the paired end "
                             "``--read2-out``")


    # Initialise the processor
    processor = Extractor(options.pattern, options.pattern2, options.prime3)
    read1s = Fastq.iterate(options.stdin)

    if options.read2_in is None:

        for read in read1s:
            options.stdout.write(str(processor(read)) + "\n")

    else:

        read2s = Fastq.iterate(IOTools.openFile(options.read2_in))
        read2_out = IOTools.openFile(options.read2_out, "w")

        for read1, read2 in zip(read1s, read2s):
            new_1, new_2 = processor(read1, read2)
            options.stdout.write(str(new_1) + "\n")
            read2_out.write(str(new_2) + "\n")

    # write footer and output benchmark information.

    if options.stats:
     
        options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
        for id in processor.bc_count:
            options.stdlog.write("\t".join(id+(str(processor.bc_count[id]),)) + "\n")
                             
    E.Stop()
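
In extract_umi.py the barcode pattern uses N for random (UMI) bases and X for fixed bases, and the extractor splits the read prefix accordingly. A minimal sketch of that splitting, assuming the pattern sits at the 5' end of the read (split_barcode is a hypothetical helper; the project's Extractor class does more than this):

def split_barcode(sequence, pattern):
    """Split a read prefix into UMI (N positions) and fixed (X positions) bases."""
    prefix = sequence[:len(pattern)]
    umi = "".join(base for base, p in zip(prefix, pattern) if p == "N")
    fixed = "".join(base for base, p in zip(prefix, pattern) if p == "X")
    rest = sequence[len(pattern):]
    return umi, fixed, rest

print(split_barcode("ACGTTTGCCAGGT", "NNNXXX"))
# -> ('ACG', 'TTT', 'GCCAGGT')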
Code Example #31
File: fastq2table.py Project: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The default behaviour of the script is to guess the quality "
        "format of the input fastq file. The user can specify the "
        "quality format of the input file using the --guess-format option. "
        "The script will use this format if the "
        "sequence qualities are ambiguous.[default=%default].")

    parser.add_option(
        "--target-format",
        dest="target_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script will convert quality scores to the destination "
        "format unless [default=%default].")

    parser.set_defaults(
        target_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.target_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.target_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write(
            "%s\t%i\t%i\t%s\n" %
            (record.identifier, nfailed, nns, str(Stats.Summary(quals))))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Code Example #32
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--guess-format",
                      dest="guess_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'illumina-1.8',
                               'integer'),
                      help="The default behaviour of the script is to guess \
        the quality format of the input fastq file. The user \
        can specify the quality format of the input file using \
        the --format option. The script will use this format if \
        sequences qualities are ambiguous.[default=%default].")

    parser.add_option(
        "-f",
        "--target-format",
        dest="change_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
        help="The script guesses the quality format of the input \
        file and converts quality scores to the destination \
        format unless --format is specified [default=%default].")

    parser.set_defaults(change_format=None, guess_format=None, min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += len([x for x in quals if x < min_quality])
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\tmean_quality\tmedian_quality\tnfailed\n"
    )

    options.stdout.write("%i\t%i\t%s\t%s\t%s\t%s\t%i\n" %
                         (number_of_reads, number_of_bases, str(mean_length),
                          str(median_length), str(mean_quality),
                          str(median_quality), bases_below_min))
    E.Stop()
Code Example #33
File: Sra.py Project: ptdtan/cgat
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters
    ----------

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.

    Returns
    -------
    files : list
        A list of fastq formatted files that are contained in the archive.
    format : string
        The quality score format in the :term:`fastq` formatted files.

    """
    
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))
        else:
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format
Code Example #34
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--target-format",
                      dest="change_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="set quality scores to format "
                      "[default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern-identifier",
                      dest="pattern",
                      type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(change_format=None, guess_format=None, pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.open_file(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.open_file(options.pattern % "qual", "w")

    if options.change_format:
        iter = Fastq.iterate_convert(options.stdin,
                                     format=options.change_format,
                                     guess=options.guess_format)
    else:
        iter = Fastq.iterate(options.stdin)

    for record in iter:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
Code Example #35
File: fastq2table.py Project: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=(
                          'sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
                      help="The default behaviour of the script is to guess the quality format of the input fastq file. The user can specify \
                            the quality format of the input file using the --format option. The script will use this format if the \
                            sequence qualities are ambiguous.[default=%default]."  )

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=(
                          'sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'),
                      help="The script will guess the quality format of the input file and convert \
                            quality scores to the destination format unless --format is specified [default=%default]."  )

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    options.stdout.write("read\tnfailed\tnN\t%s\n" %
                         ("\t".join(Stats.Summary().getHeaders())))

    min_quality = options.min_quality

    for record in iterator:
        c.input += 1
        quals = record.toPhred()
        nfailed = len([x for x in quals if x < min_quality])
        nns = record.seq.count("N") + record.seq.count(".")
        options.stdout.write("%s\t%i\t%i\t%s\n" % (record.identifier,
                                                   nfailed,
                                                   nns,
                                                   str(Stats.Summary(quals))
                                                   ))
        c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Code Example #36
    def preprocess( self, infiles, outfile ):
        '''build preprocessing statement

        Build a command line statement that extracts/converts 
        various input formats to fastq formatted files.

        Mapping qualities are changed to solexa format.

        returns the statement and the fastq files to map.
        '''

        assert len(infiles) > 0, "no input files for mapping"

        tmpdir_fastq = P.getTempDir()

        # create temporary directory again for nodes
        statement = [ "mkdir -p %s" % tmpdir_fastq ]
        fastqfiles = []

        # get track by extension of outfile
        track = os.path.splitext( os.path.basename( outfile ) )[0]

        if self.compress:
            compress_cmd = "| gzip"
            extension = ".gz"
        else:
            compress_cmd = ""
            extension = ""

        for infile in infiles:

            if infile.endswith( ".export.txt.gz"):
                # single end illumina export
                statement.append( """gunzip < %(infile)s 
                     | awk '$11 != "QC" || $10 ~ /(\d+):(\d+):(\d+)/ \
                        { if ($1 != "") 
                             { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);}
                        else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); }
                       printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}'
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension ),) )
            elif infile.endswith( ".fa.gz" ):
                statement.append( '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals() )
                fastqfiles.append( ("%s/%s.fa" % (tmpdir_fastq, track ),) )
                self.datatype = "fasta"
                
            elif infile.endswith( ".sra"):
                # sneak preview to determine if paired end or single end
                outdir = P.getTempDir()
                # --split-files is present in fastq-dump 2.1.7
                P.execute( "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals() )
                # --split-files will create files called prefix_#.fastq.gz
                # where # is the read number. 
                # The following cases are:

                # * file contains paired end data: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                #    * special case: unpaired reads in a paired end run end up in prefix.fastq.gz
                #    * special case: if paired reads are stored in a single read, fastq-dump will split.
                #       There might be a joining sequence. The output would thus be:
                #       prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
                #      You want files 1 and 3.
                f = sorted(glob.glob( os.path.join( outdir, "*.fastq.gz" ) ))
                ff = [ os.path.basename(x) for x in f ]
                if len(f) == 1: 
                    # sra file contains one read: output = prefix.fastq.gz
                    pass
                elif len(f) == 2:
                    # sra file contains read pairs: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                    assert ff[0].endswith( "_1.fastq.gz") and ff[1].endswith( "_2.fastq.gz" )
                elif len(f) == 3:
                    if ff[2].endswith( "_3.fastq.gz"):
                        f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                    else:
                        f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                E.info("sra file contains the following files: %s" % f )
                shutil.rmtree( outdir )
                fastqfiles.append( [ "%s/%s" % (tmpdir_fastq, os.path.basename( x )) for x in sorted(f) ] )
                statement.append( "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals() )
                
            elif infile.endswith( ".fastq.gz" ):
                format = Fastq.guessFormat( IOTools.openFile( infile, "r"), raises = False)
                if 'sanger' not in format and self.convert:
                    statement.append(  """gunzip < %(infile)s 
                                      | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, ) )

            elif infile.endswith( ".csfasta.gz" ):
                # single end SOLiD data
                if self.preserve_colourspace:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                    if not os.path.exists( quality ):
                        raise ValueError( "no quality file for %s" % infile )
                    statement.append(  """gunzip < %(infile)s 
                                          > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals() )
                    statement.append(  """gunzip < %(quality)s 
                                          > %(tmpdir_fastq)s/%(track)s.qual%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.csfasta%s" % (tmpdir_fastq, track, extension ),
                                        "%s/%s.qual%s" % (tmpdir_fastq, track, extension) ) )
                    self.datatype = "solid"
                else:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"

                    statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )

            elif infile.endswith( ".csfasta.F3.gz" ):
                # paired end SOLiD data
                if self.preserve_colourspace:
                    bn = P.snip( infile, ".csfasta.F3.gz" )
                    # order is important - tophat expects the reads first, then the quality files
                    f = []
                    for suffix in ("csfasta.F3", "csfasta.F5", "qual.F3", "qual.F5" ):
                        fn = "%(bn)s.%(suffix)s" % locals()
                        if not os.path.exists( fn + ".gz"): raise ValueError( "expected file %s.gz missing" % fn )
                        statement.append( """gunzip < %(fn)s.gz
                                          %(compress_cmd)s
                                          > %(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s""" % locals() )
                        f.append( "%(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s" % locals() )
                    fastqfiles.append( f )
                    self.datatype = "solid"
                else:
                    # only the F3 reads are converted here; use the matching F3 quality file
                    quality = P.snip( infile, ".csfasta.F3.gz" ) + ".qual.F3.gz"

                    statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                

            elif infile.endswith( ".fastq.1.gz" ):

                bn = P.snip( infile, ".fastq.1.gz" )
                infile2 = "%s.fastq.2.gz" % bn
                if not os.path.exists( infile2 ):
                    raise ValueError("can not find paired ended file '%s' for '%s'" % (infile2, infile))
                
                format = Fastq.guessFormat( IOTools.openFile( infile ), raises = False )
                if 'sanger' not in format:
                    statement.append( """gunzip < %(infile)s 
                                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                     %(compress_cmd)s
                                     > %(tmpdir_fastq)s/%(track)s.1.fastq%(extension)s;
                                     gunzip < %(infile2)s 
                                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                     %(compress_cmd)s
                                     > %(tmpdir_fastq)s/%(track)s.2.fastq%(extension)s
                                 """ % locals() )
                    fastqfiles.append( ("%s/%s.1.fastq%s" % (tmpdir_fastq, track, extension),
                                        "%s/%s.2.fastq%s" % (tmpdir_fastq, track, extension) ) )

                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, 
                                        infile2, ) )
                    
            else:
                raise NotImplementedError( "unknown file format %s" % infile )

        
        self.tmpdir_fastq = tmpdir_fastq

        assert len(fastqfiles) > 0, "no fastq files for mapping"

        return "; ".join( statement) + ";", fastqfiles
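For reference, the fastqfiles list assembled above holds one tuple per input track: a 1-tuple for single-end data, a 2-tuple for paired-end data, and longer tuples for colour-space data with separate quality files. A minimal sketch of its shape (temporary paths and track names invented for illustration):

    # illustrative only - temporary paths and track names are made up
    fastqfiles = [
        ("/tmp/ctmp_fastq/brain.fastq.gz",),                  # single-end track
        ("/tmp/ctmp_fastq/liver.1.fastq.gz",
         "/tmp/ctmp_fastq/liver.2.fastq.gz"),                 # paired-end track
    ]

The accompanying statement string chains the shell commands that create these files; running it (for example via P.run(), as in the pipeline examples below) before mapping materialises the listed files.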
Code example #37
File: pipeline_preprocess.py Project: yangjl/cgat
def processReads( infiles, outfile ):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    if PARAMS["process_combine_reads"]:
        E.warn("combining reads cannot be can not be combined with other processing for paired ended reads")
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str,[read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn("if specifying --max-overlap read and fragment length options will be ignored")
            max_overlap="--max-overlap=%i" % PARAMS["combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(fragment_options.strip().split(" ")) < 3:
            E.warn("have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS["combine_reads_fragment_length"] and PARAMS["combine_reads_fragment_length_stdev"]:
            if PARAMS["combine_reads_max_overlap"]:
                E.warn("--max-overlap will override the specified read and fragment length options")
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals() 
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS["combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS["combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS["combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
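        # Rendered example (hypothetical values, for illustration only): with
        # combine_reads_min_overlap=10 and combine_reads_threads=4 set and the
        # remaining options left empty, the interpolated command reads roughly
        #   flash --min-overlap=10 --output-prefix=<track> --threads=4 --compress
        #         <infile> <infile2> >> <outfile>.log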
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([track + x for x in  [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )
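    # e.g. a 'sanger' guess corresponds to an ASCII offset of 33 and a 'phred64'
    # guess to an offset of 64; the offset is handed to the fastx tools below
    # via their -Q option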

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True


    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""
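        # note: the doubled %% survives the first interpolation, so the
        # reconcile step receives --output-pattern=<track>.fastq.%s.gz and is
        # expected to write <track>.fastq.1.gz and <track>.fastq.2.gz
        # (an illustration of the intended expansion, not captured output)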
        
        P.run()

        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )
Code example #38
File: fastq2fastq.py Project: mint1234/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("apply", "change-format", "renumber-reads",
                               "sample", "sort", "trim3", "trim5", "unique",
                               "grep"),
                      help="method to apply [%default]")

    parser.add_option("--target-format",
                      dest="target_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer',
                               'illumina-1.8'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option("--pair-fastq-file",
                      dest="pair",
                      type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file",
        dest="map_tsv_file",
        type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option("--num-bases",
                      dest="nbases",
                      type="int",
                      help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed",
        dest="seed",
        type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier",
        dest="renumber_pattern",
        type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern",
        dest="grep_pattern",
        type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(method=None,
                        change_format=None,
                        guess_format=None,
                        sample_size=0.1,
                        nbases=0,
                        pair=None,
                        apply=None,
                        seed=None,
                        renumber_pattern="read_%010i",
                        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
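Taken together, the options defined above translate into invocations such as the following (file names invented for illustration; the script reads fastq records from stdin and writes to stdout):

    zcat in.fastq.gz | python fastq2fastq.py --method=sample --sample-size=0.1 --seed=42 > sampled.fastq
    zcat in.fastq.gz | python fastq2fastq.py --method=trim3 --num-bases=5 > trimmed.fastq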
Code example #39
 def setfastqAttr(self, infiles):
     self.offset = Fastq.getOffset(self.f_format, raises=False)
Code example #40
File: fastq2summary.py Project: Charlie-George/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64',
                               'illumina-1.8', 'integer'),
                      help="The default behaviour of the script is to guess \
                      the quality format of the input fastq file. The user \
                      can specify the quality format of the input file using \
                      the --format option. The script will use this format if \
                      sequence qualities are ambiguous [default=%default].")

    parser.add_option("-f", "--change-format", dest="change_format",
                      type="choice", choices=('sanger', 'solexa', 'phred64',
                                              'illumina-1.8', 'integer'),
                      help="The script guesses the quality format of the input \
                      file and converts quality scores to the destination \
                      format unless --format is specified [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        min_quality=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.change_format:
        iterator = Fastq.iterate_convert(options.stdin,
                                         format=options.change_format,
                                         guess=options.guess_format)
    else:
        iterator = Fastq.iterate_guess(options.stdin,
                                       guess=options.guess_format)

    min_quality = options.min_quality
    number_of_reads = 0
    number_of_bases = 0
    read_lengths = []
    read_qualities = []
    bases_below_min = 0

    for record in iterator:
        number_of_reads += 1
        quals = record.toPhred()
        length_read = len(quals)
        number_of_bases += length_read
        bases_below_min += len([x for x in quals if x < min_quality])
        read_lengths.append(length_read)
        read_qualities.append(np.mean(quals))

    mean_length = round(np.mean(read_lengths), 2)
    median_length = round(np.median(read_lengths), 2)
    mean_quality = round(np.mean(read_qualities), 2)
    median_quality = round(np.median(read_qualities), 2)

    options.stdout.write(
        "reads\tbases\tmean_length\tmedian_length\tmean_quality\tmedian_quality\tnfailed\n")

    options.stdout.write(
        "%i\t%i\t%s\t%s\t%s\t%s\t%i\n" % (number_of_reads, number_of_bases,
                                          str(mean_length),
                                          str(median_length),
                                          str(mean_quality),
                                          str(median_quality),
                                          bases_below_min))
    E.Stop()
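For reference, the script emits the header line followed by a single summary row; with invented numbers (the real output is tab-separated) it looks like:

    reads   bases     mean_length  median_length  mean_quality  median_quality  nfailed
    100000  7500000   75.0         75.0           33.42         34.0            52344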
Code example #41
File: fastqs2fastq.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(method="join", )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                # use a name other than 'c', which is the E.Counter above
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal
            # avoid rebinding the built-in sorted(), which would fail here
            ranked = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = ranked[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier, max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
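Given the argument check above, a typical invocation of this script is (file names invented for illustration):

    python fastqs2fastq.py --method=join read1.fastq.gz read2.fastq.gz > joined.fastq

The join method merges each read pair on the most frequent shared k-mer diagonal, as computed in the loop above, and writes the merged records to stdout.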
Code example #42
File: fastq2fastq.py Project: Q-KIM/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "apply",
                          "change-format",
                          "renumber-reads",
                          "sample",
                          "sort",
                          "trim3",
                          "trim5",
                          "unique",
                          "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Code example #43
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bamfile",
                      dest="bamfile",
                      type="string",
                      help="input bamfile to filter reads from")
    parser.add_option("-r",
                      "--reads",
                      dest="reads",
                      type="choice",
                      choices=("mapped", "unmapped"),
                      help="type of read to keep")
    parser.add_option("-s",
                      "--scriptsdir",
                      dest="scriptsdir",
                      type="string",
                      help="CGAT scripts directory")
    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="invert selection - if for example unmapped reads \
                            aren't output")

    parser.set_defaults(bamfile=None,
                        reads="mapped",
                        scriptsdir="/ifs/devel/nicki/cgat_git/cgat/scripts",
                        invert=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()
    c.input_alignments = 0
    c.input_reads = 0
    c.output_reads = 0

    # output text file for reads TO KEEP
    bam = pysam.Samfile(options.bamfile, "rb")
    temp = P.getTempFile(".")
    E.info("iterating over bam file")

    for alignment in bam.fetch(until_eof=True):
        c.input_alignments += 1
        if options.reads == "unmapped":
            if alignment.is_unmapped:
                #c.input_alignments += 1
                temp.write(alignment.qname + "\n")
        elif options.reads == "mapped":
            if not alignment.is_unmapped:
                #c.input_alignments += 1
                temp.write(alignment.qname + "\n")
    temp.close()

    tempname = temp.name

    E.info("filtering fastq file")
    # filter fastq file
    ids = set(IOTools.readList(IOTools.openFile(tempname).readlines()))
    c.input_alignments = len(ids)
    for fastq in Fastq.iterate(options.stdin):
        c.input_reads += 1
        if (fastq.identifier.endswith("/1") or fastq.identifier.endswith("/2")
            ) and " " not in fastq.identifier:
            identifier = fastq.identifier[:-2]
        elif len(fastq.identifier.split(" ")) == 2:
            identifier = fastq.identifier.split(" ")[0]
        else:
            identifier = fastq.identifier
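        # examples of the normalisation above (identifiers invented):
        #   "READ123/1"          -> "READ123"   (trailing /1 or /2 stripped)
        #   "READ123 1:N:0:ACGT" -> "READ123"   (part before the space kept)
        #   anything else is passed through unchanged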
        if not options.invert:
            if identifier in ids:
                c.output_reads += 1
                options.stdout.write("%s\n" % fastq)
        else:
            if identifier in ids: continue
            c.output_reads += 1
            options.stdout.write("%s\n" % fastq)

    E.info(c)

    os.unlink(tempname)

    # write footer and output benchmark information.
    E.Stop()
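This example carries no file name in the listing, but from the options defined above it would be invoked along these lines (script, bam and fastq names invented for illustration):

    zcat reads.fastq.gz | python filter_script.py --bamfile=aligned.bam --reads=unmapped > unmapped_reads.fastq

With --invert the selection is flipped, so reads whose identifiers were collected from the bam file are dropped instead of kept.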
Code example #44
File: fastqs2fastq.py Project: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                # use a name other than 'c', which is the E.Counter above
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal
            # avoid rebinding the built-in sorted(), which would fail here
            ranked = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = ranked[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Code example #45
File: fastq2fastq.py Project: logust79/cgat-apps
def process_cgat(options):

    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.open_file(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.open_file(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.read_list(IOTools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.open_file(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.open_file(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
    return c
Code example #46
File: pipeline_readqc.py Project: siping/cgat
def processReads( infiles, outfile ):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%i.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""
        
        P.run()

        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )
Code example #47
def processReads(infiles, outfile):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        E.warn(
            "combining reads can not be combined with other processing for paired ended reads"
        )
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored"
            )
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used"
            )
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options"
                )
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals(
            )
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn(
            "sampling can not be combined with other processing for paired ended reads"
        )
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        #              %(contamination_trim_type)s
        s = [
            '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        '''
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append(
            'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log'
        )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append(
            'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append(
            'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append(
            'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log'
        )
        do_sth = True

    if PARAMS["process_sample"]:
        s.append(
            'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log'
        )

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""

        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
Code example #48
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f",
        "--change-format",
        dest="change_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help=
        "guess quality score format and set quality scores to format [default=%default]."
    )

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--sample",
                      dest="sample",
                      type="float",
                      help="sample a proportion of reads [default=%default].")

    parser.add_option("--pair",
                      dest="pair",
                      type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair",
                      dest="outfile_pair",
                      type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--uniq",
        dest="uniq",
        action="store_true",
        help="remove duplicate reads (by name) [default=%default].")

    parser.add_option(
        "--apply",
        dest="apply",
        type="string",
        help=
        "apply a filter to fastq file (taking only reads in filename) [default=%default]."
    )

    parser.add_option("--trim3",
                      dest="trim3",
                      type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option(
        "--seed",
        dest="seed",
        type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--renumber-ids",
        dest="renumber_ids",
        type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        sample=None,
                        trim3=None,
                        pair=None,
                        apply=None,
                        uniq=False,
                        outfile_pair=None,
                        sort=None,
                        seed=None,
                        renumber_ids=None)

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
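        # Proportional sampling: each record (or read pair) is kept
        # independently with probability ``sample_threshold``; seeding the
        # random number generator makes the subsample reproducible.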
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)"
                )

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            # single-end input: sample records from stdin directly
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys: continue
            else: keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
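            # ``paste - - - -`` folds each four-line FASTQ record onto a single
            # tab-separated line so ``sort`` orders complete records by
            # identifier; ``tr`` then restores the four-line layout on stdout.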
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)"
                )
            E.warn(
                "consider sorting individual fastq files - this is memory intensive"
            )
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            assert len(
                set(entries1.keys()).intersection(set(entries2.keys()))
            ) == len(entries1), ("paired files do not contain the same reads; "
                                 "need to reconcile files")
            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    ## write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
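In the --sample branch above, read pairs stay synchronised because both FASTQ files are iterated in lockstep and a single random draw decides the fate of each pair. A stripped-down sketch of that idea, independent of the CGAT Fastq and IOTools helpers (names and toy data are illustrative), might look like this:

import random

def sample_pairs(pairs, proportion, seed=None):
    # One random draw per pair keeps or drops both mates together, so the two
    # output files stay reconciled without a separate fixing step.
    random.seed(seed)
    for read1, read2 in pairs:
        if random.random() <= proportion:
            yield read1, read2

# Toy data: identifiers stand in for full FASTQ records.
reads = [("read%i/1" % i, "read%i/2" % i) for i in range(100)]
kept = list(sample_pairs(reads, 0.3, seed=42))
print("kept %i of %i pairs" % (len(kept), len(reads)))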
Code Example #49
File: fastq2fastq.py Project: Charlie-George/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
                      help="guess quality score format and set quality scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
                      help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--sample", dest="sample", type="float",
                      help="sample a proportion of reads [default=%default].")

    parser.add_option("--pair", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair", dest="outfile_pair", type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--uniq", dest="uniq", action="store_true",
                      help="remove duplicate reads (by name) [default=%default].")

    parser.add_option("--apply", dest="apply", type="string",
                      help="apply a filter to fastq file (taking only reads in filename) [default=%default].")

    parser.add_option("--trim3", dest="trim3", type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort", dest="sort", action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option("--seed", dest="seed", type="int",
                      help="seed for random number generator [default=%default].")

    parser.add_option("--renumber-ids", dest="renumber_ids", type="string",
                      help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        sample=None,
        trim3=None,
        pair=None,
        apply=None,
        uniq=False,
        outfile_pair=None,
        sort=None,
        seed=None,
        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            # single-end input: sample records from stdin directly
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            assert len(set(entries1.keys()).intersection(set(entries2.keys()))) == len(entries1), \
                "paired files do not contain the same reads; need to reconcile files"
            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
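The --renumber-ids value is applied as a Python %-format pattern to a running counter (record.identifier = options.renumber_ids % id_count above). A short illustration, with a hypothetical pattern string:

# The pattern is applied as ``options.renumber_ids % id_count`` for each read,
# so a pattern such as "read_%010i" yields read_0000000001, read_0000000002, ...
pattern = "read_%010i"
for id_count in range(1, 4):
    print(pattern % id_count)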
Code Example #50
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--supress-stats", dest="stats", action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(pattern=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = E.Start(parser, argv=argv)

    # Initialise the processor
    processor = Extractor(options.pattern, options.prime3)
    read1s = Fastq.iterate(options.stdin)

    if options.read2_in is None:

        for read in read1s:
            options.stdout.write(str(processor(read)) + "\n")

    else:
        read2s = Fastq.iterate(IOTools.openFile(options.read2_in))
        read2_out = IOTools.openFile(options.read2_out)

        for read1, read2 in zip(read1s, read2s):
            new_1, new_2 = processor(read1, read2)
            options.stdout.write(str(new_1) + "\n")
            read2_out.write(str(new_2) + "\n")

    # write footer and output benchmark information.

    if options.stats:
        options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
        for id in processor.bc_count:
            options.stdlog.write("\t".join(id + (str(processor.bc_count[id]),)) + "\n")
    E.Stop()
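The Extractor class used in Code Example #50 is not shown here. Going only by the --bc-pattern help text (N marks a random base, X a fixed base) and the --3prime flag, a rough sketch of what such an extractor might do with a single read follows; this is an assumption about its behaviour, not the actual implementation:

def extract_barcode(sequence, quality, pattern, prime3=False):
    # Split len(pattern) bases off the 5' end (or the 3' end when prime3 is
    # set); bases at N positions form the random barcode (UMI), bases at X
    # positions form the fixed part, and the remainder is the trimmed read.
    size = len(pattern)
    if prime3:
        barcode, rest, rest_quals = sequence[-size:], sequence[:-size], quality[:-size]
    else:
        barcode, rest, rest_quals = sequence[:size], sequence[size:], quality[size:]
    umi = "".join(b for b, code in zip(barcode, pattern) if code == "N")
    fixed = "".join(b for b, code in zip(barcode, pattern) if code == "X")
    return umi, fixed, rest, rest_quals

# Hypothetical read with a 4-base UMI followed by a 2-base fixed barcode.
umi, fixed, seq, quals = extract_barcode("ACGTGGTTTTCCAA", "IIIIIIIIIIIIII", "NNNNXX")
print("UMI=%s fixed=%s trimmed read=%s" % (umi, fixed, seq))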