Example #1
0
def computeOverlapGO(infile, outfile):
    '''compute overlap between coding markers and windows.
    Only markers of certain GO categories are counted.

    This is done by setting the gene_id and transcript_id of each marker
    to those of the ENSEMBL gene that it overlaps. The gene list is
    filtered first to keep only those ids with valid GO associations.
    '''

    to_cluster = False

    filter_goid = set(IOTools.readList(open(PARAMS["filename_gofilter"])))
    filter_genes = set()

    E.info("number of goids: %i" % len(filter_goid))

    for l in open(PARAMS["filename_go"]):
        f, id, goid, desc, evd = l[:-1].split("\t")[:5]
        if goid in filter_goid:
            filter_genes.add(id)

    tmpfile1 = P.getTempFile(dir=".")

    for line in open("ensembl.diff.genes_ovl"):

        a, b = line[:-1].split("\t")
        if b not in filter_genes: continue
        tmpfile1.write(line)

    E.info("number of genes taken: %i" % len(filter_genes))

    tmpfile1.close()
    tmpfilename1 = tmpfile1.name

    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=%(tmpfilename1)s \
    < %(infile)s > %(tmpfilename)s
    '''

    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
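
The two P.run() calls above rely on CGAT's statement-interpolation idiom: %(name)s placeholders in the shell statement are resolved against the caller's local variables merged with the global PARAMS dictionary. A minimal standalone sketch of that idiom (the build_statement helper and the toy values are assumptions, not part of the pipeline):

def build_statement(statement, local_vars, params):
    # In the original dict(locals().items() + PARAMS.items()), later
    # entries win, so PARAMS overrides same-named locals.
    substitutions = dict(list(local_vars.items()) + list(params.items()))
    return statement % substitutions

PARAMS = {"scriptsdir": "/usr/local/cgat/scripts"}
tmpfilename1 = "filter.tsv"
statement = ("python %(scriptsdir)s/gtf2gtf.py "
             "--rename=gene --apply=%(tmpfilename1)s")
print(build_statement(statement, {"tmpfilename1": tmpfilename1}, PARAMS))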
Example #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
                      help="guess quality score format and set quality scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
                      help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--sample", dest="sample", type="float",
                      help="sample a proportion of reads [default=%default].")

    parser.add_option("--pair", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair", dest="outfile_pair", type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--uniq", dest="uniq", action="store_true",
                      help="remove duplicate reads (by name) [default=%default].")

    parser.add_option("--apply", dest="apply", type="string",
                      help="apply a filter to fastq file (taking only reads in filename) [default=%default].")

    parser.add_option("--trim3", dest="trim3", type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort", dest="sort", action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option("--seed", dest="seed", type="int",
                      help="seed for random number generator [default=%default].")

    parser.add_option("--renumber-ids", dest="renumber_ids", type="string",
                      help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        sample=None,
        trim3=None,
        pair=None,
        apply=None,
        uniq=False,
        outfile_pair=None,
        sort=None,
        seed=None,
        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1), \
                "paired files do not contain the same reads; " \
                "need to reconcile files"
            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
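
The sampling branch draws a single random number per read pair so that both mates are kept or dropped together. A standalone sketch of that idea (the sample_pairs helper and its arguments are assumptions, not part of the script):

import random

def sample_pairs(pairs, proportion, seed=None):
    # One draw per pair keeps mates in sync, as in the paired branch
    # above; proportion is clamped to 1.0 just like sample_threshold.
    random.seed(seed)
    threshold = min(1.0, proportion)
    for read1, read2 in pairs:
        if random.random() <= threshold:
            yield read1, read2

# e.g. keep roughly 10% of pairs:
# kept = list(sample_pairs(zip(reads1, reads2), 0.1, seed=42))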
Example #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-f", "--exons-file", dest="filename_exons", type="string",
                      help="filename with transcript model information (gtf formatted file)  [default=%default].")
    parser.add_option("-r", "--filename-reference", dest="filename_reference", type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default].")
    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option("-k", "--with-knockouts", dest="with_knockouts", action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id",
             "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print "# collected variants:", variants

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print "# merged variants:", variants

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print "exon", key
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print "intron", key
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] + variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig - allele.reference_first_stop_start, )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
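
When writing GTF entries on the negative strand, the code flips intervals from reverse-strand to forward-strand coordinates by reflecting them against the contig length. A minimal sketch of that flip (to_forward is a hypothetical helper; intervals are half-open [start, end) as in the code above):

def to_forward(start, end, lcontig):
    # Reflecting against the contig length swaps start and end,
    # keeping the interval half-open.
    return lcontig - end, lcontig - start

# a 100 bp interval at [0, 100) on a 1000 bp contig maps to [900, 1000)
assert to_forward(0, 100, 1000) == (900, 1000)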
Example #5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t",
        "--tablename",
        dest="tablename",
        type="string",
        help=
        "tablename to get variants from (in samtools pileup format) [default=%default]."
    )
    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help=
        "filename with transcript model information (gtf formatted file)  [default=%default]."
    )
    parser.add_option(
        "-r",
        "--filename-reference",
        dest="filename_reference",
        type="string",
        help=
        "filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default]."
    )
    parser.add_option(
        "--vcf-file",
        dest="filename_vcf",
        type="string",
        help=
        "filename with variants in VCF format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--pileup-file",
        dest="filename_pileup",
        type="string",
        help=
        "filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help=
        "sample id for species of interest in vcf formatted file [default=%default]."
    )
    parser.add_option(
        "-s",
        "--seleno-tsv-file",
        dest="filename_seleno",
        type="string",
        help=
        "filename of a list of transcript ids that are selenoproteins [default=%default]."
    )
    parser.add_option("-m",
                      "--module",
                      dest="modules",
                      type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k",
        "--with-knockouts",
        dest="with_knockouts",
        action="store_true",
        help=
        "add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:], allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
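
Before tabular output, the list-valued fields of the allele namedtuple are flattened to comma-separated strings via _replace. A toy sketch of that idiom (this reduced Allele definition is an assumption; the real tuple has more fields):

import collections

Allele = collections.namedtuple(
    "Allele", ("cds_starts", "exon_starts", "frames"))
allele = Allele(cds_starts=[0, 120], exon_starts=[0, 300], frames=[0, 0])

# _replace returns a copy with the named fields swapped out, here
# converting integer lists into comma-separated strings for the table.
allele = allele._replace(
    cds_starts=",".join(map(str, allele.cds_starts)),
    exon_starts=",".join(map(str, allele.exon_starts)),
    frames=",".join(map(str, allele.frames)))
assert allele.cds_starts == "0,120"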
Example #6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage information to each line.")

    parser.add_option(
        "-t",
        "--header-names",
        dest="headers",
        type="string",
        help=
        "comma separated list of headers. If empty or set to '-', filenames are used."
    )

    parser.add_option("--skip-header",
                      dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--output-with-header",
                      dest="write_header",
                      action="store_true",
                      help="write header and exit.")

    parser.add_option("--with-title",
                      dest="with_title",
                      action="store_true",
                      help="use column titles in input data [%default].")

    parser.add_option("--no-title",
                      dest="with_title",
                      action="store_false",
                      help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    (options, args) = E.Start(parser)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

        if options.write_header:
            sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames."
                )

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(IOTools.openFile(f, "r"),
                                           with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    if not headers:
        if titles:
            headers = titles
        else:
            headers = args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)
            options.stdout.write(
                "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" %
                (headers[x], headers[y], l1, l2, len(set1.union(set2)),
                 len(set1.intersection(set2)), len(
                     set1.difference(set2)), len(set2.difference(set1))))

            if options.add_percent:
                if len(set1) == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif len(set2) == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    i = len(set1.intersection(set2))
                    ri, r1, r2 = (i / float(len(set1.union(set2))),
                                  len(set1.difference(set2)) / float(l1),
                                  len(set2.difference(set1)) / float(l2))
                    c1, c2 = (i / float(l1), i / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" +
                    ("\t".join([options.percent_format for z in range(6)])) %
                    (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.Stop()
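
The percentage columns are simple ratios of the set operations already computed for the flat output. A standalone sketch of those definitions for two non-empty sets (set_stats and its key names are assumptions mirroring the header row):

def set_stats(set1, set2):
    # pinter: intersection over union; punique*: fraction unique to
    # each set; pcov*: fraction of each set covered by the other.
    inter = len(set1.intersection(set2))
    union = len(set1.union(set2))
    c1 = inter / float(len(set1))
    c2 = inter / float(len(set2))
    return {
        "pinter": inter / float(union),
        "punique1": len(set1.difference(set2)) / float(len(set1)),
        "punique2": len(set2.difference(set1)) / float(len(set2)),
        "pcov1": c1,
        "pcov2": c2,
        "pcovmax": max(c1, c2),
    }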
Example #7
0
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (filename,
                     "\t".join(["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label, label2segments = collections.defaultdict(list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and options.remove_regex.search(contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1
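
The subset branch inverts per-label gene lists into a gene-to-labels lookup before streaming intervals, so each interval can be assigned to every subset its gene belongs to. A toy sketch of that inversion (the subsets dictionary is assumed example data):

import collections

geneid2label = collections.defaultdict(list)
subsets = {"setA": ["g1", "g2"], "setB": ["g2", "g3"]}
for label, gene_ids in subsets.items():
    for gene_id in gene_ids:
        # a gene may belong to several subsets, hence a list of labels
        geneid2label[gene_id].append(label)

assert sorted(geneid2label["g2"]) == ["setA", "setB"]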
Example #8
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true",
                      help="add percentage information to each line.")

    parser.add_option("-t", "--headers", dest="headers", type="string",
                      help="comma separated list of headers. If empty or set to '-', filenames are used.")

    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--write-header", dest="write_header", action="store_true",
                      help="write header and exit.")

    parser.add_option("--with-title", dest="with_title", action="store_true",
                      help="use column titles in input data [%default].")

    parser.add_option("--no-title", dest="with_title", action="store_false",
                      help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    (options, args) = E.Start(parser)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

        if options.write_header:
            sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames.")

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(
                open(f, "r"), with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    if not headers:
        if titles:
            headers = titles
        else:
            headers = args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)
            options.stdout.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" % (headers[x], headers[y],
                                                                     l1, l2,
                                                                     len(set1.union(
                                                                         set2)),
                                                                     len(set1.intersection(
                                                                         set2)),
                                                                     len(set1.difference(
                                                                         set2)),
                                                                     len(set2.difference(set1))))

            if options.add_percent:
                if len(set1) == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif len(set2) == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    i = len(set1.intersection(set2))
                    ri, r1, r2 = (
                        i / float(len(set1.union(set2))),
                        len(set1.difference(set2)) / float(l1),
                        len(set2.difference(set1)) / float(l2))
                    c1, c2 = (i / float(l1), i / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" + ("\t".join([options.percent_format for z in range(6)])) % (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.Stop()
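
The nested index loops compare every unordered pair of input sets exactly once. itertools.combinations expresses the same traversal more compactly; a sketch with toy data (an equivalent rewrite, not the script's code):

import itertools

sets = [set("ab"), set("bc"), set("cd")]
for (x, set1), (y, set2) in itertools.combinations(enumerate(sets), 2):
    # visits (0,1), (0,2), (1,2), matching range(x + 1, len(sets))
    print("%i vs %i: %i shared" % (x, y, len(set1 & set2)))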
Example #9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "apply",
                          "change-format",
                          "renumber-reads",
                          "sample",
                          "sort",
                          "trim3",
                          "trim5",
                          "unique",
                          "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
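A note on the "sample" method above: it makes an independent Bernoulli draw
per read (random.random() <= threshold), so a fixed --seed makes the subset
reproducible, and paired files stay synchronised because both are read in
lock-step against the same random number sequence. Below is a minimal
standalone sketch of this technique with no CGAT dependencies; the function
names are illustrative, not part of the original scripts.

import random
import sys

def iterate_fastq(handle):
    """Yield (identifier, seq, quals) tuples from a 4-line FASTQ stream."""
    while True:
        header = handle.readline().rstrip()
        if not header:
            return
        seq = handle.readline().rstrip()
        handle.readline()  # the '+' separator line
        quals = handle.readline().rstrip()
        yield header[1:], seq, quals

def sample_fastq(infile, outfile, proportion, seed=None):
    random.seed(seed)  # fixed seed => reproducible subset
    threshold = min(1.0, proportion)
    for name, seq, quals in iterate_fastq(infile):
        # one Bernoulli draw per record
        if random.random() <= threshold:
            outfile.write("@%s\n%s\n+\n%s\n" % (name, seq, quals))

if __name__ == "__main__":
    sample_fastq(sys.stdin, sys.stdout, 0.1, seed=42)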
Example #10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-f",
        "--change-format",
        dest="change_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help=
        "guess quality score format and set quality scores to format [default=%default]."
    )

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--sample",
                      dest="sample",
                      type="float",
                      help="sample a proportion of reads [default=%default].")

    parser.add_option("--pair",
                      dest="pair",
                      type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair",
                      dest="outfile_pair",
                      type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--uniq",
        dest="uniq",
        action="store_true",
        help="remove duplicate reads (by name) [default=%default].")

    parser.add_option(
        "--apply",
        dest="apply",
        type="string",
        help=
        "apply a filter to fastq file (taking only reads in filename) [default=%default]."
    )

    parser.add_option("--trim3",
                      dest="trim3",
                      type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option(
        "--seed",
        dest="seed",
        type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--renumber-ids",
        dest="renumber_ids",
        type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        sample=None,
                        trim3=None,
                        pair=None,
                        apply=None,
                        uniq=False,
                        outfile_pair=None,
                        sort=None,
                        seed=None,
                        renumber_ids=None)

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)"
                )

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)"
                )
            E.warn(
                "consider sorting individual fastq files - this is memory intensive"
            )
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"
            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
            c.output += 1

    ## write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
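Example #10 appears to be an older revision of the same fastq2fastq.py
script: each operation is its own option (--sample, --trim3, --uniq, ...)
rather than the single --method choice used elsewhere in these examples.
A small sketch of that consolidation using only stdlib optparse; the
E.OptionParser/E.Start wrappers are CGAT-specific, so this is an
approximation rather than the original API.

import optparse
import sys

def main(argv=None):
    parser = optparse.OptionParser(usage="%prog --method=METHOD < in.fastq")
    # one choice option replaces a set of mutually exclusive flags
    parser.add_option("--method", dest="method", type="choice",
                      choices=("sample", "trim3", "unique"),
                      help="operation to apply [default=%default]")
    parser.set_defaults(method=None)
    options, args = parser.parse_args(argv)
    if options.method is None:
        parser.error("please supply --method")
    sys.stderr.write("would dispatch to method: %s\n" % options.method)

if __name__ == "__main__":
    main()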
Example #11
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help=
        "aggregate name for annotations if only single file is provided from STDIN [default=None]."
    )

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option(
        "--output-filename-synonyms",
        dest="output_filename_synonyms",
        type="string",
        help=
        "output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]."
    )

    parser.add_option("-m",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "annotations-genes",
                               "annotations-go", "workspace",
                               "annotations-gff"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."
    )

    parser.add_option(
        "--remove-regex",
        dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        output_filename_synonyms=None,
        input_format="gff",
        remove_regex=None,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith("annotations"):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile(options.remove_regex)

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered(GFF.iterator(options.stdin),
                                         feature=options.feature)

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals(iterator, with_records=with_records)
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments(options.stdout,
                                              intervals,
                                              options.section,
                                              outfile_synonyms=outfile_synonyms,
                                              max_length=options.max_length,
                                              remove_regex=options.remove_regex)

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         feature=options.feature)

        geneid2categories = IOTools.readMultiMap(
            open(options.input_filename_map, "r"))

        category2segments = collections.defaultdict(list)

        for contig, gffs in GTF.readAsIntervals(iterator,
                                                with_gene_id=True).items():
            if options.remove_regex and options.remove_regex.search(contig):
                continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories:
                    continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

        for category, segments in category2segments.iteritems():
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (category, "\t".join(["%i" % x for x in segments])))
            E.info("set %s annotated with %i segments" %
                   (category, len(segments)))

    elif options.section == "annotations":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered(GFF.iterator(sys.stdin),
                                                 feature=options.feature)
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                with open(filename, "r") as infile:
                    iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                                     feature=options.feature)

            else:
                with open(filename, "r") as infile:
                    iterator = GTF.iterator_filtered(GFF.iterator(infile),
                                                     feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                raise ValueError("don't know how to filter %s" % filename)

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator(sys.stdin)
            else:
                iterator = GTF.iterator_filtered(
                    GFF.iterator(open(filename, "r")))

            segments = collections.defaultdict(list)
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append(
                    (gff.contig, gff.start, gff.end))

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    options.stdout.write(
                        "%s\t%i\t%s\t(%i,%i)\n" %
                        (prefix, nsegments, contig, start, end))
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

        for feature, id_range in feature2segments.iteritems():
            start, end = id_range
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (feature, "\t".join(["%i" % x for x in xrange(start, end)])))
            E.info("set %s annotated with %i segments" %
                   (feature, end - start))

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile(filename)
            iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                             feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                # output all
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label, label2segments = collections.defaultdict(
                    list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1

                for label, segments in label2segments.iteritems():
                    options.stdout.write(
                        "##Ann\t%s\t%s\n" %
                        (label, "\t".join(["%i" % x for x in segments])))
                    E.info("set %s (%s) annotated with %i segments" %
                           (label, filename, len(segments)))

    E.info("ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
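The gff2annotator2tsv.py example above emits the Annotator text format:
each interval becomes a numbered segment line ("##Id\t<id>\t<contig>\t
(<start>,<end>)") and each category lists its segment ids on a "##Ann"
line. A minimal sketch of just that output convention; the data and
function name below are illustrative only.

import collections
import sys

def write_annotations(outfile, intervals, prefix="##Id"):
    """intervals: iterable of (category, contig, start, end) tuples."""
    category2segments = collections.defaultdict(list)
    nsegments = 0
    for category, contig, start, end in intervals:
        # one numbered segment line per interval
        outfile.write("%s\t%i\t%s\t(%i,%i)\n" %
                      (prefix, nsegments, contig, start, end))
        category2segments[category].append(nsegments)
        nsegments += 1
    for category, segments in sorted(category2segments.items()):
        # one annotation line per category, listing its segment ids
        outfile.write("##Ann\t%s\t%s\n" %
                      (category, "\t".join("%i" % x for x in segments)))

if __name__ == "__main__":
    write_annotations(sys.stdout,
                      [("GO:0005634", "chr1", 100, 200),
                       ("GO:0005634", "chr2", 50, 80)])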
Example #12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("apply", "change-format", "renumber-reads",
                               "sample", "sort", "trim3", "trim5", "unique",
                               "grep"),
                      help="method to apply [%default]")

    parser.add_option("--target-format",
                      dest="target_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer',
                               'illumina-1.8'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option("--pair-fastq-file",
                      dest="pair",
                      type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file",
        dest="map_tsv_file",
        type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option("--num-bases",
                      dest="nbases",
                      type="int",
                      help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed",
        dest="seed",
        type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier",
        dest="renumber_pattern",
        type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern",
        dest="grep_pattern",
        type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(method=None,
                        change_format=None,
                        guess_format=None,
                        sample_size=0.1,
                        nbases=0,
                        pair=None,
                        apply=None,
                        seed=None,
                        renumber_pattern="read_%010i",
                        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
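The single-file "sort" branch above works because os.system() lets the
child shell inherit the script's stdin and stdout, so the paste/sort/tr
pipeline sees the FASTQ stream directly. An equivalent, more explicit
sketch with subprocess (same command string; the wiring is spelled out
rather than implied):

import subprocess
import sys

def sort_fastq_stream():
    # paste joins each 4-line FASTQ record into one tab-separated line,
    # sort orders the records by identifier, tr restores the 4-line layout
    cmd = "paste - - - - | sort -k1,1 -t ' ' | tr '\\t' '\\n'"
    subprocess.call(cmd, shell=True, stdin=sys.stdin, stdout=sys.stdout)

if __name__ == "__main__":
    sort_fastq_stream()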
Example #14
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bamfile",
                      dest="bamfile",
                      type="string",
                      help="input bamfile to filter reads from")
    parser.add_option("-r",
                      "--reads",
                      dest="reads",
                      type="choice",
                      choices=("mapped", "unmapped"),
                      help="type of read to keep")
    parser.add_option("-s",
                      "--scriptsdir",
                      dest="scriptsdir",
                      type="string",
                      help="CGAT scripts directory")
    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="invert selection - if for example unmapped reads \
                            aren't output")

    parser.set_defaults(bamfile=None,
                        reads="mapped",
                        scriptsdir="/ifs/devel/nicki/cgat_git/cgat/scripts",
                        invert=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()
    c.input_alignments = 0
    c.input_reads = 0
    c.output_reads = 0

    # output text file for reads TO KEEP
    bam = pysam.Samfile(options.bamfile, "rb")
    temp = P.getTempFile(".")
    E.info("iterating over bam file")

    for alignment in bam.fetch(until_eof=True):
        c.input_alignments += 1
        if options.reads == "unmapped":
            if alignment.is_unmapped:
                temp.write(alignment.qname + "\n")
        elif options.reads == "mapped":
            if not alignment.is_unmapped:
                temp.write(alignment.qname + "\n")
    temp.close()

    tempname = temp.name

    E.info("filtering fastq file")
    # filter fastq file
    ids = set(IOTools.readList(IOTools.openFile(tempname)))
    # record the number of distinct read names separately instead of
    # overwriting the alignment count collected above
    c.input_ids = len(ids)
    for fastq in Fastq.iterate(options.stdin):
        c.input_reads += 1
        if (fastq.identifier.endswith("/1") or fastq.identifier.endswith("/2")
            ) and " " not in fastq.identifier:
            identifier = fastq.identifier[:-2]
        elif len(fastq.identifier.split(" ")) == 2:
            identifier = fastq.identifier.split(" ")[0]
        else:
            identifier = fastq.identifier
        if not options.invert:
            if identifier in ids:
                c.output_reads += 1
                options.stdout.write("%s\n" % fastq)
        else:
            if identifier in ids:
                continue
            c.output_reads += 1
            options.stdout.write("%s\n" % fastq)

    E.info(c)

    os.unlink(tempname)

    # write footer and output benchmark information.
    E.Stop()
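Example #14 has to match read names from a BAM file (which typically carry
neither a "/1"/"/2" suffix nor a description field) against FASTQ
identifiers, so it normalises the FASTQ names first. That normalisation,
extracted into a standalone sketch (the function name is illustrative):

def normalise_identifier(identifier):
    """Return the read name shared by both mates of a pair."""
    if (identifier.endswith("/1") or identifier.endswith("/2")) \
            and " " not in identifier:
        return identifier[:-2]  # old Illumina style: NAME/1, NAME/2
    fields = identifier.split(" ")
    if len(fields) == 2:
        return fields[0]  # Casava 1.8 style: NAME DESCRIPTION
    return identifier

assert normalise_identifier("read1/1") == "read1"
assert normalise_identifier("read1 1:N:0:ACGT") == "read1"
assert normalise_identifier("read1") == "read1"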
Example #15
0
    if len(args) < 2:
        raise ValueError( "please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers=args
        else:
            headers=options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError ("please supply the same number of headers as there are filenames." )

    for f in args:
        if options.with_title:
            title, data = IOTools.readList( open(f,"r"), with_title = options.with_title )
            titles.append( title )
        else:
            data = IOTools.readList( open(f,"r") )
        sets.append( set( data ))
        
    if not headers:
        if titles:
            headers = titles
        else:
            headers = args

    for x in range(len(sets)-1):
        set1=sets[x]

        for y in range(x+1, len(sets)):
            set2=sets[y]