Python IOTools.getNumLines Examples

Programming Language: Python

Namespace/Package Name: CGAT

Class/Type: IOTools

Method/Function: getNumLines

Examples at hotexamples.com: 7

Python IOTools.getNumLines - 7 examples found. These are the top-rated real-world Python examples of CGAT.IOTools.getNumLines, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

openFile(30)

ReadMap(23)

ReadList(21)

isEmpty(14)

writeLines(9)

readMap(9)

which(8)

getInvertedDictionary(7)

readList(7)

prettyPercent(7)

zapFile(6)

convertDictionary(6)

snip(5)

FilePool(5)

iterate(5)

getNumLines(4)

readTable(4)

flatten(4)

readMultiMap(3)

str2val(3)

touchFile(3)

writeMatrix(3)

isComplete(2)

getLastLine(2)

readMatrix(2)

val2str(2)

human2bytes(1)

force_str(1)

cloneFile(1)

prettyFloat(1)

Example #1

Show file

File: pipeline_iCLIP.py Project: shulp2211/UMIpipe

def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample.

    For every file in *infiles* the line count is obtained with
    ``IOTools.getNumLines`` and the method and track are parsed from the
    file name.  The (method, track, count) rows are written to a
    temporary file obtained from ``P.getTempFilename(shared=True)``,
    which is then passed to ``P.load`` together with *outfile*.

    Parameters
    ----------
    infiles : iterable of str
        File names matching
        ``dedup_<method>.dir/<track>.clusters.bedgraph``.
    outfile : str
        Handed unchanged to ``P.load`` as its second argument.

    Raises
    ------
    AttributeError
        If a file name does not match the expected pattern
        (``re.match`` returns ``None``).
    '''

    tmp = P.getTempFilename(shared=True)
    try:
        results = []
        for infile in infiles:
            count = IOTools.getNumLines(infile)
            # raw string: "\." is an invalid escape sequence in a
            # normal string literal; the pattern itself is unchanged
            method, track = re.match(
                r"dedup_(.+).dir/(.+)\.clusters.bedgraph", infile).groups()
            results.append((method, track, count))

        IOTools.writeLines(tmp, results, header=["method", "track", "count"])

        P.load(tmp, outfile)
    finally:
        # remove the temporary file even if counting or loading failed
        if os.path.exists(tmp):
            os.unlink(tmp)

Example #2

Show file

File: Status.py Project: CGATOxford/CGATPipelines

    def testAnnotationIsPresent(self, track):
        '''
        PASS: File exists and is not empty

        FAIL: File exists and is empty (no data except comments)

        NA: File does not exist. This might indicate an error
            or simply that the annotation has not been computed.

        The value indicates the number of lines in the file.
        '''
        # NOTE(review): the docstring text is kept verbatim in case the
        # reporting framework displays it -- confirm before rewording.

        # the file to inspect is looked up in PARAMS under the track name
        path = PARAMS[track]
        if os.path.exists(path):
            line_count = IOTools.getNumLines(path)
            status = 'PASS' if line_count > 0 else 'FAIL'
            return (status, line_count)

        # a missing file is reported as 'NA' with a value of 0
        return ('NA', 0)

Example #3

Show file

    def testAnnotationIsPresent(self, track):
        '''
        PASS: File exists and is not empty

        FAIL: File exists and is empty (no data except comments)

        NA: File does not exist. This might indicate an error
            or simply that the annotation has not been computed.

        The value indicates the number of lines in the file.
        '''
        # NOTE(review): the docstring text is kept verbatim in case the
        # reporting framework displays it -- confirm before rewording.

        filename = PARAMS[track]

        # guard clause: nothing to count when the file is absent
        file_missing = not os.path.exists(filename)
        if file_missing:
            return ('NA', 0)

        # status depends only on whether any line was counted
        num_lines = IOTools.getNumLines(filename)
        outcome = 'FAIL' if not num_lines > 0 else 'PASS'
        return (outcome, num_lines)

Example #4

Show file

File: bam_vs_bed.py Project: CGATOxford/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intersects a bam file (or a bed file) with a reference bed file via
    ``bedtools intersect`` and writes a tab-separated table to
    ``options.stdout``: a header line, a ``total`` row
    (``samfile.mapped`` for bam input, the line count of the file
    otherwise) and one row per ``name`` of the reference bed file with
    the number of overlap records reported for it.

    Raises ValueError if the input files are not specified, if the
    reference bed file contains no intervals or has fewer than four
    columns, if the column count of a non-bam query file cannot be
    determined, or if the bedtools process exits with a non-zero status.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-a", "--bam-file", dest="filename_bam",
                      metavar="bam", type="string",
                      help="bam-file to use (required) [%default]")

    parser.add_option("-b", "--bed-file", dest="filename_bed",
                      metavar="bed", type="string",
                      help="bed-file to use (required) [%default]")

    parser.add_option(
        "-s", "--sort-bed", dest="sort_bed",
        action="store_true",
        help="sort the bed file by chromosomal location before "
        "processing. "
        "[%default]")

    parser.add_option(
        "--assume-sorted", dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. "
        "[%default]")

    parser.add_option(
        "--split-intervals", dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. "
        "[%default]")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    # fall back to positional arguments if neither option was given
    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file from its first entry;
    # stays None if the file contains no interval at all
    ncolumns_bed = None
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break

    if ncolumns_bed is None:
        # previously this surfaced as an unbound-variable error
        raise ValueError("no intervals found in bed file %s" % filename_bed)

    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about the file to count with (bam or bed)
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        try:
            total = samfile.mapped
        finally:
            # only the count is needed; do not keep the handle open
            samfile.close()
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name",
        "score", "strand", "thickstart", "thickend", "rgb",
        "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2",
        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    if ncolumns_bam == 0:
        # the file has lines but no parsable interval, so no grouping
        # key was defined above; fail before starting bedtools
        raise ValueError(
            "could not determine the bed format of %s" % filename_bam)

    # SNS: sorting is optional; it is on by default (see set_defaults)
    # and switched off with --assume-sorted.
    # NOTE(review): zcat implies the bed file is expected to be
    # gzip-compressed -- confirm.
    if options.sort_bed:
        bedcmd = "<( zcat %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if options.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % locals()

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement,
                 return_popen=True,
                 stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            # split() already discards the trailing newline; slicing
            # with line[:-1] would eat a data character on an
            # unterminated last line
            yield data._make(line.split()[:take_columns])

    for _, overlaps in itertools.groupby(
            iterate(IOTools.force_str(proc.stdout)), key=sort_key):
        for overlap in overlaps:
            counts_per_alignment[overlap.name2] += 1

    # all output has been consumed; make sure the process finished
    # cleanly so that truncated output is not reported as a result.
    # NOTE(review): assumes E.run(return_popen=True) returns a
    # subprocess.Popen-like object -- confirm.
    retcode = proc.wait()
    if retcode != 0:
        raise ValueError("error while executing statement %s" % statement)

    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.Stop()

Example #5

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intersects a bam file (or a bed file) with a reference bed file via
    ``bedtools intersect`` and writes a tab-separated table to
    ``options.stdout``: a header line, a ``total`` row
    (``samfile.mapped`` for bam input, the line count of the file
    otherwise) and one row per ``name`` of the reference bed file with
    the number of overlap records reported for it.

    Raises ValueError if the input files are not specified, if the
    reference bed file contains no intervals or has fewer than four
    columns, if the column count of a non-bam query file cannot be
    determined, or if the bedtools process exits with a non-zero status.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--min-overlap",
                      dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-a",
                      "--bam-file",
                      dest="filename_bam",
                      metavar="bam",
                      type="string",
                      help="bam-file to use (required) [%default]")

    parser.add_option("-b",
                      "--bed-file",
                      dest="filename_bed",
                      metavar="bed",
                      type="string",
                      help="bed-file to use (required) [%default]")

    parser.add_option("-s",
                      "--sort-bed",
                      dest="sort_bed",
                      action="store_true",
                      help="sort the bed file by chromosomal location before "
                      "processing. "
                      "[%default]")

    parser.add_option(
        "--assume-sorted",
        dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. "
        "[%default]")

    parser.add_option(
        "--split-intervals",
        dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. "
        "[%default]")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    # fall back to positional arguments if neither option was given
    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file from its first entry;
    # stays None if the file contains no interval at all
    ncolumns_bed = None
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break

    if ncolumns_bed is None:
        # previously this surfaced as an unbound-variable error
        raise ValueError("no intervals found in bed file %s" % filename_bed)

    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about the file to count with (bam or bed)
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.AlignmentFile(filename_bam, "rb")
        try:
            total = samfile.mapped
        finally:
            # only the count is needed; do not keep the handle open
            samfile.close()
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name", "score", "strand", "thickstart",
        "thickend", "rgb", "blockcount", "blockstarts", "blockends"
    ][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2", "score2", "strand2",
        "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2",
        "blockends2"
    ][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    if ncolumns_bam == 0:
        # the file has lines but no parsable interval, so no grouping
        # key was defined above; fail before starting bedtools
        raise ValueError(
            "could not determine the bed format of %s" % filename_bam)

    # SNS: sorting is optional; it is on by default (see set_defaults)
    # and switched off with --assume-sorted.
    # NOTE(review): zcat implies the bed file is expected to be
    # gzip-compressed -- confirm.
    if options.sort_bed:
        bedcmd = "<( zcat %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if options.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % locals()

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            # split() already discards the trailing newline; slicing
            # with line[:-1] would eat a data character on an
            # unterminated last line
            yield data._make(line.split()[:take_columns])

    for _, overlaps in itertools.groupby(iterate(
            IOTools.force_str(proc.stdout)),
                                         key=sort_key):
        for overlap in overlaps:
            counts_per_alignment[overlap.name2] += 1

    # all output has been consumed; make sure the process finished
    # cleanly so that truncated output is not reported as a result.
    # NOTE(review): assumes E.run(return_popen=True) returns a
    # subprocess.Popen-like object -- confirm.
    retcode = proc.wait()
    if retcode != 0:
        raise ValueError("error while executing statement %s" % statement)

    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.Stop()

Example #6

Show file

File: bam_vs_bed.py Project: Charlie-George/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intersects a bam file (or a bed file) with a reference bed file via
    ``intersectBed``, whose output is redirected into a temporary file,
    and writes a tab-separated table to ``options.stdout``: a header
    line, a ``total`` row (``samfile.mapped`` for bam input, the line
    count of the file otherwise) and one row per ``name`` of the
    reference bed file with the number of overlap records reported for
    it.  The temporary file is removed afterwards unless --keep-temp
    was given.

    Raises ValueError if the input files are not specified, if the
    reference bed file contains no intervals or has fewer than four
    columns, if the column count of a non-bam query file cannot be
    determined, or if the intersectBed statement returns a non-zero
    status.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-k", "--keep-temp", dest="keep_temp",
                      action="store_true",
                      help="do not delete temporary files [%default]")

    parser.add_option("-a", "--filename-bam", dest="filename_bam",
                      metavar="bam", type="string",
                      help="bam-file to use [%default]")

    parser.add_option("-b", "--filename-bed", dest="filename_bed",
                      metavar="bed", type="string",
                      help="bed-file to use [%default]")

    parser.set_defaults(
        min_overlap=0.5,
        keep_temp=False,
        filename_bam=None,
        filename_bed=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    # fall back to positional arguments if neither option was given
    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file from its first entry;
    # stays None if the file contains no interval at all
    ncolumns_bed = None
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break

    if ncolumns_bed is None:
        # previously this surfaced as an unbound-variable error
        raise ValueError("no intervals found in bed file %s" % filename_bed)

    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about the file to count with (bam or bed)
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        try:
            total = samfile.mapped
        finally:
            # only the count is needed; do not keep the handle open
            samfile.close()
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name",
        "score", "strand", "thickstart", "thickend", "rgb",
        "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2",
        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    if ncolumns_bam == 0:
        # the file has lines but no parsable interval, so no grouping
        # key was defined above; fail before running intersectBed
        raise ValueError(
            "could not determine the bed format of %s" % filename_bam)

    # create the temporary output file only now, so that the early
    # exits above cannot leave a stray file behind
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.close()
    tmpfilename = tmpfile.name

    # IMS: newer versions of intersectBed have a very high memory
    #     requirement unless passed sorted bed files.
    # NOTE(review): zcat implies the bed file is expected to be
    # gzip-compressed -- confirm.
    statement = """intersectBed %(format)s %(filename_bam)s
    -b <( zcat %(filename_bed)s | sort -k1,1 -k2,2n)
    -sorted -bed -wo -f %(min_overlap)f > %(tmpfilename)s""" % locals()

    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            # split() already discards the trailing newline; slicing
            # with line[:-1] would eat a data character on an
            # unterminated last line
            yield data._make(line.split()[:take_columns])

    try:
        E.info("running %s" % statement)
        retcode = E.run(statement)

        if retcode != 0:
            raise ValueError("error while executing statement %s" % statement)

        E.info("counting")

        with open(tmpfilename, "r") as infile:
            for _, overlaps in itertools.groupby(iterate(infile),
                                                 key=sort_key):
                for overlap in overlaps:
                    counts_per_alignment[overlap.name2] += 1
    finally:
        # clean up on success and on failure alike
        if not options.keep_temp:
            os.unlink(tmpfilename)

    # items() works on both Python 2 and 3 (iteritems() is Python 2
    # only); sorting makes the output order deterministic
    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.Stop()

Example #7

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intersects a bam file (or a bed file) with a reference bed file via
    ``intersectBed``, whose output is redirected into a temporary file,
    and writes a tab-separated table to ``options.stdout``: a header
    line, a ``total`` row (``samfile.mapped`` for bam input, the line
    count of the file otherwise) and one row per ``name`` of the
    reference bed file with the number of overlap records reported for
    it.  The temporary file is removed afterwards unless --keep-temp
    was given.

    Raises ValueError if the input files are not specified, if the
    reference bed file contains no intervals or has fewer than four
    columns, if the column count of a non-bam query file cannot be
    determined, or if the intersectBed statement returns a non-zero
    status.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-k", "--keep-temp", dest="keep_temp",
                      action="store_true",
                      help="do not delete temporary files [%default]")

    parser.add_option("-a", "--filename-bam", dest="filename_bam",
                      metavar="bam", type="string",
                      help="bam-file to use [%default]")

    # metavar was "bam" here, which mislabelled the bed argument in the
    # generated help text
    parser.add_option("-b", "--filename-bed", dest="filename_bed",
                      metavar="bed", type="string",
                      help="bed-file to use [%default]")

    parser.set_defaults(
        min_overlap=0.5,
        keep_temp=False,
        filename_bam=None,
        filename_bed=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    # fall back to positional arguments if neither option was given
    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file from its first entry;
    # stays None if the file contains no interval at all
    ncolumns_bed = None
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break

    if ncolumns_bed is None:
        # previously this surfaced as an unbound-variable error
        raise ValueError("no intervals found in bed file %s" % filename_bed)

    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about the file to count with (bam or bed)
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        try:
            total = samfile.mapped
        finally:
            # only the count is needed; do not keep the handle open
            samfile.close()
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = ["contig", "start", "end", "name",
                   "score", "strand", "thickstart", "thickend", "rgb",
                   "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend(["contig2", "start2", "end2", "name2",
                        "score2", "strand2", "thickstart2", "thickend2",
                        "rgb2", "blockcount2", "blockstarts2",
                        "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    if ncolumns_bam == 0:
        # the file has lines but no parsable interval, so no grouping
        # key was defined above; fail before running intersectBed
        raise ValueError(
            "could not determine the bed format of %s" % filename_bam)

    # create the temporary output file only now, so that the early
    # exits above cannot leave a stray file behind
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.close()
    tmpfilename = tmpfile.name

    # IMS: newer versions of intersectBed have a very high memory requirement unless
    #     passed sorted bed files.
    # NOTE(review): zcat implies the bed file is expected to be
    # gzip-compressed -- confirm.
    statement = """intersectBed %(format)s %(filename_bam)s -b <( zcat %(filename_bed)s | sort -k1,1 -k2,2n) -sorted -bed -wo -f %(min_overlap)f > %(tmpfilename)s""" % locals()

    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            # split() already discards the trailing newline; slicing
            # with line[:-1] would eat a data character on an
            # unterminated last line
            yield data._make(line.split()[:take_columns])

    try:
        E.info("running %s" % statement)
        retcode = E.run(statement)

        if retcode != 0:
            raise ValueError("error while executing statement %s" % statement)

        E.info("counting")

        with open(tmpfilename, "r") as infile:
            for _, overlaps in itertools.groupby(iterate(infile),
                                                 key=sort_key):
                for overlap in overlaps:
                    counts_per_alignment[overlap.name2] += 1
    finally:
        # clean up on success and on failure alike
        if not options.keep_temp:
            os.unlink(tmpfilename)

    # items() works on both Python 2 and 3 (iteritems() is Python 2
    # only); sorting makes the output order deterministic
    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.Stop()