Code example #1
File: makeGeneset.py Project: harmeet1990/cgat-apps
def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = iotools.open_file(gtf)
    G = GTF.iterator(gfile)

    out = iotools.open_file(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))
    out.close()
    gfile.close()
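
The filter string encodes both the attribute column and the comparison; a minimal usage sketch follows (the GTF paths, column names and values are hypothetical):

# Hypothetical calls illustrating the filter-string formats parsed above.
filterGTF("geneset.gtf.gz", "gene_biotype=protein_coding+lincRNA", "kept.gtf")  # keep listed values
filterGTF("geneset.gtf.gz", "gene_biotype!=pseudogene", "kept.gtf")             # exclude listed values
filterGTF("geneset.gtf.gz", "gene_id-in_file-keep_ids.txt", "kept.gtf")         # keep ids read from a file
filterGTF("geneset.gtf.gz", "exon_number-lessthan-3", "kept.gtf")               # numeric threshold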
Code example #2
File: exome.py Project: kevinrue/cgat-flow
def CleanVariantTables(genes, variants, cols, outfile):
    variants = pd.read_csv(variants, sep="\t")
    variants = variants.drop(0)

    vp1 = copy.copy(
        variants[['CHROM', 'POS', 'QUAL', 'ID', 'REF1', 'ALT', 'GT']])
    alleles = vp1['REF1'].str.cat(vp1['ALT'].str.strip(),
                                  sep=",").str.split(",")

    # replace missing genotype characters (".") literally, not as a regex
    vp1['GT'] = vp1['GT'].str.replace(".", "0", regex=False)
    inds1 = vp1['GT'].str.get(0).astype(int).values
    inds2 = vp1['GT'].str.get(-1).astype(int).values
    x = 0
    a1s = []
    a2s = []
    gts = []
    homhet = []
    for allele in alleles:
        i1 = int(inds1[x])
        i2 = int(inds2[x])
        a1 = allele[i1]
        a2 = allele[i2]
        a1s.append(a1)
        a2s.append(a2)
        if a1 == a2:
            homhet.append("HOM")
        else:
            homhet.append("HET")
        gts.append("%s%s" % (a1, a2))
        x += 1
    vp1['HOMHET'] = homhet
    vp1['Allele1'] = a1s
    vp1['Allele2'] = a2s
    vp1['Genotype'] = gts
    vp1 = vp1.drop(columns=['REF1', 'ALT', 'GT'])
    vp1[cols] = copy.copy(variants[cols])

    Ls = []
    for gene in [
            line.strip() for line in iotools.open_file(genes[0]).readlines()
    ]:
        cp = []
        with iotools.open_file(genes[1]) as infile:
            for line in infile:
                r = re.search(gene, line)
                if r:
                    line = line.strip().split("\t")
                    chrom = line[0]
                    pos = line[1]
                    cp.append("%s_%s" % (chrom, pos))
        cp = set(cp)
        for c in cp:
            Ls.append((gene, c.split("_")))
    df = pd.DataFrame(Ls)
    df['CHROM'] = df[1].str.get(0)
    df['POS'] = df[1].str.get(1)
    df = df.drop(columns=[1])
    df.columns = ['gene', 'CHROM', 'POS']
    variants = vp1.merge(df, 'left')
    variants.to_csv(outfile, sep="\t")
Code example #3
File: farm.py Project: alsmith151/cgat-core
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.open_file(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = iotools.open_file(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)
    outfile.close()
    yield filename
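
A sketch of how this iterator might be driven, assuming the same module context as above; the input file, chunk size and output directory are placeholders (args[0] is the number of lines per chunk):

import os
import tempfile

# Hypothetical driver: split a table into chunks of 1000 data lines,
# repeating the header at the top of every chunk.
prefix = tempfile.mkdtemp()
with iotools.open_file("input.tsv.gz") as infile:
    for chunk in chunk_iterator_lines(infile, [1000], prefix, use_header=True):
        print(chunk, os.path.getsize(chunk))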
Code example #4
File: farm.py Project: alsmith151/cgat-core
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.open_file(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = iotools.open_file(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
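
Here args is positional: args[0] is a compiled regular expression marking the start of a record, args[2] the number of records per chunk and args[3] an optional cap on lines per chunk (args[1] is unused in this function). A hypothetical driver for FASTA-like input, with placeholder file names:

import re
import tempfile

# Hypothetical driver: start a new record at every ">" header line,
# 100 records per chunk, no line cap.
prefix = tempfile.mkdtemp()
with iotools.open_file("sequences.fasta") as infile:
    for chunk in chunk_iterator_regex_split(infile, [re.compile(">"), None, 100, 0],
                                            prefix, use_header=False):
        print(chunk)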
Code example #5
def main(argv):

    def _add_input(parser):
        parser.add_option("--data-dir", default=".")
        parser.add_option("--force", default=False, action="store_true")
        parser.add_option("--min-depth", default=0, type="int")
        parser.add_option("--follow-links", default=False, action="store_true")
        parser.add_option("--limit-metrics", default=0, type="int")
        parser.add_option("--output-filename-metrics")
        parser.add_option("--input-filename-metrics")

    P.initialize(argv, callback=_add_input)
    options = E.get_args()

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options))

    if os.path.exists("results.commit"):
        if not options.force:
            raise ValueError(
                "a results.commit file already exists. Please remove "
                "before uploading.")

    data_dir = os.path.abspath(options.data_dir)
    if options.input_filename_metrics:
        with IOTools.open_file(options.input_filename_metrics) as inf:
            infiles = [x.strip() for x in inf.readlines() if x.strip()]
        if options.limit_metrics:
            infiles = infiles[:options.limit_metrics]
    else:
        E.info(f"collecting files to upload starting in {data_dir}")
        infiles = []
        for root, dirs, files in os.walk(data_dir, followlinks=options.follow_links):
            E.debug(f"working on {root}: dirs={len(dirs)}, files={len(files)}")
            # ignore first level (tools) (needs better check)
            depth = root[len(data_dir):].count(os.sep)
            if "benchmark.info" in files:
                if depth <= options.min_depth:
                    E.info(f"skipping - depth not high enough: {depth}")
                else:
                    infiles.append(os.path.join(root, "benchmark.info"))

            if options.limit_metrics and len(infiles) > options.limit_metrics:
                E.info(f"stopping collection as {len(infiles)} reached")
                break

    E.info("found a potential {} benchmark.info files to upload".format(len(infiles)))
    if options.output_filename_metrics:
        with IOTools.open_file(options.output_filename_metrics, "w") as outf:
            outf.write("\n".join(infiles) + "\n")

    # find all files of interest
    oldwd = os.getcwd()
    os.chdir(data_dir)
    upload_result(infiles, "results.commit", PARAMS)
    os.chdir(oldwd)

    E.stop()
Code example #6
File: bed2graph.py Project: harmeet1990/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o", "--output-section", dest="output", type=str,
                        choices=("full", "name"),
                        help="output either ``full`` overlapping entries, only the ``name``s.")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two arguments required")

    if unknown[0] == "-":
        infile1 = args.stdin
    else:
        infile1 = iotools.open_file(unknown[0], "r")

    infile2 = iotools.open_file(unknown[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = args.output
    outfile = args.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.stop()
Code example #7
def FilterFreqCols(infile, thresh, fcols):
    '''
    Returns a dictionary of the called alleles' frequencies per column and a
    set of line indices for lines in which both called alleles have a
    frequency of at least thresh in one or more of the columns specified
    in fcols.
    Where no information is available an allele frequency of -1 is assigned.
    '''
    fcols = fcols.split(",")
    # read the column headings from the variant table
    cols = iotools.open_file(infile).readline().strip().split("\t")
    # store allele frequency columns
    AFdict = dict()
    # store low frequency indices
    nD = dict()
    for col in fcols:
        ind = cols.index(col)
        GT_i = cols.index('GT')
        n = 0
        nlist = set()
        AFS = []
        with iotools.open_file(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    GT = line[GT_i].replace(".", "0").split("/")
                    af = line[ind].split(",")
                    AF = []
                    # where the allele frequency is not numeric
                    # "." or "NA" use -1 to indicate no data
                    for a in af:
                        try:
                            AF.append(float(a))
                        except ValueError:
                            AF.append(float(-1))
                    AF2 = [l if l > 0 else 0 for l in AF]
                    AF = np.array(AF)
                    AF = np.insert(AF, 0, 1 - sum(AF2))
                    GT[0] = int(GT[0])
                    GT[1] = int(GT[1])
                    # If the variant is not in database the column shows "."
                    # but the site
                    # may still have been called as multi allelic
                    # - use -1 for all frequencies
                    # in this case
                    if max(GT[0], GT[1]) > (len(AF) - 1):
                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    if AF1 >= thresh and AF2 >= thresh:
                        nlist.add(n)
                    AFS.append((AF1, AF2))
                else:
                    AFS.append(('NA', 'NA'))
                n += 1
        AFdict[col] = AFS
        nD[col] = nlist

    ns = set.union(*list(nD.values()))
    return AFdict, ns
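
A hypothetical call; the table path, threshold and frequency column names are placeholders, and fcols must name columns present in the variant table header:

# Check two allele-frequency columns against a 1% threshold.
AFdict, common_lines = FilterFreqCols("variant_table.tsv", 0.01, "ExAC_AF,1000G_AF")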
Code example #8
def extend_bed(infile, outfile):
    inf = iotools.open_file(infile)

    outf = iotools.open_file(outfile, "w")

    replace(inf, outf)

    outf.close()
Code example #9
def pileup_to_quasar(infile, outfile):
    import collections
    prev_line = None
    line_buffer = list()

    outf = iotools.open_file(outfile, "w")

    fates = collections.Counter()

    for line in iotools.open_file(infile):
        fields = line.strip().split("\t")

        if not fields[3].upper() == fields[8].upper():
            fates["error in bases"] += 1
            continue

        if not (int(PARAMS["min_depth"]) <= int(fields[4]) <= int(
                PARAMS["max_depth"])):
            fates["bad read coverage"] += 1
            continue

        # strip read-end markers (a base followed by "$") and "^" read-start
        # markers together with the characters that follow them
        filt_alleles = re.sub(r'[a-zA-Z., ]\$', '', fields[5])
        filt_alleles = re.sub(r'\^..', '', filt_alleles)
        if len(filt_alleles) == 0:
            fates["read ends only"] += 1
            continue

        # "." and "," in a pileup denote matches to the reference base
        alleles = re.sub(r'[., ]', fields[3], fields[5])
        alleles = alleles.upper()
        ref = fields[3].upper()
        alt = fields[9].upper()

        ref_count = alleles.count(ref)
        alt_count = alleles.count(alt)

        outline = "\t".join([
            fields[0], fields[1], fields[2], ref, alt, fields[7], fields[10],
            str(ref_count),
            str(alt_count),
            str(int(fields[4]) - ref_count - alt_count)
        ])

        if (fields[0], fields[1]) == prev_line:
            line_buffer.append(outline)
        else:
            if len(line_buffer) == 1:
                outf.write(line_buffer[0] + "\n")
                fates["output"] += 1
            else:
                fates["duplicate lines"] += 1
            line_buffer = [outline]
            prev_line = (fields[0], fields[1])

    # flush the last buffered position, which the loop above never writes
    if len(line_buffer) == 1:
        outf.write(line_buffer[0] + "\n")
        fates["output"] += 1
    elif line_buffer:
        fates["duplicate lines"] += 1
    outf.close()
    log = iotools.open_file(outfile + ".log", "w")
    for key in fates.keys():
        log.write("\t".join((key, str(fates[key]))) + "\n")
    log.close()
Code example #10
    def run(self, infile, outfile, params):

        if "reference_fasta" in params._fields:
            reference_fasta = "REFERENCE_SEQUENCE={}".format(
                params.reference_fasta)
        else:
            reference_fasta = ""

        # the command can fail (for instance when no metrics are produced)
        # yet may still write output, hence ignore_errors below
        # 12G is required for java overhead
        retval = P.run("java -Xmx8000m -jar {params.path} "
                       "CollectMultipleMetrics "
                       "{reference_fasta} "
                       "INPUT={infile} "
                       "TMP_DIR=%(tmpdir)s "
                       "{params.options} "
                       "OUTPUT={outfile} "
                       ">& {outfile} ".format(**locals()),
                       job_memory="12G",
                       ignore_errors=True)

        def get_section(section, data):
            pattern = "## {}".format(section)
            keep = False
            result = []
            for line in data:
                if line.startswith("##"):
                    if line.startswith(pattern):
                        keep = True
                    else:
                        keep = False
                if keep:
                    result.append(line)
            return result

        for tablename in self.tablenames:
            filename = re.sub("histogram", "metrics", tablename)
            raw = filename[len("picard_"):]
            src = outfile + "." + raw
            dest = outfile + "." + tablename + ".tsv"

            if not os.path.exists(src):
                E.warn("no file {}, ignored".format(src))
                continue

            with IOTools.open_file(src) as inf:
                data = inf.readlines()

            if tablename.endswith("metrics"):
                data = get_section("METRICS", data)
            elif tablename.endswith("histogram"):
                data = get_section("HISTOGRAM", data)

            with IOTools.open_file(dest, "w") as outf:
                outf.write("".join(data))

        return retval
Code example #11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    fastq1 = iotools.open_file(args.fastq1)
    fastq2 = iotools.open_file(args.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")
            except PairedReadError as e:
                raise PairedReadError(e).with_traceback(sys.exc_info()[2])
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
Code example #12
File: test_iotools.py Project: pythseq/cgat-core
    def test_touch_file_updates_existing_file(self):
        with iotools.open_file(self.filename, "w") as outf:
            outf.write("some data\n")
        created = os.stat(self.filename).st_mtime
        time.sleep(1)
        iotools.touch_file(self.filename)
        modified = os.stat(self.filename).st_mtime
        self.assertGreater(modified, created)
        with iotools.open_file(self.filename) as inf:
            data = inf.read()
        self.assertEqual(data, "some data\n")
Code example #13
def filterDamage(infile, damagestr, outfiles):
    '''
    Filter variants which have not been assessed as damaging by any
    of the specified tools.
    Tools and thresholds can be specified in the pipeline.yml.

    Does not account for multiple alt alleles - if any ALT allele has
    been assessed as damaging with any tool the variant is kept,
    regardless of whether this is the allele called in the sample.

    '''
    damaging = damagestr.split(",")
    cols = iotools.open_file(infile).readline().strip().split("\t")

    D = dict()
    # parses the "damage string" from the pipeline.yml
    # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1...
    # where variants with any of these results in this column will
    # be retained
    for d in damaging:
        d = d.split("|")
        col = d[0]
        res = d[1].split("-")
        i = cols.index(col)
        D[col] = ((res, i))

    x = 0
    out = iotools.open_file(outfiles[0], "w")
    out2 = iotools.open_file(outfiles[1], "w")
    with iotools.open_file(infile) as input:
        for line in input:
            if x > 1:
                # grep for specific strings within this column of this
                # line of the input file
                line = line.strip().split("\t")
                isdamaging = 0
                for key in D:
                    res, i = D[key]
                    current = line[i]
                    for r in res:
                        if re.search(r, current):
                            isdamaging = 1
                if isdamaging == 1:
                    out.write("%s\n" % "\t".join(line))
                else:
                    out2.write("%s\n" % "\t".join(line))
            else:
                out.write(line)
            x += 1
    out.close()
    out2.close()
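
A sketch of the damage string format described in the docstring; the column names and result strings below are placeholders for whatever the annotation tools write into the table:

# COLUMN|result1-result2,COLUMN|result1,... - a variant is kept if any listed
# result string is found in the corresponding column.
damagestr = "SIFT_pred|deleterious,PolyPhen_pred|probably_damaging-possibly_damaging"
filterDamage("variant_table.tsv", damagestr, ["damaging.tsv", "not_damaging.tsv"])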
Code example #14
File: motifs.py Project: kevinrue/cgat-flow
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression search on sequences.
    compute counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(("ER%i" % x,
                       re.compile(motif + "." * x + reverse_motif,
                                  re.IGNORECASE)))

    db_positions = Motifs.countMotifs(iotools.open_file(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(iotools.open_file(controlfile, "r"),
                                           motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = iotools.open_file(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n"
    )
    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * \
                ncontrol / (ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write(
            "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
            (motif, db_counts[motif], control_counts[motif],
             db_seqcounts[motif],
             iotools.pretty_percent(db_seqcounts[motif],
                                    ndb), control_seqcounts[motif],
             iotools.pretty_percent(control_seqcounts[motif], ncontrol), fold))
Code example #15
def loadManualAnnotations(infile, outfile):

    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with iotools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with iotools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
Code example #16
File: exomeancestry.py Project: tw7649116/cgat-flow
def GenotypeSNPs(infile, snplist, outfile):
    '''
    Fetches the genotype from the variant tables for all samples
    for SNPs in the hapmap sample from makeRandomSNPSet.

    Complex sites are ignored (as simple SNPs are sufficient for these
    calculations).
    These are:
        Sites which failed QC (column 3 in the variant table is not PASS)
        Sites with more than 2 alleles defined (column 6 in the variant table
        contains more than one alternative allele)
        SNPs with more than one ID
        Indels
    '''
    out = iotools.open_file(outfile, "w")
    with iotools.open_file(infile) as inf:
        for line in inf:
            line = line.strip().split()
            # if the variant passed QC
            if line[4] == "PASS":
                genotype = line[7]
                # if the genotype looks normal e.g. 1/1
                if len(genotype) == 3:
                    # get the actual genotype (rather than the index)
                    if genotype[0] != ".":
                        ind1 = int(genotype[0])
                    else:
                        ind1 = 0
                    if genotype[2] != ".":
                        ind2 = int(genotype[2])
                    else:
                        ind2 = 0
                    A1 = line[5]
                    A2 = line[6].split(",")
                    AS = [A1] + A2

                    if len(AS) <= 2:
                        GT = "%s%s" % (AS[ind1], AS[ind2])
                        refGT = "%s%s" % (A1, A1)
                        if len(GT) == 2:
                            if line[3][0:2] == "rs" and len(
                                    line[3].split(";")) == 1:
                                snpid = line[3]
                                chrom = line[0]
                                pos = line[1]
                                if snpid in snplist:
                                    out.write("%s\t%s\t%s\t%s\t%s\n"
                                              % (snpid, chrom, pos, GT,
                                                 refGT))
    out.close()
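
A hypothetical call, assuming the same module context; snplist is a collection of rs identifiers such as those produced by makeRandomSNPSet (file names are placeholders):

snplist = set(line.strip() for line in iotools.open_file("random_snp_set.tsv"))
GenotypeSNPs("sample1_variant_table.tsv", snplist, "sample1_genotypes.tsv")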
Code example #17
File: pipeline_atac.py Project: Acribbs/cribbslab
def generate_bedfile(infile, outfile):
    '''Convert to bed file 50bp +/- from summit'''

    infile = iotools.open_file(infile)
    outfile = iotools.open_file(outfile, "w")
    for line in infile:

        chrom, start, end, peak, value = line.strip().split("\t")
        start = int(start) - 50
        end = int(end) + 50

        outfile.write("%s\t%s\t%s\t%s\t%s\n" %
                      (chrom, start, end, peak, value))
    outfile.close()
Code example #18
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = iotools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = iotools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = iotools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write(
                    "%s\t%s\t%s\t%s\n" %
                    (fn, time.asctime(time.localtime(time.time())), linkdest,
                     "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))
    if not dry_run:
        outfile.close()

    return c
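
A minimal usage sketch; the glob pattern and log file name are placeholders, and the pipeline parameters are assumed to be loaded so that get_params() works:

import glob

# Zap intermediate BAM files and record their original metadata in zap.log.
clean(glob.glob("mapping.dir/*.bam"), "zap.log")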
Code example #19
def process_remote(infile):

    repository, acc = iotools.open_file(infile).readlines()[0].strip().split()

    if repository == "ENCODE":
        location, filetype = get_encode_file(acc)
    elif repository == "URL":
        location = acc
        if acc.endswith("gz"):
            filetype = ".".join(acc.split(".")[-2])
        else:
            filetype = acc.split(".")[-1]
    else:
        raise ValueError("repository %s not yet supported" % repository)

    tmpfile = P.get_temp_filename(shared=False, suffix="." + filetype)

    preamble = "wget %(location)s -O %(tmpfile)s --quiet &&"
    postamble = "&&  rm %(tmpfile)s"

    if filetype == "bam":
        preamble += "samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += ''' zcat %(tmpfile)s | sort -k1,1 -k2,2n | bgzip > %(tmp2)s &&
                        mv %(tmp2)s %(tmpfile)s &&
                        tabix -p bed %(tmpfile)s && '''
        postamble += "&& rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype
Code example #20
def parseMutectCallStats(infile, outfile):
    '''take the call stats outfile from mutect and summarise the
    reasons for variant rejection'''

    single_dict = collections.defaultdict(int)
    combinations_dict = collections.defaultdict(int)

    with iotools.open_file(infile) as infile:  # text mode, so fields compare as strings
        lines = infile.readlines()
        for i, line in enumerate(lines):
            if i < 2:
                continue
            values = line.strip().split("\t")
            judgement, justification = (values[-1], values[-2])
            if judgement == "REJECT":
                reasons = justification.split(",")
                if len(reasons) == 1:
                    single_dict[reasons[0]] += 1
                else:
                    for reason in reasons:
                        combinations_dict[reason] += 1

    df = pd.DataFrame([single_dict, combinations_dict])

    df = df.transpose()
    df.columns = ["single", "combination"]
    df = df.sort_values("single", ascending=False)
    df.index.name = "justification"
    df.to_csv(outfile, header=True, index=True, sep="\t")
Code example #21
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Code example #22
def buildOptimalPrimerSet(infiles, outfile):
    '''
    build a set of optimal primer pairs across sequences
    '''
    outf = IOTools.open_file(outfile, "w")
    outf.write("""name\tforward_seq\tforward_gc (%) \tforward_tm\tforward_length (bp)\treverse_seq\treverse_gc (%)\treverse_tm\treverse_length (bp)\tfragment_length (bp)\n""")
    for infile in infiles:
        primerset = PrimerSet()
        name = primerset.readName(infile)
        size = primerset.readSize(infile)
        forward = primerset.readForward(infile)
        E.info(forward)
        reverse = primerset.readReverse(infile)
        primerset = primerset.parse(attributes=[name, size] + list(forward) + list(reverse))
        outf.write("\t".join([primerset.name, 
                              primerset.forwardseq, 
                              primerset.forwardgc, 
                              primerset.forwardtm,
                              primerset.forwardlength,
                              primerset.reverseseq,
                              primerset.reversegc, 
                              primerset.reversetm, 
                              primerset.reverselength,
                              primerset.size]) + "\n")
    outf.close()
Code example #23
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`bed` format.
    '''

    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc = dbhandle.execute(sql)
        outfile = iotools.open_file(outfile, "w")
        for data in cc.fetchall():
            outfile.write("\t".join(map(str, data)) + "\n")
        outfile.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" % (table, outfile))
        iotools.touch_file(outfile)
Code example #24
    def run(self, infile, outfile, params):

        if params.reference_fasta_map is None:
            raise ValueError("bam2reference requires a reference sequence map")

        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)

        fasta = resolve_argument(list(reference_fasta_map.values()),
                                 ",").split(",")
        retval, diff = get_reference_for_bam(infile, fasta)
        if retval is None:
            if diff is None:
                retval = "corrupted"
            else:
                retval = "unknown"
                E.debug("differences: {}".format(str(diff)))
            path = ""
        else:
            map_path2name = dict([(x[1], x[0])
                                  for x in list(reference_fasta_map.items())])
            path = map_path2name.get(retval, os.path.basename(retval))

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("filename\treference\tpath\n")
            outf.write("\t".join((infile, retval, path)) + "\n")

        return None
Code example #25
File: split_gff.py Project: alphaneer/cgat-apps
    def createOpen(self, mode="w", header=None):
        """open file. Check first, if directory exists.
        """

        self.nchunk += 1
        filename = self.output_filename_pattern % self.nchunk

        if self.dry_run:
            E.info("opening file %s" % filename)
            return iotools.open_file("/dev/null", mode)

        if mode in ("w", "a"):
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)

        if os.path.exists(filename):
            existed = True
        else:
            existed = False

        f = iotools.open_file(filename, mode)

        if header and not existed:
            f.write(header + "\n")

        return f
Code example #26
def split_gtf_by_category(infiles, outfiles, catname):

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, sep="\t").squeeze("columns")

    # create output filepool
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)

    for gtfline in gtf.iterator(gtffile):

        try:
            transcript_id = gtfline.transcript_id
        except AttributeError:
            transcript_id = None

        try:
            gene_id = gtfline.gene_id
        except AttributeError:
            gene_id = None

        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    outpool.close()
Code example #27
File: motifs.py Project: kevinrue/cgat-flow
def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.
    '''

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        of = iotools.open_file(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s
        | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
        '''
        P.run(statement)
Code example #28
def read_table(filename, options):
    '''read table and filter as an iterator.
    '''

    if os.path.exists(filename):
        lines = iotools.open_file(filename, "r")
    else:
        lines = (x for x in [])

    # extract table by regular expression

    enumerated_lines = enumerate(lines)
    if options.regex_start:
        rx = re.compile(options.regex_start)
        for n, line in enumerated_lines:
            if rx.search(line):
                E.info("reading table from line %i" % n)
                if not line.startswith("#") and line.strip():
                    yield line
                break
        else:
            E.info("start regex not found - no table")

    if options.regex_end:
        rx = re.compile(options.regex_end)

    for n, line in enumerated_lines:

        if options.regex_end and rx.search(line):
            break

        if not line.startswith("#") and line.strip():
            yield line
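
read_table only touches two attributes of options; a hypothetical driver using an argparse Namespace as a stand-in (the file name and regular expressions are placeholders):

import argparse

# Pull the METRICS section out of a Picard-style report.
options = argparse.Namespace(regex_start="## METRICS CLASS", regex_end="## HISTOGRAM")
for line in read_table("picard.insert_size_metrics", options):
    print(line, end="")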
Code example #29
    def run(self, infiles, outfile, params):
        def _link(infile, outfile):
            if os.path.exists(os.path.abspath(outfile)):
                return

            dirname = os.path.dirname(outfile)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            os.symlink(infile, os.path.abspath(outfile))

        rx = re.compile(params.regex)

        outfiles = []
        for infile in infiles:

            outpath = os.path.join(
                os.path.dirname(outfile),
                rx.search(infile).expand(params.pattern_out))

            for suffix in self.suffixes:
                for fn in glob.glob(infile + suffix):
                    _link(fn, outpath + suffix)
            _link(os.path.abspath(infile), outpath)
            outfiles.append(outpath)

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("\n".join(outfiles) + "\n")
Code example #30
def getGeneTable(reffile):
    E.info("Loading reference")
    table = defaultdict(dict)
    for ens_gene in GTF.gene_iterator(GTF.iterator(
            IOTools.open_file(reffile))):
        geneid = ens_gene[0][0].gene_id
        table[geneid]["models"] = dict()
        table[geneid]["start_codons"] = defaultdict(list)

        for transcript in ens_gene:

            transcript_id = transcript[0].transcript_id
            table[geneid]["models"][transcript_id] = transcript

            CDS = GTF.asRanges(transcript, "start_codon")
            if len(CDS) == 0:
                continue

            if transcript[0].strand == "-":
                start_codon = max(e[1] for e in CDS)
            else:
                start_codon = min(e[0] for e in CDS)

            table[geneid]["start_codons"][start_codon].append(transcript_id)

    E.info("Reference Loaded")
    return table
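
A hypothetical lookup against the returned table; the GTF path and gene identifier are placeholders:

# List transcripts grouped by annotated start codon for one gene.
table = getGeneTable("ensembl_geneset.gtf.gz")
for start, transcript_ids in table["ENSG00000141510"]["start_codons"].items():
    print(start, ",".join(transcript_ids))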