Beispiel #1
0
def dodict(action, alias=None, name=None, always=False, clean=False):
    """Translate an annotated action string into a doit task dictionary.

    Markup recognised inside ``action`` (every marker is stripped from the
    command that is finally stored in the task):

    * ``d:item`` -- ``item`` is appended to the task's ``file_dep`` list.
    * ``D:item`` -- ``status(item)`` is stored in the config strings passed
      to ``config_changed``; ``item`` is NOT added to ``file_dep``.
    * ``t:item`` -- ``item`` is appended to the task's ``targets`` list.
    * ``T:item`` -- like ``t:``, and ``item`` is additionally passed to
      ``clean_targets`` ahead of the action; only allowed with ``clean=True``.
    * one or more spaces followed by ``#`` -- that text and the rest of the
      line are removed.

    Args:
        action: the command, as a string or as a list of strings that is
            joined with single spaces.
        alias: optional mapping used to fill ``{placeholders}`` in ``action``
            via ``str.format``. Entries of ``c_default_alias`` always win; a
            warning is written to stderr when they replace a different
            user-supplied value. The mapping passed in is not modified.
        name: optional task name. A non-string value is treated as an
            iterable whose items are stringified and joined with ``:``.
        always: when true, the first ``uptodate`` entry is ``False``.
        clean: must be true for ``T:`` targets to be accepted.

    Returns:
        A dict with the keys ``targets``, ``file_dep``, ``actions``,
        ``uptodate`` and ``verbosity`` (plus ``name`` when one was given).

    Raises:
        SystemExit: if a ``T:`` target is used while ``clean`` is false.
    """
    to_clean = []
    cstrings = {}
    # long actions can be supplied as list
    if isinstance(action, list):
        action = " ".join(action)
    cstrings["action_original"] = action
    # replace aliased items via python formatting; work on a copy so the
    # caller's mapping does not silently pick up the default aliases
    alias = {} if alias is None else dict(alias)
    for old, new in c_default_alias.items():
        if old in alias and alias[old] != new:
            # written with sys.stderr.write so the warning behaves the same
            # under Python 2 and Python 3 (the print-chevron form does not)
            warning = ("warning, default alias overwrite:", old, alias[old],
                       "<--", new)
            sys.stderr.write(" ".join(str(k) for k in warning) + "\n")
        alias[old] = new
    action = action.format(**alias)
    cstrings["action_formatted"] = action
    # process file deps
    file_dep = []
    for match in re.finditer(r"([Dd]):(.*?)(\s|$)", action):
        flag, item, _ = match.groups()
        if flag == "d":
            file_dep.append(item)
        elif flag == "D":
            # tracked through the config strings instead of file_dep
            cstrings[item] = status(item)
    action = re.sub(r"[Dd]:", "", action)
    # process targets
    targets = []
    for match in re.finditer(r"([Tt]):(.*?)(\s|$)", action):
        flag, item, _ = match.groups()
        if flag == "t":
            targets.append(item)
        elif flag == "T":
            if not clean:
                sys.exit(
                    "Lethal Error: Folder target 'T:' used without invoking clean=True"
                )
            targets.append(item)
            to_clean.append(item)
    action = re.sub(r"[Tt]:", "", action)
    if not targets:
        say("Action has no targets?\n\t{}".format(cstrings["action_original"]))
    # remove commented items
    action = re.sub(r" +#.*", "", action)
    cstrings["action_uncommented"] = action
    # expected task dictionary (augmented below); the first two actions are
    # (callable, positional-args) pairs that run before the command itself
    doitdict = {
        "targets": targets,
        "file_dep": file_dep,
        "actions": [(clean_targets, to_clean), (mkdirs, targets), action],
        "uptodate": [not always, config_changed(cstrings)],
        "verbosity": 2,
    }
    if name is not None:
        if not isinstance(name, str):
            name = ":".join([str(k) for k in name])
        doitdict["name"] = name
    # return task dictionary
    return doitdict
Beispiel #2
0
def fdr(pvalues, **kwargs):
    """Route p-values to the matching FDR helper based on their container type.

    A plain ``list`` goes to ``pvalues2values`` and a plain ``dict`` goes to
    ``pdict2qdict``; ``kwargs`` are forwarded untouched. The check is on the
    exact type, so subclasses of ``list``/``dict`` are rejected. Any other
    input is reported through ``say`` and ``None`` is returned.
    """
    container = type(pvalues)
    if container is dict:
        return pdict2qdict(pvalues, **kwargs)
    if container is list:
        return pvalues2values(pvalues, **kwargs)
    say("Can't FDR non-list, non-dict")
    return None
Beispiel #3
0
def deduplicate(items):
    """Make the strings in ``items`` unique, editing the list in place.

    Whenever an entry has already been seen, ``"-dup"`` is appended to it
    (repeatedly, if needed) until it no longer collides with an earlier
    entry. A single notice is emitted through ``zu.say`` if at least one
    entry had to be renamed. Nothing is returned.
    """
    taken = set()
    renamed_any = False
    for position, label in enumerate(items):
        if label in taken:
            renamed_any = True
            while label in taken:
                label += "-dup"
        items[position] = label
        taken.add(label)
    if renamed_any:
        zu.say("Some fields were deduplicated.")
Beispiel #4
0
def main():
    """Run the Fisher enrichment analysis from the command line.

    Reads the gene list (and optional background) with ``col2dict``, the
    gene sets with ``polymap``, runs ``fisher_enrich`` and writes one
    tab-delimited row per result, preceded by the ``c_fisher_fields`` header.
    Output goes to ``args.outfile`` when it is given, otherwise to stdout.

    Returns:
        None
    """
    args = get_args()
    # when linkage is requested col2dict is asked for a value column (1);
    # otherwise values come back as None and each gene maps to itself
    link_column = 1 if args.linked else None
    # load genes (accounting for linkage)
    genes = col2dict(
        args.genes,
        value=link_column,
        headers=args.skip_headers,
    )
    genes = {g: (g if k is None else k) for g, k in genes.items()}
    # load background (accounting for linkage)
    background = None
    if args.background is not None:
        background = col2dict(
            args.background,
            value=link_column,
            headers=args.skip_headers,
        )
        background = {
            g: (g if k is None else k)
            for g, k in background.items()
        }
    # load gene sets
    gene_sets = polymap(
        args.gene_sets,
        reverse=args.reversed_mapping,
    )
    # run analysis
    results = fisher_enrich(
        genes,
        gene_sets,
        depletions=not args.exclude_depletions,
        background=background,
        intersect_background=args.intersect_background,
        intersect_annotated=args.intersect_annotated,
        fdr=args.fdr,
        min_expected_overlap=args.min_expected_overlap,
        verbose=False,
    )
    # write results; only a handle opened here is closed, so sys.stdout
    # stays usable for the rest of the process, and the file is released
    # even if writing a row fails
    owns_handle = args.outfile is not None
    fh = open(args.outfile, "w") if owns_handle else sys.stdout
    try:
        writer = csv.writer(fh, dialect="excel-tab")
        writer.writerow(c_fisher_fields)
        for R in results:
            writer.writerow(R.row())
    finally:
        if owns_handle:
            fh.close()
    # wrapup
    if len(results) == 0:
        say("# NO SIGNIFICANT ENRICHMENTS")
    return None
Beispiel #5
0
def index(xx, yy):
    """Classify paired points by which coordinate is zero.

    Walks ``xx`` and ``yy`` in lockstep and returns three lists of
    positions, in this order:

    * positions where ``x == 0`` and ``y > 0``
    * positions where ``y == 0`` and ``x > 0``
    * positions where both ``x > 0`` and ``y > 0``

    Every other point (for example ``(0, 0)``) is dropped; if any were
    dropped, the count is reported through ``say``.
    """
    x_is_zero, y_is_zero, both_positive = [], [], []
    dropped = 0
    for position, pair in enumerate(zip(xx, yy)):
        xval, yval = pair
        x_pos = xval > 0
        y_pos = yval > 0
        if x_pos and y_pos:
            both_positive.append(position)
        elif x_pos and yval == 0:
            y_is_zero.append(position)
        elif y_pos and xval == 0:
            x_is_zero.append(position)
        else:
            dropped += 1
    if dropped > 0:
        say("zillplot ignoring", dropped, "( 0,0 ) points")
    return x_is_zero, y_is_zero, both_positive
Beispiel #6
0
def annotation_report(message, linking, annotations):
    """Print a coverage summary for a key -> link mapping.

    ``message`` is printed first as a heading. The set returned by
    ``generate_background(annotations)`` defines which links count as
    annotated. Key totals are always reported; link totals are reported only
    when at least one key maps to something other than itself. ``c_eps`` is
    added to each denominator when computing percentages.

    Returns:
        None
    """
    has_distinct_links = not all(key == link for key, link in linking.items())
    say(message)
    annotated = generate_background(annotations)
    # key-level counts
    total_keys = len(linking)
    covered_keys = {key for key, link in linking.items() if link in annotated}
    # link-level counts
    links = set(linking.values())
    total_links = len(links)
    covered_links = links & annotated
    # outer key results
    say("  Total keys: {:,}".format(total_keys))
    key_percent = 100 * len(covered_keys) / (c_eps + total_keys)
    say("  Annotated keys: {:,} ({:.1f}%)".format(len(covered_keys),
                                                  key_percent))
    # inner key (link) results
    if has_distinct_links:
        say("  Total links: {:,}".format(total_links))
        link_percent = 100 * len(covered_links) / (c_eps + total_links)
        say("  Annotated links: {:,} ({:.1f}%)".format(len(covered_links),
                                                       link_percent))
    return None
Beispiel #7
0
def main():
    """Run the rank-based enrichment analysis from the command line.

    Loads per-key values with ``col2dict`` (each row becomes a ``Link``),
    loads the gene sets with ``polymap``, runs ``rank_enrich`` and writes
    one tab-delimited row per result, preceded by the ``c_rank_fields``
    header. Output goes to ``args.outfile`` when it is given, otherwise to
    stdout.

    Returns:
        None
    """
    args = get_args()

    # load key values
    def make_link(row):
        # with --linked the second column is the link key, otherwise the
        # first column is; the last column is always the numeric value
        key = row[1] if args.linked else row[0]
        return Link(key, float(row[-1]))

    values = col2dict(
        args.values,
        func=make_link,
        headers=args.skip_headers,
    )
    # load key sets
    gene_sets = polymap(
        args.gene_sets,
        reverse=args.reversed_mapping,
    )
    # perform analysis
    results = rank_enrich(
        values,
        gene_sets,
        depletions=not args.exclude_depletions,
        intersect_annotated=args.intersect_annotated,
        fdr=args.fdr,
        min_overlap=args.min_overlap,
        verbose=True,
    )
    # write results; only a handle opened here is closed, so sys.stdout
    # stays usable for the rest of the process, and the file is released
    # even if writing a row fails
    owns_handle = args.outfile is not None
    fh = open(args.outfile, "w") if owns_handle else sys.stdout
    try:
        writer = csv.writer(fh, dialect="excel-tab")
        writer.writerow(c_rank_fields)
        for R in results:
            writer.writerow(R.row())
    finally:
        if owns_handle:
            fh.close()
    # wrapup
    if len(results) == 0:
        say("# NO SIGNIFICANT ENRICHMENTS")
    return None
Beispiel #8
0
 def __init__(self, gff_row, counter):
     """Build a locus from one split GFF row.

     Args:
         gff_row: sequence of raw column strings; must have exactly as
             many entries as ``c_gff_fields``, otherwise ``zu.die`` is
             called.
         counter: stored as ``self.index``.

     Each ``(name, type)`` pair in ``c_gff_fields`` becomes an attribute
     holding the converted column, except that a literal ``"."`` is kept
     as the string ``"."``. The ``attributes`` column is then parsed from
     ``key=value;key=value`` text into a dict.
     NOTE(review): assumes ``c_gff_fields`` defines ``attributes``,
     ``start``, ``end`` and ``strand`` -- they are read below.
     """
     # unique tag for locus based on position in GFF
     self.index = counter
     # gff fields
     if len(gff_row) != len(c_gff_fields):
         zu.die("Bad GFF row:", gff_row)
     for (fname, ftype), value in zip(c_gff_fields, gff_row):
         setattr(self, fname, ftype(value) if value != "." else value)
     # attributes
     temp = {}
     for item in self.attributes.split(";"):
         if "=" not in item:
             continue
         item = item.strip()
         # split on the first "=" only, so a value that itself contains
         # "=" is kept whole instead of raising on unpacking
         system, value = item.split("=", 1)
         if system in temp:
             zu.say("Warning: Multiple definitions for system", system)
         temp[system] = value
     self.attributes = temp
     # name is the "ID" attribute when present, otherwise None
     self.name = self.attributes.get("ID", None)
     self.code = ":".join([str(self.start), str(self.end), self.strand])
Beispiel #9
0
def preprocess_annotations(annotations, min_size):
    """Optionally drop small annotation sets and report the counts.

    Args:
        annotations: mapping of annotation -> sized collection of members.
        min_size: minimum ``len`` a collection needs to be kept, or ``None``
            to keep everything.

    Returns:
        The filtered mapping (a new dict) when ``min_size`` is given,
        otherwise the mapping that was passed in.
    """
    loaded = len(annotations)
    kept = loaded
    if min_size is not None:
        annotations = dict(
            (term, members) for term, members in annotations.items()
            if len(members) >= min_size)
        kept = len(annotations)
    say("Annotations:")
    say("  Loaded: {:,}".format(loaded))
    if kept != loaded:
        # loaded is non-zero here, since kept can only differ by being smaller
        percent_kept = 100.0 * kept / loaded
        say("  After filtering: {:,} ({:.1f}%)".format(kept, percent_kept))
    return annotations
Beispiel #10
0
def progress(counter, annotations):
    """Report, through ``say``, that annotation ``counter`` of
    ``len(annotations)`` is being tested (both numbers right-aligned to
    width 5)."""
    total = len(annotations)
    template = "Testing annotation {: >5d} of {: >5d}"
    say(template.format(counter, total))
Beispiel #11
0
#!/usr/bin/env python

from __future__ import print_function

import os
import sys
import re
from collections import OrderedDict

from zopy.utils import try_open, say, die

# biopython is an optional dependency: the module still imports without it,
# but SeqIO will be undefined for any code path that needs it
try:
    from Bio import SeqIO
except ImportError:
    say("zopy.bio imported without biopython")

# ---------------------------------------------------------------
# fasta
# ---------------------------------------------------------------


def read_fasta(path, full_headers=False):
    fdict = OrderedDict()
    with try_open(path) as fh:
        for line in fh:
            line = line.strip()
            if line[0] == ">":
                header = line[1:]
                if not full_headers:
                    header = header.split()[0].rstrip("|")
            else:
Beispiel #12
0
    "-m",
    "--mode",
    choices=["piped", "piped_humann"],
    help="special sorting options",
)
args = parser.parse_args()

# ---------------------------------------------------------------
# load all data
# ---------------------------------------------------------------

dictTableData = {}
# modified for faster looking up 4/2/2015
dictFeatureIndex = {}

say("Will load:", len(args.inputs), "gathered from command line")

# optionally extend the input list with one path per line from args.file
if args.file is not None:
    before = len(args.inputs)
    with open(args.file) as fh:
        for line in fh:
            strListed = line.strip()
            # skip blank lines so an empty path is never queued for loading
            if strListed:
                args.inputs.append(strListed)
    after = len(args.inputs)
    say("Will load:", after - before, "additional files gathered from:",
        args.file)

for iDex, strPath in enumerate(args.inputs):
    # say() is called with message parts only (as in the calls above), so
    # the stream object must not be passed as the first part
    say("Loading", iDex + 1, "of", len(args.inputs), ":", strPath)
    aastrData = []
    # column header: bare file name when full names are requested,
    # otherwise whatever path2name derives from the path
    strColhead = (os.path.split(strPath)[1]
                  if args.use_full_names else path2name(strPath))
Beispiel #13
0
 def report(self, *args, **kwargs):
     """Emit a message prefixed with ``self.sourcename`` and ``"::"``.

     The positional arguments are stringified and joined with single
     spaces. With ``die=True`` the message is passed to ``zu.die``;
     otherwise it is passed to ``zu.say``, but only when ``self.verbose``
     is true.
     """
     prefix = self.sourcename
     message = " ".join(str(piece) for piece in args)
     parts = [prefix, "::", message]
     fatal = kwargs.get("die", False)
     if fatal:
         zu.die(*parts)
     elif self.verbose:
         zu.say(*parts)
Beispiel #14
0

def get_args():
    """Parse the command line and return the resulting namespace.

    Positionals: ``fasta1``, ``fasta2``.
    Options: ``-t/--top`` (int, default 1), ``-l/--local`` (flag),
    ``-k/--k-size`` (int, default 3), ``-c/--compress`` (float, default
    None).
    """
    parser = argparse.ArgumentParser()
    for positional in ("fasta1", "fasta2"):
        parser.add_argument(positional)
    # (flags, add_argument keyword settings), registered in this order
    option_table = [
        (("-t", "--top"), {"default": 1, "type": int}),
        (("-l", "--local"), {"action": "store_true"}),
        (("-k", "--k-size"), {"default": 3, "type": int}),
        (("-c", "--compress"), {"default": None, "type": float}),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    zu.say("Loading fasta1")
    fasta1 = read_fasta(args.fasta1)
    zu.say("Loading fasta2")
    fasta2 = read_fasta(args.fasta2)
    zu.say("Indexing fasta2")
    I = Index(k=args.k_size)
    I.update_from_dict(fasta2)
    if args.compress:
        zu.say("Compressing index")
        I.compress(args.compress)
    zu.say("Searching")
    for i, name1 in enumerate(fasta1):
        seq = fasta1[name1]
        hits = I.score(seq, top=args.top, local=args.local)
        for hit in hits:
            zu.tprint(
Beispiel #15
0

def trunc_normal(m, sd, zmax):
    """Draw from a normal distribution, rejecting draws too far from the mean.

    Samples ``normal(m, sd)`` repeatedly until a draw lies strictly less
    than ``zmax`` standard deviations from ``m``, and returns that draw.
    NOTE(review): ``normal`` is whatever sampler this module has in scope
    (presumably ``numpy.random.normal``) -- confirm.

    Args:
        m: mean of the distribution.
        sd: standard deviation. ``0`` is treated as a degenerate
            distribution and ``m`` itself is returned.
        zmax: exclusive bound on ``abs(m - draw) / sd``; must be positive.

    Raises:
        ValueError: if ``zmax`` is not positive, since no draw could ever
            be accepted and the rejection loop would never end.
    """
    if not zmax > 0:
        raise ValueError("zmax must be positive, got {!r}".format(zmax))
    if sd == 0:
        # every draw would equal m; also avoids dividing by zero below
        return m
    while True:
        sim = normal(m, sd)
        if abs(m - sim) / sd < zmax:
            return sim


#-------------------------------------------------------------------------------
# munge hmp data
#-------------------------------------------------------------------------------

# Build the working table from the HMP file named by args.hmp.
# NOTE(review): `table` and its methods are defined outside this excerpt, so
# the per-step notes below are read off the call arguments only -- confirm
# each against the table class. The steps mutate T in sequence; keep the order.
zu.say(args.basename, "->", "parsing HMP data")

T = table(args.hmp)
# presumably restricts to entries whose "STSite" equals args.site and whose
# "VISNO" equals "1" -- TODO confirm what transposed=True selects on
T.select("STSite", args.site, transposed=True)
T.select("VISNO", "1", transposed=True)
# NOTE(review): head("SRS", invert=True) -- semantics not visible here
T.head("SRS", invert=True)
# keep only the last "|"-separated piece of each row header
T.apply_rowheads(lambda x: x.split("|")[-1])
# presumably keeps headers containing "s__" and drops those containing
# "_unclassified" -- TODO confirm grep/invert semantics
T.grep("headers", "s__")
T.grep("headers", "_unclassified", invert=True)
# presumably writes the filtered table to "subset.tmp" -- TODO confirm
T.dump("subset.tmp")
T.float()
# NOTE(review): meaning of the two unrarify arguments is not visible here
T.unrarify(1e-20, 1)

bugs = []
for bug, row in T.iter_rows():
    stats = []
Beispiel #16
0
args = parser.parse_args()

#-------------------------------------------------------------------------------
# constants
#-------------------------------------------------------------------------------

# evidence codes collected under the name cafa_codes
cafa_codes = {"IC", "TAS", "EXP"}

#-------------------------------------------------------------------------------
# load gene set
#-------------------------------------------------------------------------------

# the gene identifier is the first field of every row in the gene list
genes = {fields[0] for fields in iter_rows(args.gene_list)}
say("Loaded", len(genes), "genes")

#-------------------------------------------------------------------------------
# process goa file
#-------------------------------------------------------------------------------
"""
The GOA (.gaf) file is tab-delimited. 
Comment lines start with "!"
Col2 is the uniprot id (a superset of uniref50).
Col5 is the Gene Ontology annotation.
Col4 is a logical modifier of the uniprot->go mapping.
  Must exclude the cases where this is "NOT".
Col7 is a short evidence code
"""

# term->gene mapping