Example #1
0
def main(args=None):
    """Run the CSI command-line workflow on a single input file.

    Parses the arguments with ``cmdparser``, sets the logging level from the
    verbosity option, loads the data with ``csi.loadData``, runs
    ``csi.runCsiEm`` over the selected genes and writes every result to the
    CSV writer (stdout unless ``op.csvoutput`` names a file).  When
    requested, results are also written to an HDF5 file and a JSON file.

    Args:
        args: command-line arguments without the program name; defaults to
            ``sys.argv[1:]`` when ``None``.

    Raises:
        SystemExit: with status 1 when the arguments fail validation (wrong
            number of input files, bad depth, bad worker count, unknown
            genes, bad weight truncation or unknown initial weight mode).
        AssertionError: when the input columns are not monotonically
            increasing.
    """
    if args is None:
        args = sys.argv[1:]

    # parse command line arguments
    (op,fname) = cmdparser(args)

    # extract out the logging level early
    log_level = logging.WARNING
    if   op.verbose == 1: log_level = logging.INFO
    elif op.verbose >= 2: log_level = logging.DEBUG

    # configure logger
    logging.basicConfig(level=log_level)
    logging.getLogger('GP').setLevel(logging.WARNING)
    logging.getLogger('parameters changed meta').setLevel(logging.WARNING)

    # make sure we're only processing a single file
    if len(fname) != 1:
        if len(fname) == 0:
            sys.stderr.write("Error: Please specify the filename to process, or run with '-h' for more options\n")
        else:
            sys.stderr.write("Error: Only one input filename currently supported\n")
        sys.exit(1)

    # pull out the parental set truncation depth and validate
    depth = op.depth
    if depth < 1:
        sys.stderr.write("Error: truncation depth must be greater than or equal to one\n")
        sys.exit(1)

    # sanity check!
    if depth == 1:
        logger.info("Truncation depth of 1 may not be very useful")

    numprocs = op.numprocs
    if numprocs is not None and numprocs < 1:
        sys.stderr.write("Error: must have one or more worker process\n")
        sys.exit(1)

    # every handle opened by this function is remembered here so that it is
    # closed again once processing finishes (or fails); stdout is never added
    opened = []
    try:
        # figure out where our output is going
        if op.csvoutput is None or op.csvoutput == '-':
            csvout = csv.writer(sys.stdout)
        else:
            csvfile = open(op.csvoutput,'w')
            opened.append(csvfile)
            csvout = csv.writer(csvfile)

        if op.jsonoutput:
            jsonoutput = open(op.jsonoutput,'w')
            opened.append(jsonoutput)
        else:
            jsonoutput = None

        if op.hdf5output:
            hdf5output = h5.File(op.hdf5output,'w')
            opened.append(hdf5output)
        else:
            hdf5output = None

        # load the data from disk
        inp = csi.loadData(fname[0])

        # check whether the second level is sorted (currently check whether all
        # levels are sorted, need to fix!)
        # NOTE(review): an assert is skipped under ``python -O``; kept as-is so
        # the exception type seen by callers does not change
        assert (inp.columns.is_monotonic_increasing)
        # not sure whether I can do anything similar for the rows

        if op.verbose:
            logger.info("Genes: %s",
                        ", ".join([repr(x) for x in inp.index]))
            logger.info("Treatments: %s",
                        ", ".join([repr(x) for x in inp.columns.levels[0]]))
            logger.info("Time: %s",
                        ", ".join([repr(x) for x in inp.columns.levels[1]]))

        # figure out which genes/rows we're going to process
        genes = op.genes
        if genes is None:
            logger.debug("No genes specified, assuming all")
            genes = list(inp.index)
        else:
            missing = np.setdiff1d(genes, inp.index)
            if len(missing) > 0:
                # str() keeps the join working for non-string gene labels
                sys.stderr.write("Error: The following genes were not found: {missing}\n".format(
                    missing=', '.join(str(g) for g in missing)))
                sys.exit(1)

        # TODO: how does the user specify the parental set?

        cc = csi.Csi(inp)
        em = cc.getEm()

        if hdf5output:
            cc.write_hdf5(hdf5output)
            hdf5output.flush()

        if op.weighttrunc:
            val = float(op.weighttrunc)
            # reject anything outside the open interval (0, 1)
            if not (0 < val < 1):
                sys.stderr.write("Error: The weight truncation must be between zero and one\n")
                sys.exit(1)

            if val > 0.01:
                logger.warning("weight truncation should probably be less than 0.01")

            em.weighttrunc = val

        if op.initweights:
            if op.initweights == 'uniform':
                em.sampleinitweights = False
            elif op.initweights == 'weighted':
                em.sampleinitweights = True
            else:
                sys.stderr.write("Error: Unrecognised initial weight mode: {initweights}\n".format(
                    initweights=op.initweights))
                sys.exit(1)

        # results are written out as they arrive and also collected for the
        # JSON dump at the end
        results = []
        for i,res in enumerate(csi.runCsiEm(em, genes, lambda gene: cc.allParents(gene,depth), numprocs)):
            res.writeCsv(csvout)
            results.append(res)
            if hdf5output:
                res.write_hdf5(hdf5output, i)
                hdf5output.flush()

        if jsonoutput is not None:
            json.dump(cc.to_dom(results), jsonoutput)
    finally:
        # close in reverse order of opening
        for handle in reversed(opened):
            handle.close()
Example #2
0
def main(args=None):
    """Run the CSI command-line workflow on a single input file.

    Parses the arguments with ``cmdparser``, sets the logging level from the
    verbosity option, loads the data with ``csi.loadData``, optionally
    normalises it, runs ``csi.runCsiEm`` over the selected genes and writes
    each result to the CSV and/or HDF5 outputs that were requested.

    Args:
        args: command-line arguments without the program name; defaults to
            ``sys.argv[1:]`` when ``None``.

    Raises:
        SystemExit: with status 1 when the arguments fail validation (wrong
            number of input files, bad depth, negative worker count,
            unparsable GP prior, unknown genes, bad weight truncation or
            unknown initial weight mode).
        AssertionError: when the input columns are not monotonically
            increasing.
    """
    if args is None:
        args = sys.argv[1:]

    # parse command line arguments
    (op,fname) = cmdparser(args)

    # extract out the logging level early
    log_level = logging.WARNING
    if   op.verbose == 1: log_level = logging.INFO
    elif op.verbose >= 2: log_level = logging.DEBUG

    # configure logger
    logging.basicConfig(level=log_level)
    logging.getLogger('GP').setLevel(logging.WARNING)
    logging.getLogger('parameters changed meta').setLevel(logging.WARNING)

    # make sure we're only processing a single file
    if len(fname) != 1:
        if len(fname) == 0:
            sys.stderr.write("Error: Please specify the filename to process, or run with '-h' for more options\n")
        else:
            sys.stderr.write("Error: Only one input filename currently supported\n")
        sys.exit(1)

    # pull out the parental set truncation depth and validate
    depth = op.depth
    if depth < 1:
        sys.stderr.write("Error: truncation depth must be greater than or equal to one\n")
        sys.exit(1)

    # sanity check!
    if depth == 1:
        logger.info("Truncation depth of 1 may not be very useful")

    numprocs = op.numprocs
    if numprocs is not None and numprocs < 1:
        # a count of zero requests automatic parallelisation
        if numprocs==0:
            numprocs = mp.cpu_count()
        else:
            sys.stderr.write("Error: can't have a negative worker process count\n")
            sys.exit(1)

    if op.gpprior is None or op.gpprior == 'uniform':
        gpprior = None
    else:
        try:
            gpprior = parse_gp_hyperparam_priors(op.gpprior)
        except ValueError as err:
            sys.stderr.write("Error: {0}\n".format(err))
            sys.exit(1)

    # every handle opened by this function is remembered here so that it is
    # closed again once processing finishes (or fails); stdout is never added
    opened = []
    try:
        # figure out where our output is going
        if op.csvoutput is None:
            csvoutput = None
        else:
            if op.csvoutput == '-':
                fd = sys.stdout
            else:
                fd = open(op.csvoutput,'w')
                opened.append(fd)
            csvoutput = csv.writer(fd)

        if op.hdf5output:
            hdf5output = h5.File(op.hdf5output,'w')
            opened.append(hdf5output)
        else:
            hdf5output = None

        if hdf5output is None and csvoutput is None:
            logger.warning("No output will be saved, "
                           "this is only useful for debugging and benchmarking.")

        # load the data from disk
        inp = csi.loadData(fname[0])

        # check whether the second level is sorted (currently check whether all
        # levels are sorted, need to fix!)
        # NOTE(review): an assert is skipped under ``python -O``; kept as-is so
        # the exception type seen by callers does not change
        assert (inp.columns.is_monotonic_increasing)
        # not sure whether I can do anything similar for the rows

        #normalise the data
        if op.normalise == 'standardise':
            inp[:][:] = sp.stats.mstats.zscore(inp,axis=1,ddof=1)
        elif op.normalise == 'center':
            # subtract each row's mean; convert to a plain ndarray first
            # because, if the mean comes back as a pandas Series, 2-D
            # indexing of it (``[:,None]``) is rejected by recent pandas
            row_means = np.asarray(np.mean(inp[:][:],axis=1))[:,None]
            inp[:][:] = inp[:][:] - row_means

        if op.verbose:
            logger.info("Genes: %s",
                        ", ".join([repr(x) for x in inp.index]))
            logger.info("Treatments: %s",
                        ", ".join([repr(x) for x in inp.columns.levels[0]]))
            logger.info("Time: %s",
                        ", ".join([repr(x) for x in inp.columns.levels[1]]))
            if gpprior is None:
                logger.info("Hyperparameters: uniform")
            else:
                logger.info("Hyperparameters: Gamma({0},{1})".format(*gpprior))

        # figure out which genes/rows we're going to process
        genes = op.genes
        if genes is None:
            logger.debug("No genes specified, assuming all")
            genes = list(inp.index)
        else:
            missing = np.setdiff1d(genes, inp.index)
            if len(missing) > 0:
                # str() keeps the join working for non-string gene labels
                sys.stderr.write("Error: The following genes were not found: {missing}\n".format(
                    missing=', '.join(str(g) for g in missing)))
                sys.exit(1)

        # TODO: how does the user specify the parental set?

        cc = csi.Csi(inp)
        em = cc.getEm()

        if gpprior:
            em.set_priors(gpprior[0], gpprior[1])

        if hdf5output:
            cc.write_hdf5(hdf5output)
            hdf5output.flush()

        if op.weighttrunc:
            val = float(op.weighttrunc)
            # reject anything outside the open interval (0, 1)
            if not (0 < val < 1):
                sys.stderr.write("Error: The weight truncation must be between zero and one\n")
                sys.exit(1)

            if val > 0.01:
                logger.warning("weight truncation should probably be less than 0.01")

            em.weighttrunc = val

        if op.initweights:
            if op.initweights == 'uniform':
                em.sampleinitweights = False
            elif op.initweights == 'weighted':
                em.sampleinitweights = True
            else:
                sys.stderr.write("Error: Unrecognised initial weight mode: {initweights}\n".format(
                    initweights=op.initweights))
                sys.exit(1)

        # results are written out as they arrive
        for i,res in enumerate(csi.runCsiEm(em, genes, lambda gene: cc.allParents(gene,depth), numprocs)):
            if csvoutput:
                res.writeCsv(csvoutput)
            if hdf5output:
                res.write_hdf5(hdf5output, i)
                hdf5output.flush()
    finally:
        # close in reverse order of opening
        for handle in reversed(opened):
            handle.close()