def main():
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)

    log.info("Opening %s output files for resampling jobs", args.parallelism)
    num_files = int(args.parallelism)
    file_list = []
    base_name = "resamplejob-"
    base_name += args.experiment
    base_name += "-"

    for i in range(0, num_files):
        filename = ''
        if args.jobdirectory is not None:
            filename = args.jobdirectory + "/"
        filename += base_name
        filename += str(uuid.uuid4())
        filename += ".sh"

        log.debug("job file: %s", filename)
        f = open(filename, 'w')

        f.write("#!/bin/sh\n\n")
        file_list.append(f)

    file_cycle = itertools.cycle(file_list)
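    # commands generated below are distributed round-robin across the open job
    # scripts, so the work is spread evenly over args.parallelism shell files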

    for file in os.listdir(args.inputdirectory):
        if fnmatch.fnmatch(file, '*.txt'):
            full_fname = args.inputdirectory
            full_fname += "/"
            full_fname += file

            cmd = generate_sample_command(args.experiment, full_fname, args.outputdirectory,
                                          args.samplesize, args.dropthreshold)

            fc = file_cycle.next()
            log.debug("cmd: %s", cmd)
            fc.write(cmd)
            fc.write('\n')

    for fh in file_list:
        fh.close()
def main():
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)

    log.info("Opening %s output files for data export jobs", args.parallelism)
    num_files = int(args.parallelism)
    file_list = []
    base_name = "exportjob-"
    base_name += args.experiment
    base_name += "-"

    for i in range(0, num_files):
        filename = ''
        if args.jobdirectory is not None:
            filename = args.jobdirectory + "/"
        filename += base_name
        filename += str(uuid.uuid4())
        filename += ".sh"

        log.debug("job file: %s", filename)
        f = open(filename, 'w')

        f.write("#!/bin/sh\n\n")
        file_list.append(f)

    file_cycle = itertools.cycle(file_list)

    # read the list of simulation run ids to export from the simid file
    with open(args.simidfile, 'r') as simid_file:
        for s in simid_file:
            # strip the trailing newline so it does not end up in the command
            s = s.strip()
            if not s:
                continue

            cmd = generate_export_commandline(args.experiment,
                                              args.outputdirectory, s)

            fc = file_cycle.next()
            log.debug("cmd: %s", cmd)
            fc.write(cmd)
            fc.write('\n')

    for fh in file_list:
        fh.close()
def doExport():
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)
    sm_db = data.SimulationMetadataDatabase(db_args)
    nm_db = data.NetworkModelDatabase(db_args)

    data_repository_file = args.experiment
    data_repository_file += "-"
    data_repository_file += "full-data.csv"

    # approach is to start at the end, and walk back toward the beginning
    # Start with SeriationCT SeriationAnnotationData object, get IDSS seriation params

    # BUT - to know the column headers, we have to either accumulate all the data in memory
    # first or we have to peek at what columns exist in the network model database so we can
    # then process a row at a time.
    nmodel = data.NetworkModelDatabase.objects()[0]
    params = nmodel.model_parameters
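
    # A hedged sketch of the "peek first" approach described above: get_csv_header()
    # is not shown in this excerpt, but is assumed to combine a few fixed identifier
    # columns with one column per network model parameter, roughly:
    #
    #   def get_csv_header():
    #       fields = ["simulation_run_id", "seriation_id"]          # hypothetical fixed columns
    #       fields.extend(sorted(nmodel.model_parameters.keys()))   # one per NM parameter
    #       return fields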

    annotated_obj = data.SeriationAnnotationData.objects

    # Get post processing step parameters, by walking explicitly backward using the
    # seriation input file to get the filtered info, then filtered to get assemblage sampling,
    # and so on.

    # After postprocessing steps, use the simulation run id to get the sim params

    # then get the network model info and NM parameters

    # since
    with open(data_repository_file, 'wb') as csvfile:
        fields = get_csv_header()
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
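# (fragment: likely the tail of get_networkmodel_for_input(); the lookup of the
# simulation run record by sim_id is truncated above in this excerpt)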
        simulation_run_id=sim_id).first()
    networkmodel = sim_run.networkmodel
    return networkmodel


if __name__ == "__main__":
    setup()
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)
    sm_db = data.SimulationMetadataDatabase(db_args)

    full_fname = args.inputfile
    root = parse_filename_into_root(full_fname)

    networkmodel = get_networkmodel_for_input(full_fname)
    # we use the actual TemporalNetwork
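    # (sim_length is hardcoded to 1000 time steps in this script)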
    netmodel = TemporalNetwork(networkmodel_path=networkmodel, sim_length=1000)

    time_map = netmodel.get_subpopulation_slice_ids()
    # log.debug("assemblage time_map: %s", time_map)

    # get the list of assemblages in order sorted by the origin time
    sorted_assemblage_names = sorted(time_map.keys(),
                                     key=operator.itemgetter(1))
def main():
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)

    log.info("Opening %s output files for seriation configuration", args.parallelism)
    num_files = int(args.parallelism)
    file_list = []
    base_name = "seriationjob-"
    base_name += args.experiment
    base_name += "-"

    for i in range(0, num_files):
        filename = ''
        if args.jobdirectory is not None:
            filename = args.jobdirectory + "/"
        filename += base_name
        filename += str(uuid.uuid4())
        filename += ".sh"

        log.debug("job file: %s", filename)
        f = open(filename, 'w')

        f.write("#!/bin/sh\n\n")
        file_list.append(f)

    file_cycle = itertools.cycle(file_list)

    # get a list of the input files from the database
    seriations = data.SeriationInputData.objects

    for s in seriations:
        input_file = s.seriation_input_file

        log.info("Processing input file: %s", input_file)

        root = parse_filename_into_root(input_file)

        outdir = os.getcwd() + '/' + args.outputdirectory + "/" + root

        try:
            os.mkdir(outdir)
        except OSError:
            # the output directory may already exist
            pass


        cmd = generate_seriation_commandline(input_file, outdir, s.xy_file_path, database, s.source_identifier)

        fc = file_cycle.next()
        log.debug("cmd: %s", cmd)
        fc.write(cmd)
        fc.write('\n')

    for fh in file_list:
        fh.close()
def main():
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)

    log.info("Opening %s output files for assemblage sampling jobs",
             args.parallelism)
    num_files = int(args.parallelism)
    file_list = []
    base_name = "assemsamplejob-"
    base_name += args.experiment
    base_name += "-"

    for i in range(0, num_files):
        filename = ''
        if args.jobdirectory is not None:
            filename = args.jobdirectory + "/"
        filename += base_name
        filename += str(uuid.uuid4())
        filename += ".sh"

        log.debug("job file: %s", filename)
        f = open(filename, 'w')

        f.write("#!/bin/sh\n\n")
        file_list.append(f)

    file_cycle = itertools.cycle(file_list)

    for file in os.listdir(args.inputdirectory):
        if fnmatch.fnmatch(file, '*.txt'):
            full_fname = args.inputdirectory
            full_fname += "/"
            full_fname += file

            if args.sampletype == 'random':
                cmd = generate_random_sample_command(full_fname)
            elif args.sampletype == 'spatial':
                cmd = generate_spatial_sample_command(full_fname)
            elif args.sampletype == 'temporal':
                cmd = generate_temporal_sample_command(full_fname)
            elif args.sampletype == 'spatiotemporal':
                cmd = generate_spatiotemporal_sample_command(full_fname)
            elif args.sampletype == 'complete':
                cmd = generate_complete_sample_command(full_fname)
            elif args.sampletype == 'excludelist':
                cmd = generate_exclusion_sample_command(full_fname)
            elif args.sampletype == 'slicestratified':
                cmd = generate_slicestratified_sample_command(full_fname)
            else:
                print "sampletype not recognized, fatal error"
                sys.exit(1)

            fc = file_cycle.next()
            log.debug("cmd: %s", cmd)
            fc.write(cmd)
            fc.write('\n')

    for fh in file_list:
        fh.close()
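

# A minimal, hypothetical sketch of the autovivifying mapping that doExport()
# below relies on; the real DeepDefaultDict used by this project may differ.
# A missing key yields a fresh nested DeepDefaultDict, and adding an integer to
# an empty node simply returns the integer, so expressions like
# cmap[rep][subpop][cls] += count need no explicit initialization.
from collections import defaultdict


class DeepDefaultDict(defaultdict):
    def __init__(self):
        # every missing key autovivifies into another DeepDefaultDict
        defaultdict.__init__(self, DeepDefaultDict)

    def __add__(self, other):
        # an empty node acts as an additive identity: node + count == count
        return other
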
def doExport():
    database = args.experiment
    database += "_samples_raw"
    db_args = {}
    db_args['dbhost'] = args.dbhost
    db_args['dbport'] = args.dbport
    db_args['database'] = database
    db_args['dbuser'] = None
    db_args['dbpassword'] = None
    pp_db = data.PostProcessingDatabase(db_args)

    # the data cache has the following nested dict structure:  simid -> replicate -> subpop -> class:count

    cmap = DeepDefaultDict()
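    # strip the fixed-length prefix (the first nine characters) from the
    # simulation run id so the remainder can be used in output filenames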
    sim_id_clean = args.simid[9:]
    cursor = data.ClassFrequencySampleUnaveraged.m.find(
        dict({'simulation_run_id': args.simid}), dict(timeout=False))

    for sample in cursor:
        rep = sample["replication"]
        subpop = sample["subpop"]

        class_count_map = sample["class_count"]

        for cls, count in class_count_map.items():
            cmap[rep][subpop][cls] += count

    # conditional either we sample trait counts (which will reduce the list of traits we put in the header),
    # or output the full list of counts (which will put every trait in the header)

    class_set = set()
    for rep in cmap.keys():
        for subpop in cmap[rep].keys():
            for cls, count in cmap[rep][subpop].items():
                class_set.add(cls)

    log.info("total number of classes: %s", len(class_set))

    for rep in cmap.keys():

        outputfile = args.outputdirectory + "/" + sim_id_clean + "-" + str(
            rep) + ".txt"

        class_set = set()

        with open(outputfile, 'wb') as outfile:
            for sp in cmap[rep].keys():
                for cls in cmap[rep][sp].keys():
                    class_set.add(cls)

            class_list = list(class_set)
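
            # A hedged sketch of the "sample trait counts" alternative mentioned
            # earlier (hypothetical; assumes a sample size k and that the random
            # module is imported). Reducing class_list here shrinks the header:
            #
            #   if getattr(args, 'sampleclasses', None):
            #       k = min(int(args.sampleclasses), len(class_list))
            #       class_list = random.sample(class_list, k)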

            # write header row
            header = "Assemblage_Name"
            for cls in class_list:
                header += "\t"
                header += cls
            header += "\n"

            outfile.write(header)

            for sp in cmap[rep].keys():
                row = sp
                for cls in class_list:
                    row += "\t"
                    count = cmap[rep][sp][cls]
                    row += str(int(count)) if count != {} else str(0)
                row += "\n"
                outfile.write(row)

    pp_db.store_exported_datafile(args.simid, outputfile)
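    # note: only the file for the last replicate is registered with the
    # post-processing database here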