Beispiel #1
0
def _get_categories(label_list):
    '''
    Extract the ordered category names from a list of paired column labels.

    The input is a list of strings such as
    ['old-0', 'old-1', 'new-0', 'new-1', 'brwd-0', 'brwd-1'],
    that is, each category name occurs twice in a row, first with the
    suffix -0 and then with the suffix -1.  The list length must be even
    and at least 2.  An underscore in place of the dash (e.g., 'old_0') is
    accepted, but a warning is written to the debug log.

    Returns the category names in the order they were found, for example,
    ['old', 'new', 'brwd'].
    Raises Exception if the list length or any label breaks this format.
    '''
    count = len(label_list)
    if count < 2 or count % 2 != 0:
        raise Exception('Label list bad length ' + str(count))

    categories = []
    for position, label in enumerate(label_list):
        if len(label) < 3:
            raise Exception('Bad label ' + label + ' in label list')
        # Split 'name-0' into its category name, separator and person digit.
        name, separator, person = label[:-2], label[-2], label[-1]
        if separator == '_':
            ties_log.log_debug('Warning: label line has underscores ' +
                               'not dashes')
            separator = '-'
        suffix = separator + person

        if position % 2 == 0:
            # First label of a pair:  person 0, and it opens a new category.
            if suffix != '-0':
                raise Exception('Failed alternation 0 in label line')
            categories.append(name)
        elif name != categories[-1] or suffix != '-1':
            # Second label of a pair must be person 1 of the same category.
            raise Exception('Failed alternation 1 in label line')
    return categories
Beispiel #2
0
def gen_ideal_behavior_plots(args):
    '''
    Loop through given directories, validate files, outsource the plotting.

    Every entry matched by '*' directly under args.charmod_dirname is
    expected to be a directory holding two whitespace-delimited text files:
    * moderator_info.txt -- three lines:  the moderator names, their
      percentiles, and their values.  All three must have the same,
      nonzero number of fields.
    * obs_states.txt -- first line has the observable names; each later
      line is one row of values, with exactly one field per observable.

    One IdealizedDyad is built per directory (it also receives
    args.output_dir_plots and the directory's base name).  Afterwards every
    dyad is plotted with the same y-axis limits, namely the global minimum
    and maximum over all of the dyads.

    Raises Exception if a file's contents are inconsistent.
    Raises ValueError if an observable table has no rows at all, or if no
    directories were found under args.charmod_dirname.
    '''
    ide = []
    for d in glob.iglob(os.path.join(args.charmod_dirname, '*')):
        with open(os.path.join(d, 'moderator_info.txt')) as fmod:
            modnames = fmod.readline().split()  # names of the moderators
            modptils = fmod.readline().split()  # their percentiles
            modvalus = fmod.readline().split()  # their values
        ties_log.log_debug('modnames: ' + str(modnames) + '\nmodptils: ' +
                           str(modptils) + '\nmodvalues: ' + str(modvalus) +
                           '\n')
        if len(modnames) < 1:
            raise Exception('Not enough moderator names from ' + str(modnames))
        if len(modnames) != len(modptils):
            raise Exception('Incompatible number of percentiles: ' +
                            str(modnames) + ' ' + str(modptils))
        if len(modvalus) != len(modnames):
            raise Exception('Incompatible number of mod values: ' +
                            str(modnames) + ' ' + str(modvalus))

        with open(os.path.join(d, 'obs_states.txt')) as fstat:
            obsnames = fstat.readline().split()  # observable names, line 1
            obsvals = [row.split() for row in fstat]  # values, lines 2+
        ties_log.log_debug('obsnames: ' + str(obsnames) + '\n')
        if len(obsnames) < 1:
            raise Exception('Not enough observables from ' + str(obsnames))
        if not obsvals:
            # Without this, min() below fails with an opaque message.
            raise ValueError('Observable table has no rows in ' + str(d))
        rowlens = [len(row) for row in obsvals]
        if min(rowlens) < len(obsnames):
            raise Exception('Observable table has a too-small row:\n' +
                            repr(obsvals))
        if max(rowlens) > len(obsnames):
            raise Exception('Observable table has a too-big row:\n' +
                            repr(obsvals))

        # list() so that a real list is passed along under Python 3 as well
        # (under Python 2, zip() already returns a list).
        moderators = list(zip(modnames, modptils, modvalus))
        ide.append(
            IdealizedDyad(moderators, obsnames, obsvals,
                          args.output_dir_plots, os.path.basename(d)))

    if not ide:
        # Without this, max() below fails with an opaque message.
        raise ValueError('No idealized-dyad directories found in ' +
                         str(args.charmod_dirname))

    # We want the y axis be the same for all plots.
    # These two lines find global min and max values so we can do that.
    iomax = max(i.obs_max() for i in ide)
    iomin = min(i.obs_min() for i in ide)

    for i in ide:
        i.plot(iomin, iomax)
Beispiel #3
0
def parse_my_args():
    '''
    Parse the command-line arguments to this script; echo them to debug log.
    The parsed arguments are then handed to _if_silly_args_then_warn() for a
    sanity check of their semantics.

    Returns the namespace object produced by the argument parser.
    '''
    args = _build_my_parser().parse_args()
    ties_log.log_debug('\n' + _debug_echo_args(args))
    _if_silly_args_then_warn(args)

    # NOTE:  stricter validation used to follow here, but all of it had been
    # commented out, so it (and the helper lambdas only it used) is removed.
    # The disabled checks were:  config-baseline and config-ties name regular
    # files; without single_fold, the input CSV exists and the baseline and
    # TIES cross-validate binaries are executable; with single_fold, the
    # run_fold binary is executable, the data directory and id-list exist,
    # and the fold number is a positive integer.

    return args
Beispiel #4
0
def _get_dyad_observables(fname_list):
    '''
    Input: list of pathnames to files, each storing a dyad's observable data.
    Dyad number is encoded in the pathname (the base name minus its '.txt'
    extension must be an integer).  File format is set by C++ code.

    Output:  a multi-level dictionary.  First index:  observable category name
    (like "dial" or "respiration" but no individual specifier:  not "dial-1").
    Second index: dyad number (as integer).  Third index: time step (integer).
    That yields a 2-tuple of floats, corr. to individual 0 and 1 respectively,
    with an exception for missing data.  If any data are missing, there's None
    in the 2-tuple instead of a float (in either or both positions).

    File format has a prologue before the data.  The prologue, which we skip,
    ends with a line that looks like "time  dial-0  dial-1" (example)
    in the case that the observable category is called "dial."  The observable
    can be called anything, though.  Anyway, we look for a line (the last,
    actually) with "time" as the first whitespace-separated token on the line,
    and assume it is the last line (the "time-line") of the prologue.
    The category names themselves are taken from the last line whose first
    token is "observables:".  Blank lines are ignored.

    A field equal to the module global MISSING_SENTINEL marks a missing
    value; that global must be set before this function is called.

    Raises ValueError if a file lacks either heading line, and Exception if
    the headings are inconsistent with each other, with the data rows, or
    with the files read earlier.
    '''
    all_obs = {}  # index by observable category name, then dyad number,
    # then time; the value is the (individual 0, individual 1) tuple.
    ties_log.log_debug('call get_dyad_observables() with ' + repr(fname_list))
    for fname in fname_list:
        assert fname.endswith('.txt'), 'last four chars describe file-type'
        dyadnum = int(
            os.path.basename(fname)[:-4])  # dyad number of input data

        # Read file as lists of tokens.  Blank lines are dropped so that
        # looking at the first token of a line is always safe.
        with open(fname, 'r') as fd:
            flines = [toks for toks in (dl.split() for dl in fd) if toks]

        # Scan for heading line and observable names (last match wins).
        time_rows = [n for n, a in enumerate(flines) if a[0] == 'time']
        obs_rows = [n for n, a in enumerate(flines)
                    if a[0] == 'observables:']
        if not time_rows:
            raise ValueError('No "time" heading line in ' + fname)
        if not obs_rows:
            raise ValueError('No "observables:" line in ' + fname)
        timeline = time_rows[-1]
        obsv_names = flines[obs_rows[-1]][1:]

        # Initialize dictionary if necessary, otherwise verify obsv_names.
        if not all_obs:
            all_obs = {o: {} for o in obsv_names}
        elif sorted(all_obs.keys()) != sorted(obsv_names):
            raise Exception('Observables disagree across data files')
        arity = len(flines[timeline]) - 1  # number of data columns
        # Floor division:  plain '/' yields a float under Python 3.
        if arity % 2 != 0 or len(obsv_names) != arity // 2:
            raise Exception('bad arity parity')

        # Verify heading names, like obsv_names with '-0' and '-1' interleaved.
        ohead = [o + suffix for o in obsv_names for suffix in ('-0', '-1')]
        if flines[timeline][1:] != ohead:
            raise Exception('Observable line and timeline disagree')

        # Start a fresh time-indexed dictionary for this obsv. and dyad.
        for on in obsv_names:
            all_obs[on][dyadnum] = {}

        # Read each time-observables line (a list of strings of numbers).
        for toblin in flines[timeline + 1:]:
            if len(toblin) != 1 + arity:
                raise Exception('wrong len data line ' + fname + ', ' +
                                toblin[0])
            t = int(toblin[0])  # time index
            # Missing values in input are marked with a sentinel value.
            values = [None if ov == MISSING_SENTINEL else float(ov)
                      for ov in toblin[1:]]
            # Columns 2k and 2k+1 are individuals 0 and 1 of category k.
            for k, on in enumerate(obsv_names):
                all_obs[on][dyadnum][t] = (values[2 * k], values[2 * k + 1])
    return all_obs
Beispiel #5
0
def _gen_model_plts(modelname, dnam, plotdir, data, modelfns):
    '''
    Inputs correspond to all dyads, one flavor of model (maybe a baseline,
    maybe the TIES model).  Output is a series of plots which are written
    into the plot directory named in 'plotdir', which is created if needed
    via ties_log.failsafe_mkdir().

    Input 'modelname' describes the model, either (1) in a human-readable form
    (i.e., using spaces, suitable for a plot title, and not suitable for a
    filename), in case of a baseline model; or (2) sentinel value
    IMPLICIT_COMPTIES_MODELNAME if it is the main CompTIES model result.

    Input 'dnam' is the string name of the distinguisher category.

    Input 'data' is a multi-level dictionary of all dyads' observable data.
    See _get_dyad_observables() for the format (because 'data' is the output
    of that function).

    Input 'modelfns' is a list of filenames of all dyads' model state.  The
    dyad number is extracted from the filename; this part is a bit arcane.
    The file format is basically columnar text delimited by whitespace.
    The array is rectangular.  The first line has labels made of the observable
    names with a person suffix (e.g., respiration-0 respiration-1 bp-0 bp-1).
    Sadly the columns label order does not necessarily align with the order
    in the data files, which is why 'data' has a somewhat arcane structure.
    We have to parse the first line to determine what the columns mean.

    Raises Exception if a filename lacks a dyad number, if the data start
    times are ragged, or if a model file disagrees with the data about the
    observables or the number of time steps.
    '''
    ties_log.log_debug('_gen_model_plts for modelname=' + modelname +
                       ', plotdir=' + plotdir + ', modelfns=' +
                       repr(modelfns) + ', len(data) is ' + str(len(data)) +
                       '\n')
    # Disabled check, kept for reference:
    # for o in data.keys():
    #     if len(modelfns) != len(data[o]):
    #         raise Exception('num. dyads of data and model differ')
    ties_log.failsafe_mkdir(plotdir)
    for m in modelfns:
        # Extract dyad number from filename
        dyma = dynum_regex.search(m)
        if dyma is None:
            raise Exception('model state path ' + m + ' lacks dyad number')
        dyadnum = dyma.group(1)  # dyad number of this model file (string)
        dyad_key = int(dyadnum)  # . . . and as the integer key into 'data'

        # Get starting time index; it must agree across all observables.
        # (values() and plain iteration work in both Python 2 and Python 3,
        # unlike itervalues() and iterkeys().)
        t0 = None
        for ov in data.values():
            first_time = min(ov[dyad_key])  # first time index
            if t0 is None:
                t0 = first_time
            elif first_time != t0:
                raise Exception('Ragged data: file ' + m + ' dyad ' + dyadnum +
                                ' has irregular start time')
        assert t0 is not None, 'found starting time index t0 already'

        # Read the whole file (barely parsed, as rectangular array of floats)
        with open(m, 'r') as model_file:
            ll = model_file.readline().split()  # First line is the label line.
            mlines = [[float(x) for x in ml.split()] for ml in model_file]

        # Given the file's column-pair number, which observ. name corresponds?
        cio = dict(enumerate(_get_categories(ll)))
        if sorted(cio.values()) != sorted(data.keys()):
            raise Exception('Model file, data file have different observables')

        # Build dictionary of model-state data in the file, i.e., parse harder.
        model_states = {oc: {} for oc in data}  # first index: observable name
        for t, ms in enumerate(mlines, t0):  # parse each line; times from t0
            for j, s in enumerate(ms):  # parse each column
                # Floor division:  plain '/' yields a float under Python 3.
                oc = cio[j // 2]  # observable category name for column j
                if t not in model_states[oc]:
                    assert 0 == j % 2, 'start of tuple is for person 0'
                    model_states[oc][t] = (s, None)
                else:
                    assert 1 == j % 2, '2nd half of tuple is for person 1'
                    s2 = model_states[oc][t]
                    assert 2 == len(s2) and s2[1] is None, 'halfway done'
                    model_states[oc][t] = (s2[0], s)

        # Generate one plot per observable category found in this file
        for oc in data:
            d = data[oc][dyad_key]  # data for this obsv category and dyad
            if len(d) != len(model_states[oc]):
                raise Exception('Model state time interval differs from data')
            assert sorted(d.keys()) == sorted(model_states[oc].keys()), \
                'times match'

            # Generate plot filename and plot file
            fname = _get_fname(dyadnum, len(cio), oc, modelname, plotdir)
            _gen_a_model_plot(fname, oc, dnam, dyadnum, modelname, d,
                              model_states[oc])
Beispiel #6
0
        ide.append(
            IdealizedDyad(zip(modnames, modptils, modvalus), obsnames, obsvals,
                          args.output_dir_plots, os.path.basename(d)))


#   We want the y axis be the same for all plots.
#   These two lines find global min and max values so we can do that.
    iomax = max(i.obs_max() for i in ide)
    iomin = min(i.obs_min() for i in ide)

    for i in ide:
        i.plot(iomin, iomax)


def arg_is_dir(a):
    '''Return True iff a is not None and names an existing directory.'''
    if a is None:
        return False
    return os.path.isdir(a)


if __name__ == '__main__':
    # Script entry point:  parse the command-line arguments, check that both
    # directory arguments exist, then generate the idealized-behavior plots.
    # The value returned by start_log_with_args() is kept so that it can be
    # handed to stop_hourly_top_log() at the end.
    log_pid = ties_log.start_log_with_args()
    try:
        args = _build_im_parser().parse_args()
        # Both arguments must name directories that already exist.
        if not arg_is_dir(args.output_dir_plots):
            raise Exception('plots directory missing or bad')
        if not arg_is_dir(args.charmod_dirname):
            raise Exception('model directory missing or bad')
        ties_log.log_debug(repr(args))
        gen_ideal_behavior_plots(args)
    finally:
        # Runs whether or not anything above raised an exception.
        ties_log.stop_hourly_top_log(log_pid)
Beispiel #7
0
def generate_individual_plots(args):
    '''
    Generate all output plots of individuals of all dyads.
    This function "knows" about the pathnames to the model fits and the
    observable input data.
    Its helper functions "know" about the format of those files.
    If the TIES code changes that format, this function and its helpers will
    need retouching.
    Outputs are exported files written to the plots subdirectory.  That subdir
    is created if necessary, as named in output_dir_plots.

    The argument called 'output_dir_model' is confusing to a casual reader
    of this script; really it is an input.  It is so-named because the name
    comes from ties_options.py, and is based on the point of view of the
    high-level caller.  From the point of view of this script, it is
    read-only.  (The similarly misnamed 'output_dir_baselines' is not
    currently read by this function.)

    Side effect:  sets the module global MISSING_SENTINEL from the data
    directory, before the observable data files are read.

    Raises Exception if no observable data are found.

    The logic in this function is crabbed and sclerotic.  Needs overhaul.

    Names of the 'args' namespace that are read here:
    * output_dir_model (misnomer, see above)
    * output_dir_plots
    * categorical_mod (may be None)
    * data_dir (if None, the 'data' subdirectory of output_dir_model is used)
    * single_fold (None means that results of all folds are globbed)
    * model_name
    * distinguisher

    Maintenance note:  older versions of this function had one weird trick
    to work around a Condor bug, but the trick has been removed.  The bug was
    that Condor would misguidedly prune any output subdirectory that lacked
    any regular files.  That might sound harmless but entails pruning an
    output subdirectory containing other subdirectories -- thus throwing away
    their contents.  The bug was reported to iPlant but I was never informed
    of a fix, so I added a workaround for a partial-results situation, by
    adding a token regular file, and later removing it if I got full results
    (not just partial results).  Due to the fact that we don't use Condor
    anymore for CompTIES and that I reorganized the way output directories
    get created, this code became a liability and hence was removed.
    But if you return to Condor, the technique might be helpful,
    and that's why I'm memorializing the technique in this comment
    even though it is now vanished from the code.
    '''
    global MISSING_SENTINEL

    glob4d = '[0-9][0-9][0-9][0-9]'  # used for dyad identification number
    model_dir = args.output_dir_model
    data_dir = args.data_dir
    if data_dir is None:
        data_dir = os.path.join(model_dir, 'data')

    ties_log.log_debug(
        'gen_individual_plots() entry, bd='
        + ', md = ' + str(model_dir) + ', dd = ' + str(data_dir))

    MISSING_SENTINEL = _get_missing_sentinel(data_dir)
    ties_log.log_debug('invalid text string: ' + str(MISSING_SENTINEL))

    # Data files:  one per dyad, inside a per-category subdirectory when a
    # categorical moderator is in use.
    if args.categorical_mod is None:
        data_glob = os.path.join(data_dir, glob4d + '.txt')
    else:
        data_glob = os.path.join(data_dir, args.categorical_mod + '-*',
                                 glob4d + '.txt')
    dflist = glob.glob(data_glob)

    # Model-state files, relative to model_dir:
    # [fold-NN/testing/] [<categorical_mod>-*/] NNNN/all_states/obs_states.txt
    parts = []
    if args.single_fold is None:
        parts += ['fold-[0-9][0-9]', 'testing']
    if args.categorical_mod is not None:
        parts.append(args.categorical_mod + '-*')
    parts += [glob4d, 'all_states', 'obs_states.txt']
    fold_states = os.path.join(*parts)

    mslist = glob.glob(os.path.join(model_dir, fold_states))
    ties_log.log_debug('generate individual plots() globs:\ndflist:\n' +
                       repr(dflist) + '\nmslist_c0:\n' + repr(mslist))
    dyad_obs = _get_dyad_observables(dflist)

    if 0 == len(dyad_obs):
        raise Exception('Must be at least one dyad in dataset')

    if args.single_fold is not None:
        model_files = sorted(mslist)
    else:
        # Order by the path from 'testing' onward, i.e., ignore which fold
        # each file came from.  The keyword form of 'key' is required by
        # Python 3 and is accepted by Python 2 as well.
        model_files = sorted(mslist, key=lambda fn: fn[fn.find('testing'):])

    # NOTE(review): six positional arguments are passed here -- confirm that
    # the _gen_model_plts() in use accepts categorical_mod as its sixth.
    _gen_model_plts(args.model_name, args.distinguisher,
                    args.output_dir_plots, dyad_obs, model_files,
                    args.categorical_mod)