Example #1

import os
import numpy
from functools import partial

# Other helpers referenced below (build_histogram, map_in_pool,
# translate_segmentation, all_dataset_segmentations, hashed_float,
# cumulative_splits, DictUnicodeWriter) are defined elsewhere in the
# same module.

def create_segmentations(directory, data_sets, splits, assignments, size,
                         segmentation_size, crop, cats, test_limit,
                         single_process, verbose):
    '''
    Phase 5 of unification.  Create the normalized segmentation files.
    '''
    if size is not None and segmentation_size is None:
        segmentation_size = size
    # Get assignments into a nice form, once, here.
    # (dataset, category): [numpy array with new indexes]
    index_max = build_histogram([((ds, cat), i)
                                 for ds, cat, i in assignments.keys()], max)
    index_mapping = {k: numpy.zeros(i + 1, dtype=numpy.int16)
                     for k, i in index_max.items()}
    for (ds, cat, oldindex), newindex in assignments.items():
        index_mapping[(ds, cat)][oldindex] = newindex
    # Count frequency and coverage for each individual image
    segmented = map_in_pool(partial(translate_segmentation,
                                    directory=directory,
                                    mapping=index_mapping,
                                    size=size,
                                    segmentation_size=segmentation_size,
                                    categories=cats,
                                    crop=crop,
                                    verbose=verbose),
                            all_dataset_segmentations(data_sets, test_limit),
                            single_process=single_process,
                            verbose=verbose)
    # Sort nonempty items randomly+reproducibly by md5 hash of the filename.
    ordered = sorted([(hashed_float(r['image']), r) for r in segmented if r])
    # Assign splits, pulling out the last 20% for validation.
    cutoffs = cumulative_splits(splits)
    for floathash, record in ordered:
        for name, cutoff in cutoffs:
            if floathash <= cutoff:
                record['split'] = name
                break
        else:
            assert False, 'hash %f exceeds last split %f' % (
                floathash, cutoffs[-1][1])

    # Now write one row per image and one column per category
    with open(os.path.join(directory, 'index.csv'), 'w') as csvfile:
        fields = ['image', 'split', 'ih', 'iw', 'sh', 'sw'] + cats
        writer = DictUnicodeWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for _, record in ordered:
            writer.writerow(record)
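
Example #1 relies on several helpers defined elsewhere in the module. The
split logic depends on hashed_float and cumulative_splits, and the parallel
pass over the datasets on map_in_pool. Below is a minimal sketch of plausible
implementations, not the module's actual code: hashed_float is assumed to
derive a deterministic float in [0, 1) from the md5 digest of the filename
(the comment in the code confirms md5 is the mechanism), and
cumulative_splits is assumed to turn fractional split sizes into cumulative
cutoffs.

import hashlib
import multiprocessing


def map_in_pool(fn, data, single_process=False, verbose=False):
    # Apply fn to every item of data, fanning the work out to a
    # multiprocessing pool; single_process=True keeps everything in one
    # process, which is easier to debug. (verbose is accepted for
    # interface compatibility but ignored in this sketch.)
    if single_process:
        return list(map(fn, data))
    with multiprocessing.Pool() as pool:
        return pool.map(fn, data)


def hashed_float(s):
    # Deterministic pseudo-random float in [0, 1): interpret the md5
    # digest of the string as a 128-bit integer and normalize it.
    return int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16) / 2.0 ** 128


def cumulative_splits(splits):
    # Turn fractional split sizes into cumulative cutoffs, e.g.
    # {'train': 0.7, 'val': 0.3} -> [('train', 0.7), ('val', 1.0)],
    # assuming splits is an ordered mapping whose fractions sum to 1.
    result, total = [], 0.0
    for name, fraction in splits.items():
        total += fraction
        result.append((name, total))
    return result

With cutoffs such as [('train', 0.7), ('val', 1.0)], every image whose
filename hashes at or below 0.7 lands in 'train'; because the hash depends
only on the filename, the split assignment is reproducible across runs and
machines.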
Example #2
def write_label_files(directory, names, assignments, frequency, coverage, syns,
                      verbose):
    '''
    Phase 4 of unification.
    Collate some stats and then write them to two metadata files.
    '''
    # Make lists of synonyms claimed by each label
    synmap = invert_dict(dict(
        (w, assignments[lab]) for w, lab in syns.items()))
    # We need an (index, category) count
    ic_freq = join_histogram_fn(frequency, lambda x: (assignments[x], x[1]))
    ic_cov = join_histogram_fn(coverage, lambda x: (assignments[x], x[1]))
    for z in [(j, cat) for j, cat in ic_freq if j == 0]:
        del ic_freq[z]
        del ic_cov[z]
    catstats = [[] for _ in names]
    # For each index, get a (category, frequency) list in descending order
    for (ind, cat), f in sorted(ic_freq.items(), key=lambda x: -x[1]):
        catstats[ind].append((cat, f))
    index_coverage = join_histogram(coverage, assignments)
    with open(os.path.join(directory, 'label.csv'), 'w') as csvfile:
        fields = [
            'number', 'name', 'category', 'frequency', 'coverage', 'syns'
        ]
        writer = DictUnicodeWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for ind, name in enumerate(names):
            if ind == 0:
                continue
            writer.writerow(
                dict(number='%d' % ind,
                     name=name,
                     category=';'.join('%s(%d)' % s for s in catstats[ind]),
                     frequency='%d' % sum(f for c, f in catstats[ind]),
                     coverage='%f' % index_coverage[ind],
                     syns=';'.join([s for s in synmap[ind] if s != name])))
    # For each category, figure the first, last, and other stats
    cat_ind = [(cat, ind) for ind, cat in ic_freq.keys()]
    first_index = build_histogram(cat_ind, min)
    last_index = build_histogram(cat_ind, max)
    count_labels = build_histogram([(cat, 1) for cat, _ in cat_ind])
    cat_freq = join_histogram_fn(ic_freq, lambda x: x[1])
    cats = sorted(first_index.keys(), key=lambda x: first_index[x])
    with open(os.path.join(directory, 'category.csv'), 'w') as csvfile:
        fields = ['name', 'first', 'last', 'count', 'frequency']
        writer = DictUnicodeWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for cat in cats:
            writer.writerow(
                dict(name=cat,
                     first=first_index[cat],
                     last=last_index[cat],
                     count=count_labels[cat],
                     frequency=cat_freq[cat]))
    # And for each category, create a dense coding file.
    for cat in cats:
        dense_code = [0] + sorted(
            [i for i, c in ic_freq if c == cat],
            key=lambda i: (-ic_freq[(i, cat)], -ic_cov[(i, cat)]))
        fields = ['code', 'number', 'name', 'frequency', 'coverage']
        with open(os.path.join(directory, 'c_%s.csv' % cat), 'w') as csvfile:
            writer = DictUnicodeWriter(csvfile, fieldnames=fields)
            writer.writeheader()
            for code, i in enumerate(dense_code):
                if code == 0:
                    continue
                writer.writerow(
                    dict(code=code,
                         number=i,
                         name=names[i],
                         frequency=ic_freq[(i, cat)],
                         coverage=ic_cov[(i, cat)]))
    return cats
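
Example #2 (and the mapping setup in Example #1) assumes a family of small
dictionary helpers. Below is one plausible sketch of build_histogram,
join_histogram_fn, join_histogram, and invert_dict, treating a histogram as
a plain dict from key to numeric value; the real implementations may differ
in detail.

import operator


def build_histogram(pairs, reducer=operator.add):
    # Combine (key, value) pairs into a dict; values that share a key
    # are merged with the reducer (default: addition, giving a count).
    result = {}
    for k, v in pairs:
        result[k] = v if k not in result else reducer(result[k], v)
    return result


def join_histogram_fn(histogram, makekey):
    # Re-bucket a histogram by mapping each key through makekey and
    # summing the values that collide.
    result = {}
    for k, v in histogram.items():
        newkey = makekey(k)
        result[newkey] = result.get(newkey, 0) + v
    return result


def join_histogram(histogram, mapping):
    # Same as join_histogram_fn, but keys are looked up in a mapping.
    return join_histogram_fn(histogram, lambda k: mapping[k])


def invert_dict(d):
    # {key: value} -> {value: [keys...]}; used above to collect the
    # synonyms that map to each label index.
    result = {}
    for k, v in d.items():
        result.setdefault(v, []).append(k)
    return result

For example, build_histogram([('a', 1), ('a', 1), ('b', 1)]) yields
{'a': 2, 'b': 1}, while passing max as the reducer keeps the largest value
seen per key, which is how index_max is computed in Example #1.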