Example #1
    def biggest_dirs(drive):
        print('Biggest Dirs in %r' % (drive,))
        dpath_list = drive.dpath_list
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
        unflat_dpath_bytes_list = ut.list_unflat_take(drive.fpath_bytes_list, fidxs_list)
        dpath_nbytes_list = list(map(sum, unflat_dpath_bytes_list))

        sortx = ut.list_argsort(dpath_nbytes_list)[::-1]
        sel = sortx[0:10]
        biggest_nbytes = ut.take(dpath_nbytes_list, sel)
        biggest_dpaths = ut.take(dpath_list, sel)
        biginfo_list = list(zip(map(ut.byte_str2, biggest_nbytes), biggest_dpaths))
        print(ut.list_str(biginfo_list, strvals=True))
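
All of the examples on this page lean on a handful of utool indexing helpers whose implementations are not shown here. As a rough sketch, assuming the semantics implied by the usage above (`dict_take` looks up a list of keys, `take` fancy-indexes a list, `list_unflat_take` applies `take` per nested index list, and `list_argsort` returns sorting indices), the pattern reduces to plain Python:

def take(items, indices):
    # pick the items at the given positions
    return [items[ix] for ix in indices]

def dict_take(dict_, keys):
    # look up each key in order
    return [dict_[key] for key in keys]

def list_unflat_take(items, unflat_indices):
    # apply take() to each nested list of indices
    return [take(items, ixs) for ixs in unflat_indices]

def list_argsort(list_):
    # indices that would sort the list ascending
    return sorted(range(len(list_)), key=list_.__getitem__)

# e.g. summing bytes per directory, as biggest_dirs does:
fpath_bytes_list = [10, 20, 30, 40]
dpath_to_fidx = {'/a': [0, 1], '/b': [2, 3]}
fidxs_list = dict_take(dpath_to_fidx, ['/a', '/b'])
sizes = [sum(b) for b in list_unflat_take(fpath_bytes_list, fidxs_list)]
assert sizes == [30, 70]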
Example #2
    def biggest_dirs(drive):
        print('Biggest Dirs in %r' % (drive, ))
        dpath_list = drive.dpath_list
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
        unflat_dpath_bytes_list = ut.list_unflat_take(drive.fpath_bytes_list,
                                                      fidxs_list)
        dpath_nbytes_list = list(map(sum, unflat_dpath_bytes_list))

        sortx = ut.list_argsort(dpath_nbytes_list)[::-1]
        sel = sortx[0:10]
        biggest_nbytes = ut.take(dpath_nbytes_list, sel)
        biggest_dpaths = ut.take(dpath_list, sel)
        biginfo_list = list(
            zip(map(ut.byte_str2, biggest_nbytes), biggest_dpaths))
        print(ut.repr2(biginfo_list, strvals=True))
Example #3
def make_name_model(num_annots, num_names=None, verbose=True, mode=1):
    """
    Defines the general name model

    CommandLine:
        python -m wbia.algo.hots.bayes --exec-make_name_model --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True, mode=2)
        >>> kw = ut.argparse_funckw(make_name_model, defaults)
        >>> model = make_name_model(**kw)
        >>> ut.quit_if_noshow()
        >>> show_model(model, show_prior=True)
        >>> ut.show_if_requested()
    """
    # annots = ut.chr_range(num_annots, base='a')
    mode = ut.get_argval('--mode', default=mode)
    annots = ut.chr_range(num_annots,
                          base=ut.get_argval('--base', default='a'))
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if num_names is None:
        num_names = num_annots

    # -- Define CPD Templates
    def match_pmf(match_type, n1, n2):
        if n1 == n2:
            val = 1.0 if match_type == 'same' else 0.0
            # val = .999 if match_type == 'same' else 0.001
        else:
            # val = 0.01 if match_type == 'same' else .99
            val = 0.0 if match_type == 'same' else 1.0
        return val

    def score_pmf(score_type, match_type):
        score_lookup = {
            'same': {
                'low': 0.1,
                'high': 0.9,
                'veryhigh': 0.9
            },
            'diff': {
                'low': 0.9,
                'high': 0.09,
                'veryhigh': 0.01
            }
            #'same': {'low': .1, 'high': .9},
            #'diff': {'low': .9, 'high': .1}
        }
        val = score_lookup[match_type][score_type]
        return val

    def score_pmf3(score_type, match_type, isdup='False'):
        score_lookup = {
            'False': {
                'same': {
                    'low': 0.1,
                    'high': 0.5,
                    'veryhigh': 0.4
                },
                'diff': {
                    'low': 0.9,
                    'high': 0.09,
                    'veryhigh': 0.01
                },
            },
            'True': {
                'same': {
                    'low': 0.01,
                    'high': 0.2,
                    'veryhigh': 0.79
                },
                'diff': {
                    'low': 0.4,
                    'high': 0.4,
                    'veryhigh': 0.2
                },
            },
        }
        val = score_lookup[isdup][match_type][score_type]
        return val

    def score_pmf2(score_type, n1, n2):
        score_lookup = {
            True: {
                'low': 0.1,
                'high': 0.4,
                'veryhigh': 0.5
            },
            False: {
                'low': 0.9,
                'high': 0.09,
                'veryhigh': 0.01
            },
        }
        val = score_lookup[n1 == n2][score_type]
        return val

    def dup_pmf(dupstate, match_type):
        lookup = {
            'same': {
                'True': 0.5,
                'False': 0.5
            },
            'diff': {
                'True': 0.0,
                'False': 1.0
            },
        }
        return lookup[match_type][dupstate]

    def check_pmf(n0, n1, match_type):
        pass

    def trimatch_pmf(match_ab, match_bc, match_ca):
        lookup = {
            'same': {
                'same': {
                    'same': 1,
                    'diff': 0
                },
                'diff': {
                    'same': 0,
                    'diff': 1
                },
            },
            'diff': {
                'same': {
                    'same': 0,
                    'diff': 1
                },
                'diff': {
                    'same': 0.5,
                    'diff': 0.5
                },
            },
        }
        return lookup[match_ca][match_bc][match_ab]

    name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names),
                                     varpref='N',
                                     special_basis_pool=SPECIAL_BASIS_POOL)

    if mode == 1 or mode == 5:
        match_cpd_t = pgm_ext.TemplateCPD(
            'match',
            ['diff', 'same'],
            varpref='M',
            evidence_ttypes=[name_cpd_t, name_cpd_t],
            pmf_func=match_pmf,
        )

        if mode == 5:
            trimatch_cpd_t = pgm_ext.TemplateCPD(
                'tri_match',
                ['diff', 'same'],
                varpref='T',
                # evidence_ttypes=[match_cpd_t, match_cpd_t, match_cpd_t],
                evidence_ttypes=[match_cpd_t, match_cpd_t],
                pmf_func=trimatch_pmf,
            )

        # modes 1 and 5 share the same score CPD template
        score_cpd_t = pgm_ext.TemplateCPD(
            #'score', ['low', 'high', 'veryhigh'],
            'score',
            ['low', 'high'],
            varpref='S',
            evidence_ttypes=[match_cpd_t],
            pmf_func=score_pmf,
        )

    elif mode == 2:
        # reuse the name CPD template defined above
        score_cpd_t = pgm_ext.TemplateCPD(
            #'score', ['low', 'high', 'veryhigh'],
            'score',
            ['low', 'high'],
            varpref='S',
            evidence_ttypes=[name_cpd_t, name_cpd_t],
            pmf_func=score_pmf2,
        )
    elif mode == 3 or mode == 4:
        match_cpd_t = pgm_ext.TemplateCPD(
            'match',
            ['diff', 'same'],
            varpref='M',
            evidence_ttypes=[name_cpd_t, name_cpd_t],
            pmf_func=match_pmf,
        )
        if mode == 3:
            dup_cpd_t = pgm_ext.TemplateCPD('dup', ['False', 'True'],
                                            varpref='D')
        else:
            dup_cpd_t = pgm_ext.TemplateCPD(
                'dup',
                ['False', 'True'],
                varpref='D',
                evidence_ttypes=[match_cpd_t],
                pmf_func=dup_pmf,
            )
        score_cpd_t = pgm_ext.TemplateCPD(
            'score',
            ['low', 'high', 'veryhigh'],
            varpref='S',
            evidence_ttypes=[match_cpd_t, dup_cpd_t],
            pmf_func=score_pmf3,
        )

    # Instantiate templates

    if mode == 1 or mode == 5:
        name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
        namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
        match_cpds = [
            match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds
        ]
        score_cpds = [
            score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)
        ]
        if mode == 5:
            # triple_idxs = ut.colwise_diag_idxs(num_annots, 3)
            tid2_match = {cpd._template_id: cpd for cpd in match_cpds}
            trimatch_cpds = []
            # HACK: give each match CPD M_xy the two other match CPDs that
            # share an annotation with it as tri_match parents
            for cpd in match_cpds:
                parents = []
                this_ = list(cpd._template_id)
                for aid in annots:
                    if aid in this_:
                        continue
                    for aid2 in this_:
                        key = aid2 + aid
                        if key not in tid2_match:
                            key = aid + aid2
                        parents += [tid2_match[key]]
                trimatch_cpds += [trimatch_cpd_t.new_cpd(parents=parents)]

            # score_cpds = [score_cpd_t.new_cpd(parents=cpds)
            #              for cpds in zip(trimatch_cpds)]

            cpd_list = name_cpds + score_cpds + match_cpds + trimatch_cpds
        else:
            cpd_list = name_cpds + score_cpds + match_cpds
    elif mode == 2:
        name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
        namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
        score_cpds = [
            score_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds
        ]
        cpd_list = name_cpds + score_cpds
    elif mode == 3 or mode == 4:
        name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
        namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
        match_cpds = [
            match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds
        ]
        if mode == 3:
            dup_cpds = [
                dup_cpd_t.new_cpd(parents=''.join(map(str, aids)))
                for aids in ut.list_unflat_take(annots, upper_diag_idxs)
            ]
        else:
            dup_cpds = [
                dup_cpd_t.new_cpd(parents=[mcpds]) for mcpds in match_cpds
            ]
        score_cpds = [
            score_cpd_t.new_cpd(parents=([mcpds] + [dcpd]))
            for mcpds, dcpd in zip(match_cpds, dup_cpds)
        ]
        cpd_list = name_cpds + score_cpds + match_cpds + dup_cpds

    # logger.info('upper_diag_idxs = %r' % (upper_diag_idxs,))
    logger.info('score_cpds = %r' %
                (ut.list_getattr(score_cpds, 'variable'), ))
    # import sys
    # sys.exit(1)

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names

    if verbose:
        model.print_templates()
        # ut.colorprint('\n --- CPD Templates ---', 'blue')
        # for temp_cpd in templates:
        #    ut.colorprint(temp_cpd._cpdstr('psql'), 'cyan')
    # print_ascii_graph(model)
    return model
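
`ut.colwise_diag_idxs(num_annots, 2)` is not shown on this page; from the way its result pairs up `name_cpds`, it enumerates the index pairs above the diagonal of a `num_annots` x `num_annots` grid, ordered column by column. That ordering is what makes the comment in the code true: adding an annotation appends new pairs without renumbering existing ones. A minimal sketch under that assumption:

import itertools

def colwise_diag_idxs_sketch(n, k=2):
    # upper-diagonal index pairs, grouped by their larger ("column") index
    return sorted(itertools.combinations(range(n), k), key=lambda t: t[::-1])

# pairs for 3 annots, then 4: the first three entries are identical,
# so match CPD indexes stay stable when an annotation is added
assert colwise_diag_idxs_sketch(3) == [(0, 1), (0, 2), (1, 2)]
assert colwise_diag_idxs_sketch(4) == [(0, 1), (0, 2), (1, 2), (0, 3), (1, 3), (2, 3)]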
Example #4
def name_model_mode1(num_annots, num_names=None, verbose=True):
    r"""
    spaghetti

    CommandLine:
        python -m wbia.algo.hots.bayes --exec-name_model_mode1 --show
        python -m wbia.algo.hots.bayes --exec-name_model_mode1
        python -m wbia.algo.hots.bayes --exec-name_model_mode1 --num-annots=3

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True)
        >>> kw = ut.argparse_funckw(name_model_mode1, defaults)
        >>> model = name_model_mode1(**kw)
        >>> ut.quit_if_noshow()
        >>> show_model(model, show_prior=False, show_title=False)
        >>> ut.show_if_requested()

    Ignore:
        import nx2tikz
        logger.info(nx2tikz.dumps_tikz(model, layout='layered', use_label=True))
    """
    annots = ut.chr_range(num_annots,
                          base=ut.get_argval('--base', default='a'))
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if num_names is None:
        num_names = num_annots

    # +--- Define CPD Templates ---

    # +-- Name Factor ---
    name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names),
                                     varpref='N',
                                     special_basis_pool=SPECIAL_BASIS_POOL)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]

    # +-- Match Factor ---
    def match_pmf(match_type, n1, n2):
        return {
            True: {
                'same': 1.0,
                'diff': 0.0
            },
            False: {
                'same': 0.0,
                'diff': 1.0
            },
        }[n1 == n2][match_type]

    match_cpd_t = pgm_ext.TemplateCPD(
        'match',
        ['diff', 'same'],
        varpref='M',
        evidence_ttypes=[name_cpd_t, name_cpd_t],
        pmf_func=match_pmf,
    )
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds]

    # +-- Score Factor ---
    def score_pmf(score_type, match_type):
        score_lookup = {
            'same': {
                'low': 0.1,
                'high': 0.9,
                'veryhigh': 0.9
            },
            'diff': {
                'low': 0.9,
                'high': 0.09,
                'veryhigh': 0.01
            },
        }
        val = score_lookup[match_type][score_type]
        return val

    score_cpd_t = pgm_ext.TemplateCPD(
        'score',
        ['low', 'high'],
        varpref='S',
        evidence_ttypes=[match_cpd_t],
        pmf_func=score_pmf,
    )
    score_cpds = [
        score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)
    ]

    # L___ End CPD Definitions ___

    cpd_list = name_cpds + score_cpds + match_cpds
    logger.info('score_cpds = %r' %
                (ut.list_getattr(score_cpds, 'variable'), ))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names

    if verbose:
        model.print_templates()
    return model
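
The `match_pmf` above encodes a deterministic CPT: the match state is fully determined by whether the two parent names agree. Read straight off the lookup table in the code:

assert match_pmf('same', 0, 0) == 1.0   # equal names   -> 'same' with certainty
assert match_pmf('diff', 0, 0) == 0.0
assert match_pmf('same', 0, 1) == 0.0   # unequal names -> 'diff' with certainty
assert match_pmf('diff', 0, 1) == 1.0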
Example #5
def name_model_mode5(num_annots, num_names=None, verbose=True, mode=1):
    mode = ut.get_argval('--mode', default=mode)
    annots = ut.chr_range(num_annots,
                          base=ut.get_argval('--base', default='a'))
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if num_names is None:
        num_names = num_annots

    # -- Define CPD Templates

    name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names),
                                     varpref='N',
                                     special_basis_pool=SPECIAL_BASIS_POOL)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]

    def match_pmf(match_type, n1, n2):
        return {
            True: {
                'same': 1.0,
                'diff': 0.0
            },
            False: {
                'same': 0.0,
                'diff': 1.0
            },
        }[n1 == n2][match_type]

    match_cpd_t = pgm_ext.TemplateCPD(
        'match',
        ['diff', 'same'],
        varpref='M',
        evidence_ttypes=[name_cpd_t, name_cpd_t],
        pmf_func=match_pmf,
    )
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds]

    def trimatch_pmf(match_ab, match_bc, match_ca):
        lookup = {
            'same': {
                'same': {
                    'same': 1,
                    'diff': 0
                },
                'diff': {
                    'same': 0,
                    'diff': 1
                },
            },
            'diff': {
                'same': {
                    'same': 0,
                    'diff': 1
                },
                'diff': {
                    'same': 0.5,
                    'diff': 0.5
                },
            },
        }
        return lookup[match_ca][match_bc][match_ab]

    trimatch_cpd_t = pgm_ext.TemplateCPD(
        'tri_match',
        ['diff', 'same'],
        varpref='T',
        evidence_ttypes=[match_cpd_t, match_cpd_t],
        pmf_func=trimatch_pmf,
    )
    # triple_idxs = ut.colwise_diag_idxs(num_annots, 3)
    tid2_match = {cpd._template_id: cpd for cpd in match_cpds}
    trimatch_cpds = []
    # HACK: give each match CPD M_xy the two other match CPDs that share an
    # annotation with it as tri_match parents; e.g. with annots 'abc', the
    # tri_match built for M_ab gets parents [M_ac, M_bc]
    for cpd in match_cpds:
        parents = []
        this_ = list(cpd._template_id)
        for aid in annots:
            if aid in this_:
                continue
            for aid2 in this_:
                key = aid2 + aid
                if key not in tid2_match:
                    key = aid + aid2
                parents += [tid2_match[key]]
        trimatch_cpds += [trimatch_cpd_t.new_cpd(parents=parents)]

    def score_pmf(score_type, match_type):
        score_lookup = {
            'same': {
                'low': 0.1,
                'high': 0.9,
                'veryhigh': 0.9
            },
            'diff': {
                'low': 0.9,
                'high': 0.09,
                'veryhigh': 0.01
            },
        }
        val = score_lookup[match_type][score_type]
        return val

    score_cpd_t = pgm_ext.TemplateCPD(
        'score',
        ['low', 'high'],
        varpref='S',
        evidence_ttypes=[match_cpd_t],
        pmf_func=score_pmf,
    )
    score_cpds = [
        score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)
    ]

    # score_cpds = [score_cpd_t.new_cpd(parents=cpds)
    #              for cpds in zip(trimatch_cpds)]

    cpd_list = name_cpds + score_cpds + match_cpds + trimatch_cpds
    logger.info('score_cpds = %r' %
                (ut.list_getattr(score_cpds, 'variable'), ))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names

    if verbose:
        model.print_templates()
    return model
Example #6
def make_name_model(num_annots, num_names=None, verbose=True, mode=1,
                    num_scores=2, p_score_given_same=None,
                    hack_score_only=False, score_basis=None,
                    special_names=None):
    r"""
    CommandLine:
        python -m ibeis.algo.hots.bayes --exec-make_name_model --show
        python -m ibeis.algo.hots.bayes --exec-make_name_model
        python -m ibeis.algo.hots.bayes --exec-make_name_model --num-annots=3

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True)
        >>> modeltype = ut.get_argval('--modeltype', default='bayes')
        >>> kw = ut.argparse_funckw(make_name_model, defaults)
        >>> model = make_name_model(**kw)
        >>> ut.quit_if_noshow()
        >>> model.show_model(show_prior=False, show_title=False, modeltype=modeltype)
        >>> ut.show_if_requested()
    """
    if special_names is None:
        special_names = SPECIAL_BASIS_POOL

    assert mode == 1, 'can only do mode 1'
    base = ut.get_argval('--base', type_=str, default='a')
    annots = ut.chr_range(num_annots, base=base)
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if hack_score_only:
        upper_diag_idxs = upper_diag_idxs[-hack_score_only:]

    if num_names is None:
        num_names = num_annots

    # +--- Define CPD Templates and Instantiation ---
    cpd_list = []

    # Name Factor
    name_cpd_t = pgm_ext.TemplateCPD(
        'name', ('n', num_names),
        special_basis_pool=special_names)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
    #name_cpds = [name_cpd_t.new_cpd(parents=aid, constrain_state=count)
    #             for count, aid in enumerate(annots, start=1)]
    cpd_list.extend(name_cpds)

    # Match Factor
    def match_pmf(match_type, n1, n2):
        return {
            True: {'same': 1.0, 'diff': 0.0},
            False: {'same': 0.0, 'diff': 1.0},
        }[n1 == n2][match_type]
    match_states = ['diff', 'same']
    match_cpd_t = pgm_ext.TemplateCPD(
        'match', match_states,
        evidence_ttypes=[name_cpd_t, name_cpd_t], pmf_func=match_pmf)
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds)
                  for cpds in namepair_cpds]
    cpd_list.extend(match_cpds)

    # Score Factor
    score_states = list(range(num_scores))
    if score_basis is not None:
        score_states = ['%.2f' % (s,) for s in score_basis]
    if p_score_given_same is None:
        tmp = np.arange(num_scores + 1)[1:]
        tmp = np.cumsum(tmp)
        tmp = (tmp / tmp.sum())
        p_score_given_same = tmp
    def score_pmf(score_type, match_type):
        if isinstance(score_type, six.string_types):
            score_type = score_states.index(score_type)
        if match_type == 'same':
            return p_score_given_same[score_type]
        else:
            return p_score_given_same[-(score_type + 1)]
    score_cpd_t = pgm_ext.TemplateCPD(
        'score', score_states,
        evidence_ttypes=[match_cpd_t], pmf_func=score_pmf)
    score_cpds = [score_cpd_t.new_cpd(parents=cpds)
                  for cpds in zip(match_cpds)]
    cpd_list.extend(score_cpds)

    with_humans = False
    if with_humans:
        human_states = ['diff', 'same']
        human_cpd_t = pgm_ext.TemplateCPD(
            'human', human_states,
            evidence_ttypes=[match_cpd_t], pmf_func=[[.9, .1], [.1, .9]])
        human_cpds = [human_cpd_t.new_cpd(parents=cpds)
                      for cpds in zip(match_cpds)]
        cpd_list.extend(human_cpds)

    with_rank = False  # Rank depends on dependent scores
    if with_rank:
        rank_states = ['0', '1', '2', '3']
        rank_cpd_t = pgm_ext.TemplateCPD(
            'rank', rank_states,
            evidence_ttypes=[match_cpd_t], pmf_func=None)
        rank_cpds = [rank_cpd_t.new_cpd(parents=cpds)
                     for cpds in zip(match_cpds)]
        cpd_list.extend(rank_cpds)

    # L___ End CPD Definitions ___

    print('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names

    if verbose:
        model.print_templates(ignore_ttypes=['match'])
    return model
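
When `p_score_given_same` is not supplied, it is built from a cumulative ramp, so higher scores become progressively more likely under a 'same' match; the 'diff' case reads the same vector from the back. Working the arithmetic through for `num_scores=2` (the default) and `num_scores=3`:

import numpy as np

for num_scores in (2, 3):
    tmp = np.arange(num_scores + 1)[1:]   # [1, 2] or [1, 2, 3]
    tmp = np.cumsum(tmp)                  # [1, 3] or [1, 3, 6]
    p_score_given_same = tmp / tmp.sum()  # [0.25, 0.75] or [0.1, 0.3, 0.6]
    print(num_scores, p_score_given_same)

# score_pmf then gives, for num_scores=2:
#   P(low | same) = 0.25, P(high | same) = 0.75
#   P(low | diff) = 0.75, P(high | diff) = 0.25  (the reversed indexing)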
Example #7
    def get_col(table, tbl_rowids, colnames=None):
        """
        colnames = ('mask', 'size')

        FIXME: unpacking is confusing with the SQL controller
        """
        # print('Get prop of %r, colnames=%r' % (table, colnames))
        try:
            request_unpack = False
            if colnames is None:
                colnames = table.data_colnames
                #table._internal_data_colnames
            else:
                if isinstance(colnames, six.text_type):
                    request_unpack = True
                    colnames = (colnames,)
            # print('* colnames = %r' % (colnames,))

            eager = True
            nInput = None

            total = 0
            intern_colnames = []
            extern_resolve_colxs = []
            nesting_xs = []

            for c in colnames:
                if c in table.external_to_internal:
                    intern_colnames.append([table.external_to_internal[c]])
                    read_func = table.extern_read_funcs[c]
                    extern_resolve_colxs.append((total, read_func))
                    nesting_xs.append(total)
                    total += 1
                elif c in table.nested_to_flat:
                    nest = table.nested_to_flat[c]
                    nesting_xs.append(list(range(total, total + len(nest))))
                    intern_colnames.append(nest)
                    total += len(nest)
                else:
                    nesting_xs.append(total)
                    intern_colnames.append([c])
                    total += 1

            flat_intern_colnames = tuple(ut.flatten(intern_colnames))

            # do sql read
            # FIXME: understand unpack_scalars and keepwrap
            raw_prop_list = table.get_internal_columns(
                tbl_rowids, flat_intern_colnames, eager, nInput,
                unpack_scalars=True, keepwrap=True)
            # unpack_scalars=not
            # request_unpack)
            # print('depth(raw_prop_list) = %r' % (ut.depth_profile(raw_prop_list),))

            prop_listT = list(zip(*raw_prop_list))
            for extern_colx, read_func in extern_resolve_colxs:
                data_list = []
                for uri in prop_listT[extern_colx]:
                    try:
                        # FIXME: only do this for a localpath
                        uri1 = ut.unixjoin(table.depc.cache_dpath, uri)
                        data = read_func(uri1)
                    except Exception as ex:
                        ut.printex(ex, 'failed to load external data', iswarning=False)
                        raise
                        # FIXME
                        #data = None
                    data_list.append(data)
                prop_listT[extern_colx] = data_list

            nested_proplistT = ut.list_unflat_take(prop_listT, nesting_xs)

            for tx in ut.where([isinstance(xs, list) for xs in nesting_xs]):
                nested_proplistT[tx] = list(zip(*nested_proplistT[tx]))

            prop_list = list(zip(*nested_proplistT))

            if request_unpack:
                prop_list = [None if p is None else p[0] for p in prop_list]
        except Exception as ex:
            ut.printex(ex, 'failed in get col', keys=[
                'table.tablename',
                'request_unpack',
                'tbl_rowids',
                'colnames',
                'raw_prop_list',
                (ut.depth_profile, 'raw_prop_list'),
                'prop_listT',
                (ut.depth_profile, 'prop_listT'),
                'nesting_xs',
                'nested_proplistT',
                'prop_list'])
            raise
        return prop_list
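
The core reshaping trick in `get_col` is transposing the row-major SQL result into columns with `zip(*...)`, resolving external columns in place, regrouping nested columns, and transposing back to rows. A stripped-down sketch of just that pipeline, on hypothetical toy data with no utool:

rows = [(1, 'a', 10), (2, 'b', 20)]                  # row-major SQL result
cols = list(zip(*rows))                              # column-major, like prop_listT
cols[2] = [v * 2 for v in cols[2]]                   # stand-in for an extern read_func
nested = [cols[0], list(zip(cols[1], cols[2]))]      # regroup columns 1-2 into one nested column
prop_list = list(zip(*nested))                       # back to row-major
assert prop_list == [(1, ('a', 20)), (2, ('b', 40))]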
Example #8
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the copy
        that lives in the smallest directory; on a tie, use the smallest
        dpath. This funnels all duplicates of a file into a single folder.

        But we also need to look at the non-duplicates in that folder and
        decide whether they should be moved as well, so this should only
        trigger on folders that are at least 50% duplicates. We might not
        want to move curated folders.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])  # e.g. ['D:/', 'E:/', 'F:/']
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive,))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_)
            if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes),))
        #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)
        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))
        unflat_fname_sets = list(map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [key for key, val in multiindex_dict2_.items() if len(val) > 1]
        print('#fname_based_duplicate_dpaths = %r' % (len(fname_based_duplicate_hashes),))
        fname_based_duplicate_didxs = ut.dict_take(multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
Example #9
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the copy
        that lives in the smallest directory; on a tie, use the smallest
        dpath. This funnels all duplicates of a file into a single folder.

        But we also need to look at the non-duplicates in that folder and
        decide whether they should be moved as well, so this should only
        trigger on folders that are at least 50% duplicates. We might not
        want to move curated folders.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])  # e.g. ['D:/', 'E:/', 'F:/']
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive, ))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list,
                                           duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(
                    list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes), ))
        #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))

        unflat_fname_sets = list(
            map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [
            key for key, val in multiindex_dict2_.items() if len(val) > 1
        ]
        print('#fname_based_duplicate_dpaths = %r' %
              (len(fname_based_duplicate_hashes), ))
        fname_based_duplicate_didxs = ut.dict_take(
            multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(
            dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
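
`build_multindex` is not shown on this page; judging from how it is used (a list of values in, a mapping from each value to the list of positions where it occurs out), it is an inverted index. A minimal sketch under that assumption:

from collections import defaultdict

def build_multindex(list_):
    # map each value to the list of indices where it appears
    multiindex_dict_ = defaultdict(list)
    for idx, item in enumerate(list_):
        multiindex_dict_[item].append(idx)
    return dict(multiindex_dict_)

# duplicate detection then falls out directly:
hashes = ['h1', 'h2', 'h1', 'h3', 'h1']
multi = build_multindex(hashes)
assert multi['h1'] == [0, 2, 4]
assert [k for k, v in multi.items() if len(v) > 1] == ['h1']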
Example #10
def make_name_model(num_annots, num_names=None, verbose=True, mode=1,
                    num_scores=2, p_score_given_same=None,
                    hack_score_only=False, score_basis=None,
                    special_names=None):
    r"""
    CommandLine:
        python -m ibeis.algo.hots.bayes --exec-make_name_model --no-cnn
        python -m ibeis.algo.hots.bayes --exec-make_name_model --show --no-cnn
        python -m ibeis.algo.hots.bayes --exec-make_name_model --num-annots=3

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True)
        >>> modeltype = ut.get_argval('--modeltype', default='bayes')
        >>> kw = ut.argparse_funckw(make_name_model, defaults)
        >>> model = make_name_model(**kw)
        >>> ut.quit_if_noshow()
        >>> model.show_model(show_prior=False, show_title=False, modeltype=modeltype)
        >>> ut.show_if_requested()
    """
    if special_names is None:
        special_names = SPECIAL_BASIS_POOL

    assert mode == 1, 'can only do mode 1'
    base = ut.get_argval('--base', type_=str, default='a')
    annots = ut.chr_range(num_annots, base=base)
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if hack_score_only:
        upper_diag_idxs = upper_diag_idxs[-hack_score_only:]

    if num_names is None:
        num_names = num_annots

    # +--- Define CPD Templates and Instantiation ---
    cpd_list = []

    # Name Factor
    name_cpd_t = pgm_ext.TemplateCPD(
        NAME_TTYPE, ('n', num_names),
        special_basis_pool=special_names)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
    #name_cpds = [name_cpd_t.new_cpd(parents=aid, constrain_state=count)
    #             for count, aid in enumerate(annots, start=1)]
    cpd_list.extend(name_cpds)

    # Match Factor
    def match_pmf(match_type, n1, n2):
        return {
            True: {'same': 1.0, 'diff': 0.0},
            False: {'same': 0.0, 'diff': 1.0},
        }[n1 == n2][match_type]
    match_states = ['diff', 'same']
    match_cpd_t = pgm_ext.TemplateCPD(
        MATCH_TTYPE, match_states,
        evidence_ttypes=[name_cpd_t, name_cpd_t], pmf_func=match_pmf)
    #match_cpd_t.varpref = 'S'
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds)
                  for cpds in namepair_cpds]
    cpd_list.extend(match_cpds)

    # Score Factor
    score_states = list(range(num_scores))
    if score_basis is not None:
        score_states = ['%.2f' % (s,) for s in score_basis]
    if p_score_given_same is None:
        tmp = np.arange(num_scores + 1)[1:]
        tmp = np.cumsum(tmp)
        tmp = (tmp / tmp.sum())
        p_score_given_same = tmp
    def score_pmf(score_type, match_type):
        if isinstance(score_type, six.string_types):
            score_type = score_states.index(score_type)
        if match_type == 'same':
            return p_score_given_same[score_type]
        else:
            return p_score_given_same[-(score_type + 1)]
    score_cpd_t = pgm_ext.TemplateCPD(
        SCORE_TTYPE, score_states,
        evidence_ttypes=[match_cpd_t], pmf_func=score_pmf)
    #match_cpd_t.varpref = 'P'
    score_cpds = [score_cpd_t.new_cpd(parents=cpds)
                  for cpds in zip(match_cpds)]
    cpd_list.extend(score_cpds)

    with_humans = False
    if with_humans:
        human_states = ['diff', 'same']
        human_cpd_t = pgm_ext.TemplateCPD(
            'human', human_states,
            evidence_ttypes=[match_cpd_t], pmf_func=[[.9, .1], [.1, .9]])
        human_cpds = [human_cpd_t.new_cpd(parents=cpds)
                      for cpds in zip(match_cpds)]
        cpd_list.extend(human_cpds)

    with_rank = False  # Rank depends on dependent scores
    if with_rank:
        rank_states = ['0', '1', '2', '3']
        rank_cpd_t = pgm_ext.TemplateCPD(
            'rank', rank_states,
            evidence_ttypes=[match_cpd_t], pmf_func=None)
        rank_cpds = [rank_cpd_t.new_cpd(parents=cpds)
                     for cpds in zip(match_cpds)]
        cpd_list.extend(rank_cpds)

    # L___ End CPD Definitions ___

    print('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names

    if verbose:
        model.print_templates(ignore_ttypes=[MATCH_TTYPE])
    return model
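
Example #10 differs from Example #6 mainly in using NAME_TTYPE, MATCH_TTYPE, and SCORE_TTYPE instead of string literals. Those constants are not defined on this page; comparing the two examples suggests they are module-level names for the same ttype strings, presumably:

# presumed module-level constants (Example #6 passes the literals directly)
NAME_TTYPE = 'name'
MATCH_TTYPE = 'match'
SCORE_TTYPE = 'score'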