Example #1
    def isect_info(self, other):
        set1 = set(self.rel_fpath_list)
        set2 = set(other.rel_fpath_list)

        set_comparisons = ut.odict([
            ('s1', set1),
            ('s2', set2),
            ('union', set1.union(set2)),
            ('isect', set1.intersection(set2)),
            ('s1 - s2', set1.difference(set2)),
            ('s2 - s1', set2.difference(set1)),
        ])
        stat_stats = ut.map_vals(len, set_comparisons)
        print(ut.repr4(stat_stats))
        return set_comparisons

        # Debug scratch below is unreachable (it sits after the return).
        if False:
            idx_lookup1 = ut.make_index_lookup(self.rel_fpath_list)
            idx_lookup2 = ut.make_index_lookup(other.rel_fpath_list)

            # Compare uuids over the intersection; the union would KeyError
            # on paths that exist on only one side.
            uuids1 = ut.take(self.uuids,
                             ut.take(idx_lookup1, set_comparisons['isect']))
            uuids2 = ut.take(other.uuids,
                             ut.take(idx_lookup2, set_comparisons['isect']))

            assert uuids1 == uuids2, 'common paths should have equal uuids'
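The set bookkeeping here is plain Python; a minimal, self-contained sketch of the same comparison table, with a dict comprehension standing in for `ut.map_vals` and a plain dict for `ut.odict`:

set1 = {'a.png', 'b.png', 'c.png'}
set2 = {'b.png', 'c.png', 'd.png'}
set_comparisons = {
    's1': set1,
    's2': set2,
    'union': set1 | set2,
    'isect': set1 & set2,
    's1 - s2': set1 - set2,
    's2 - s1': set2 - set1,
}
# Equivalent of ut.map_vals(len, set_comparisons).
stat_stats = {key: len(val) for key, val in set_comparisons.items()}
print(stat_stats)
# {'s1': 3, 's2': 3, 'union': 4, 'isect': 2, 's1 - s2': 1, 's2 - s1': 1}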
Example #2
 def get_annot_viewpoint_stats(ibs, aid_list):
     annots = ibs.annots(aid_list)
     viewcode2_nAnnots = ut.order_dict_by(
         ut.map_vals(len, annots.group_items(annots.viewpoint_code)),
         list(ibs.const.VIEW.CODE_TO_INT.keys()) + [None],
     )
     return viewcode2_nAnnots
Example #3
 def get_annot_qual_stats(ibs, aid_list):
     annots = ibs.annots(aid_list)
     qualtext2_nAnnots = ut.order_dict_by(
         ut.map_vals(len, annots.group_items(annots.quality_texts)),
         list(ibs.const.QUALITY_TEXT_TO_INT.keys()),
     )
     return qualtext2_nAnnots
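Examples #2 and #3 share one pattern: histogram the annotations by an attribute, then impose a canonical key order on the result. A self-contained sketch of that pattern in plain Python (the helper name `ordered_histogram` is made up for illustration; the `ut.map_vals` and `ut.order_dict_by` semantics are inferred from their usage above):

def ordered_histogram(values, key_order):
    # Count occurrences (the role of group_items + map_vals(len) above).
    counts = {}
    for value in values:
        counts[value] = counts.get(value, 0) + 1
    # Reorder by the canonical key list; unseen keys are omitted.
    return {key: counts[key] for key in key_order if key in counts}

quality_texts = ['good', 'poor', 'good', 'junk', 'good']
print(ordered_histogram(quality_texts, ['excellent', 'good', 'poor', 'junk']))
# {'good': 3, 'poor': 1, 'junk': 1}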
Example #4
    def oracle_review(sim):
        queue_params = {
            'pos_diameter': None,
            'neg_diameter': None,
        }
        infr = sim.infr
        prev = infr.verbose
        infr.verbose = 0
        # rng = np.random.RandomState(0)
        primary_truth = sim.primary_truth
        review_edges = infr.generate_reviews(**queue_params)
        max_reviews = 1000
        for count, (aid1, aid2) in enumerate(ut.ProgIter(review_edges)):
            state = primary_truth.loc[(aid1, aid2)].idxmax()
            tags = []
            infr.add_feedback(aid1,
                              aid2,
                              state,
                              tags,
                              apply=True,
                              rectify=False,
                              user_id='oracle',
                              confidence='absolutely_sure')
            if count > max_reviews:
                break
        infr.verbose = prev

        sim.results['max_reviews'] = max_reviews

        n_clusters, n_inconsistent = infr.relabel_using_reviews(rectify=False)
        assert n_inconsistent == 0, 'should not create any inconsistencies'

        sim.results['n_user_clusters'] = n_clusters
        # infr.apply_review_inference()

        curr_decisions = infr.edge_attr_df('decision')
        curr_truth = primary_truth.loc[curr_decisions.index].idxmax(axis=1)
        user_mistake_flags = curr_decisions != curr_truth
        sim.results['n_user_mistakes'] = sum(user_mistake_flags)

        gt_clusters = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
        curr_clusters = ut.group_pairs(infr.gen_node_attrs('name_label'))

        compare_results = compare_groups(list(gt_clusters.values()),
                                         list(curr_clusters.values()))
        sim.results.update(ut.map_vals(len, compare_results))

        common_per_num = ut.group_items(compare_results['common'],
                                        map(len, compare_results['common']))
        sumafter = 3
        greater = [i for i in common_per_num.keys() if i > sumafter]
        common_per_num['>%s' % sumafter] = ut.flatten(
            ut.take(common_per_num, greater))
        ut.delete_keys(common_per_num, greater)
        for k, v in common_per_num.items():
            sim.results['common@' + str(k)] = len(v)

        sim.results['n_names_common'] = len(compare_results['common'])
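The `common@k` bookkeeping above buckets the recovered clusters by size and collapses the tail into one `>3` bucket. The same logic in plain Python (toy groups; `ut.group_items`, `ut.take`, and `ut.delete_keys` replaced by builtins):

groups = [[1], [2, 3], [4, 5], [6, 7, 8, 9], [10, 11, 12, 13, 14]]
sumafter = 3

# Bucket groups by their size (the role of ut.group_items above).
per_num = {}
for group in groups:
    per_num.setdefault(len(group), []).append(group)

# Collapse every bucket larger than `sumafter` into one '>3' bucket.
greater = [num for num in per_num if num > sumafter]
per_num['>%s' % sumafter] = [g for num in greater for g in per_num[num]]
for num in greater:
    del per_num[num]

print({key: len(val) for key, val in per_num.items()})
# {1: 1, 2: 2, '>3': 2}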
Example #5
    def check_baseline_results(sim):
        import networkx as nx
        infr = sim.infr
        n_names_possible = 0
        real_groups = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
        possible_clusters = []
        for nid, nodes in real_groups.items():
            if len(nodes) == 1:
                possible_clusters.append(nodes)
                n_names_possible += 1
                continue
            cc_cand_edges = list(ut.nx_edges_between(infr.graph, nodes))
            cc = ut.nx_from_node_edge(nodes, cc_cand_edges)
            mst = nx.minimum_spanning_tree(cc)
            ccs = list(nx.connected_components(mst))
            possible_clusters.extend(ccs)
            n_names_possible += len(ccs)

        sumafter = 3

        best_possible_compare_results = compare_groups(
            list(real_groups.values()), list(possible_clusters))
        possible_per_num = ut.map_vals(
            len,
            ut.group_items(best_possible_compare_results['common'],
                           map(len, best_possible_compare_results['common'])))
        greater = [i for i in possible_per_num.keys() if i > sumafter]
        possible_per_num['>%s' % sumafter] = sum(
            ut.take(possible_per_num, greater))
        ut.delete_keys(possible_per_num, greater)
        for k, v in possible_per_num.items():
            sim.results['possible@' + str(k)] = v
        sim.results['possible'] = len(best_possible_compare_results['common'])

        # Measure the number of real names in the test (per number of annots)
        real_per_num = ut.dict_hist(map(len, real_groups.values()))
        greater = [i for i in real_per_num.keys() if i > sumafter]
        real_per_num['>%s' % sumafter] = sum(ut.take(real_per_num, greater))
        ut.delete_keys(real_per_num, greater)
        for k, v in real_per_num.items():
            sim.results['real@' + str(k)] = v

        sim.results['n_names_possible'] = n_names_possible
        sim.results['n_names_real'] = len(real_groups)
        sim.results['real'] = len(real_groups)
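The MST step yields one spanning forest per ground-truth group, and the name count it produces is just the number of connected components among each group's candidate edges. A runnable networkx illustration of that bound (toy graph, not the project's own data):

import networkx as nx

# One ground-truth group of five annotations whose review graph only
# has candidate edges inside two subsets.
graph = nx.Graph()
graph.add_nodes_from([1, 2, 3, 4, 5])
graph.add_edges_from([(1, 2), (2, 3), (4, 5)])

nodes = {1, 2, 3, 4, 5}
ccs = list(nx.connected_components(graph.subgraph(nodes)))
print(ccs)  # [{1, 2, 3}, {4, 5}] -> at best this group splits into 2 names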
Example #6
    def __init__(verif, infr):
        verif.rng = np.random.RandomState(4033913)
        verif.dummy_params = {
            NEGTV: {
                'mean': 0.2,
                'std': 0.25
            },
            POSTV: {
                'mean': 0.85,
                'std': 0.2
            },
            INCMP: {
                'mean': 0.15,
                'std': 0.1
            },
        }
        # `randn` is a sampler assumed to be defined elsewhere in this module.
        verif.score_dist = randn

        verif.infr = infr
        verif.orig_nodes = set(infr.aids)
        verif.orig_labels = infr.get_node_attrs('orig_name_label')
        verif.orig_groups = ut.invert_dict(verif.orig_labels, False)
        verif.orig_groups = ut.map_vals(set, verif.orig_groups)
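A runnable sketch of how such per-state Gaussians produce dummy verifier scores (the string values of the state constants are placeholders here, and the `np.clip` into [0, 1] is an assumption about what the original sampler does):

import numpy as np

POSTV, NEGTV, INCMP = 'match', 'nomatch', 'notcomp'  # placeholder values
dummy_params = {
    NEGTV: {'mean': 0.2, 'std': 0.25},
    POSTV: {'mean': 0.85, 'std': 0.2},
    INCMP: {'mean': 0.15, 'std': 0.1},
}
rng = np.random.RandomState(4033913)

def dummy_score(state):
    # Sample from the state's Gaussian, clipped into [0, 1].
    params = dummy_params[state]
    return float(np.clip(rng.randn() * params['std'] + params['mean'], 0, 1))

print([round(dummy_score(POSTV), 3) for _ in range(3)])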
Example #7
 def nbytes_info(X):
     size_info = ut.map_vals(ut.get_object_nbytes, X.__dict__)
     return size_info
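Without `utool`, `sys.getsizeof` gives a shallow version of the same per-attribute report (unlike `ut.get_object_nbytes`, it does not recurse into containers):

import sys

class Dummy(object):
    def __init__(self):
        self.name = 'example'
        self.data = list(range(1000))

X = Dummy()
# Shallow byte size of each instance attribute.
size_info = {key: sys.getsizeof(val) for key, val in X.__dict__.items()}
print(size_info)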
Example #8
def clean_tags():
    zotero = get_libzotero()
    # dict of all zotero items
    # items = zotero.index
    # get sql cursor
    cur = zotero.cur
    if False:
        sorted(ut.util_sqlite.get_tablenames(cur))
        ut.print_database_structure(cur)
        # Debug info about tags table in sql

        # The `tags` table stores all tags
        # The itemTags table stores the association between items and tags
        ut.get_table_columninfo_list(cur, 'fields')
        # ut.get_table_columninfo_list(cur, 'relations')
        ut.get_table_columninfo_list(cur, 'fieldsCombined')

        ut.get_table_columninfo_list(cur, 'itemData')
        ut.get_table_columninfo_list(cur, 'itemDataValues')

        ut.get_table_columninfo_list(cur, 'tags')
        ut.get_table_columninfo_list(cur, 'itemTags')

    import pandas as pd
    pd.options.display.max_colwidth = 40
    pd.options.display.max_rows = 20
    def pandas_sql(table, columns):
        return pd.DataFrame(ut.get_table_rows(cur, table, columns),
                            columns=columns)

    item_df = pandas_sql(
        'items', ('itemID', 'itemTypeID', 'libraryID', 'key')
    ).set_index('itemID', drop=False)
    tags_df = pandas_sql(
        'tags', ('tagID', 'name', 'type', 'libraryID', 'key')
    ).set_index('tagID', drop=False)
    itemData_df = pandas_sql('itemData', ('itemID', 'fieldID', 'valueID'))

    itemTag_df = pandas_sql('itemTags', ('itemID', 'tagID'))

    itemDataValues_df = pandas_sql(
        'itemDataValues', ('valueID', 'value')).set_index('valueID')
    field_df = pandas_sql(
        'fields', ('fieldID', 'fieldName', 'fieldFormatID')).set_index('fieldID')

    itemData_df['value'] = itemDataValues_df['value'].loc[
        itemData_df['valueID'].values].values
    itemData_df['fieldName'] = field_df['fieldName'].loc[
        itemData_df['fieldID'].values].values

    titles = itemData_df[itemData_df['fieldName'] == 'title']
    # Sanity check: every item has exactly one title row.
    assert len(ut.unique(ut.map_vals(
        len, titles.groupby('itemID').indices).values())) == 1

    # itemTag_df.groupby('itemID').count()
    # Find how often each tag is used
    tagid_to_count = itemTag_df.groupby('tagID').count()
    tagid_to_count = tagid_to_count.rename(columns={'itemID': 'nItems'})
    tagid_to_count['name'] = tags_df.loc[tagid_to_count.index]['name']
    tagid_to_count = tagid_to_count.sort_values('nItems')

    # Tags attached to only one item (superseded below; recomputed later).
    singleton_tags = tagid_to_count[tagid_to_count['nItems'] == 1]

    tagid_to_count['tag_ncharsize'] = tagid_to_count['name'].apply(len)
    tagid_to_count = tagid_to_count.sort_values('tag_ncharsize')
    # Overly long tag names are the deletion candidates used below.
    bad_tags = tagid_to_count[
        tagid_to_count['tag_ncharsize'] > 25]['name'].values.tolist()

    def clean_tags2():
        api_key = 'fBDBqRPwW9O3mYyNLiksBKZy'
        base_url = 'https://api.zotero.org'
        library_id = '1279414'
        library_type = 'user'
        from pyzotero import zotero
        zot = zotero.Zotero(library_id, library_type, api_key)

        for chunk in ut.ProgChunks(bad_tags, 50):
            zot.delete_tags(*chunk)

    if False:
        import requests
        api_key = 'fBDBqRPwW9O3mYyNLiksBKZy'
        base_url = 'https://api.zotero.org'
        user_id = '1279414'
        userOrGroupPrefix = '/users/' + user_id
        params = {'v': 3, 'key': api_key}

        items_resp = requests.get(
            base_url + userOrGroupPrefix + '/items', params=params)
        print(items_resp.content)
        print(items_resp)

        json_tags = []
        get_url = base_url + userOrGroupPrefix + '/tags'
        while True:
            print('get_url = %r' % (get_url,))
            tag_resp = requests.get(get_url, params=params)
            if tag_resp.status_code != 200:
                break
            json_tags.extend(tag_resp.json())
            if 'next' in tag_resp.links:
                get_url = tag_resp.links['next']['url']
            else:
                break

        version_to_tags = ut.ddict(list)
        bad_tags = []
        for tag in ut.ProgIter(json_tags, label='parsing tags'):
            # x = requests.get(tag['links']['self']['href'], params=params)
            if tag['meta']['numItems'] == 1:
                from urllib.parse import quote
                try:
                    bad_tags.append(quote(tag['tag']))
                except Exception:
                    print('cant encode tag=%r' % (tag,))

        for chunk in ut.ProgIter(ut.ichunks(bad_tags, 50),
                                 length=len(bad_tags) // 50):
            search_url = (base_url + userOrGroupPrefix + '/items?tag=' +
                          ' || '.join(chunk))
            r = requests.get(search_url, params=params)
            matching_items = r.json()
            # assert len(matching_items) == 1
            for item in matching_items:
                # Record which tag chunk matched this item version.
                version_to_tags[item['version']].append(chunk)

        # DELETE MULTIPLE TAGS
        from urllib.parse import quote
        for chunk in ut.ichunks(bad_tags, 50):
            encoded_chunk = []
            for t in chunk:
                try:
                    encoded_chunk.append(quote(t))
                except Exception:
                    print(t)
            suffix = ' || '.join(encoded_chunk)
            delete_url = base_url + userOrGroupPrefix + '/tags?' + suffix
            print('delete_url = %r' % (delete_url,))
            resp = requests.delete(delete_url, params=params)

        bad_tags = tagid_to_count[tagid_to_count['nItems'] == 1]
        bad_tags['tagID'] = bad_tags.index
        # Drop associations for automatic (type=1) tags directly in sql.
        cur.execute('DELETE FROM itemTags WHERE tagID IN '
                    '(SELECT tagID FROM tags WHERE type=1)')
    item_df['title'] = titles.set_index('itemID')['value']
    for idx, item in zotero.index.items():
        sql_title = item_df.loc[item.id]['title']
        if item.title != sql_title:
            if pd.isnull(sql_title) and item.title is not None:
                print(item.__dict__)
                print(item_df.loc[item.id])
                print('item.title = %r' % (item.title,))
                print('sql_title = %r' % (sql_title,))
                assert False

    # Names that map to more than one tagID are duplicates.
    duplicate_tags = [
        (name, idxs)
        for name, idxs in tags_df.groupby('name', sort=True).indices.items()
        if len(idxs) > 1
    ]
    tagname_to_tagid = tags_df.groupby('name', sort=True).first()
    new_to_oldtags = {}
    # Determine which tagi to use for each name
    for tagname, idxs in duplicate_tags:
        tags_subdf = tags_df.iloc[idxs]
        mapping = itemTag_df[itemTag_df['tagID'].isin(tags_subdf['tagID'])]
        tag_hist = mapping.groupby('tagID').count()
        best_tagid = tag_hist['itemID'].idxmax()

        # The remaining (non-best) tagids index tag_hist; its 'itemID'
        # column holds counts, not ids.
        new_to_oldtags[best_tagid] = set(tag_hist.index) - {best_tagid}

        tagname_to_tagid.loc[tagname] = tags_df.loc[best_tagid]
        # for col in tagname_to_tagid.columns:
        #     tagname_to_tagid.loc[tagname][col] = tags_df.loc[best_tagid][col]
        # tags_df.loc[best_tagid]

    if False:
        # Update tagIds
        import sqlite3
        for newid, oldids in new_to_oldtags.items():
            for oldid in oldids:
                # cur.execute('SELECT itemID, tagID FROM itemTags WHERE tagID=?', (oldid,))
                try:
                    cmd = 'UPDATE itemTags SET tagID=? WHERE tagID=?'
                    args = (newid, oldid)
                    print('(%s) args = %r' % (cmd, args,))
                    cur.execute(cmd, args)
                    print(cur.fetchall())
                except sqlite3.IntegrityError:
                    print('error')

    # tags_df.groupby('name', sort=True)

    # itemTag_df.groupby('itemID')
    # duptags = tags_df.iloc[tags_df.groupby('name', sort=True).indices['animals']]
    # duptags['tagID']
    # flags = itemTag_df['tagID'].isin(duptags['tagID'])
    # dup_rel = itemTag_df[flags]
    # item_df['title'].loc[dup_rel['itemID']].values
    # tags_df.iloc[tags_df.groupby('name', sort=True).indices['animals']]

    # tags_df[tags_df['type'] == 1]
    # tags_df[tags_df['type'] == 0]
    # tags_df['libraryID'].unique()
    # tags_df['type'].unique()

    '''
    SELECT itemID FROM itemTags WHERE tagID IN
        (SELECT tagID FROM tags WHERE name = 'animals')
    '''

    item_tag_pairs = ut.get_table_rows(cur, 'itemTags', ('itemID', 'tagID'))
    # Group tags by item
    itemid_to_tagids = ut.group_pairs(item_tag_pairs)
    # Group items by tags
    tagid_to_itemids = ut.group_pairs(map(tuple, map(reversed, item_tag_pairs)))

    # mapping from tagid to name
    tagid_to_name = dict(ut.get_table_rows(cur, 'tags', ('tagID', 'name')))

    tagid_freq = list(ut.sort_dict(
        ut.map_vals(len, tagid_to_itemids), 'vals').items())
    ut.sort_dict(ut.map_vals(sum, ut.group_pairs(
        [(freq, tagid_to_name.get(tagid, tagid))
         for tagid, freq in tagid_freq])), 'vals')
    tagname_freq = ut.map_keys(lambda k: tagid_to_name.get(k, k), tagid_freq)
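The `while True` pagination above follows the `Link` headers by hand; requests parses them into `resp.links`, so the same loop can be written as a small reusable generator. A sketch under that assumption (the commented usage mirrors the user id and params in the example; the endpoint behavior is Zotero's v3 web API):

import requests

def iter_paginated(url, params):
    # Follow RFC 5988 Link headers (requests exposes them as resp.links)
    # until the API stops supplying a 'next' page.
    while url is not None:
        resp = requests.get(url, params=params)
        if resp.status_code != 200:
            break
        yield from resp.json()
        url = resp.links.get('next', {}).get('url')

# Usage sketch: all tags in a user library.
# tags = list(iter_paginated(
#     'https://api.zotero.org/users/1279414/tags', {'v': 3, 'key': api_key}))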
Example #9
import numpy as np
import pandas as pd

# `entrytypes` maps each bibtex entry type to a DataFrame of its entries.

ignore = {
    'conference': ['eventtitle', 'doi', 'urldate', 'location', 'volume'],
    'journal': ['doi', 'urldate', 'issue', 'number', 'volume'],
    'book': ['urldate'],
    'thesis': ['urldate'],
    'online': ['type'],
    'report': ['urldate'],
}
for v in ignore.values():
    v.append('eprinttype')
    v.append('eprint')

print('Entry type freq:')
print(ut.map_vals(len, entrytypes))

for e, g in entrytypes.items():
    print('\n --- TYPE = %r' % (e.upper(), ))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    missing_cols = g.columns[np.any(pd.isnull(g), axis=0)]
    if e in ignore:
        missing_cols = missing_cols.difference(ignore[e])
    print('missing_cols = {!r}'.format(missing_cols.tolist()))
    for col in missing_cols:
        print('col = {!r}'.format(col))
        print(g[pd.isnull(g[col])].index.tolist())

for e, g in entrytypes.items():
    print('e = %r' % (e, ))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
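Both loops reduce to a pair of null-masks over each type's DataFrame: drop the all-null columns, then report the partially-null ones. A self-contained demonstration of those masks on a toy frame:

import numpy as np
import pandas as pd

g = pd.DataFrame({
    'title':   ['A', 'B', 'C'],
    'doi':     [None, '10.1/x', None],
    'urldate': [None, None, None],
})

# Drop columns that are entirely null, then report the partially
# missing ones (the same two masks used in the loops above).
g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
missing_cols = g.columns[np.any(pd.isnull(g), axis=0)]
print(missing_cols.tolist())                        # ['doi']
for col in missing_cols:
    print(col, g[pd.isnull(g[col])].index.tolist())  # doi [0, 2]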