Example #1
0
    def test_group_first(self):
        """group_first() should yield a single (key, items) group for key 1."""
        from pyglottolog.util import group_first

        grouped = group_first([(1, 2), (1, 3)])
        key, items = next(iter(grouped))
        self.assertEqual(key, 1)
        self.assertEqual(len(list(items)), 2)
Example #2
0
    def test_group_first(self):
        """Check that pairs sharing a first element end up in one group."""
        from pyglottolog.util import group_first

        # both input pairs carry key 1, so the first group has key 1
        # and contains both items; only that first group is inspected
        for key, items in group_first([(1, 2), (1, 3)]):
            self.assertEqual(key, 1)
            self.assertEqual(len(list(items)), 2)
            break
Example #3
0
 def show_merges(self):
     """Print entry groups that share a hash across different refids.

     For each such hash group, prints the member rows followed by the
     refid whose merged entry is closest (smallest ``distance``) to the
     merged entry of the whole group.
     """
     with self.connect() as conn:
         # rows whose hash also occurs under at least one other refid
         cursor = conn.execute('SELECT hash, refid, filename, bibkey '
         'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
         'WHERE hash = e.hash AND refid != e.refid) '
         'ORDER BY hash, refid DESC, filename, bibkey')
         for hash, group in group_first(cursor):
             self.print_group(conn, group)
             # merged fields of the entire hash group
             new = self._merged_entry(self._entrygrp(conn, hash), raw=True)
             # candidate refids paired with their own merged fields
             cand = [(ri, self._merged_entry(self._entrygrp(conn, ri), raw=True))
                 for ri in unique(ri for hs, ri, fn, bk in group)]
             # refid whose merged entry is most similar to the group's
             old = min(cand, key=lambda p: distance(new, p[1]))[0]
             print('-> %s\n' % old)
Example #4
0
 def show_splits(self):
     """Print entry groups whose refid is spread over different hashes.

     For each such refid group, prints the member rows followed by the
     hash whose merged entry is closest (smallest ``distance``) to the
     merged entry of the whole refid group.
     """
     with self.connect() as conn:
         # rows whose refid also occurs under at least one other hash
         cursor = conn.execute('SELECT refid, hash, filename, bibkey '
         'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
         'WHERE refid = e.refid AND hash != e.hash) '
         'ORDER BY refid, hash, filename, bibkey')
         for refid, group in group_first(cursor):
             self.print_group(conn, group)
             # merged fields of the entire refid group
             old = self._merged_entry(self._entrygrp(conn, refid), raw=True)
             # candidate hashes paired with their own merged fields
             cand = [(hs, self._merged_entry(self._entrygrp(conn, hs), raw=True))
                 for hs in unique(hs for ri, hs, fn, bk in group)]
             # hash whose merged entry is most similar to the group's
             new = min(cand, key=lambda p: distance(old, p[1]))[0]
             print('-> %s\n' % new)
Example #5
0
 def show_merges(self):
     """Print entry groups that share a hash across different refids.

     For each such hash group, prints the member rows followed by the
     refid whose merged entry is closest (smallest ``distance``) to the
     merged entry of the whole group.
     """
     with self.connect() as conn:
         # rows whose hash also occurs under at least one other refid
         cursor = conn.execute(
             'SELECT hash, refid, filename, bibkey '
             'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
             'WHERE hash = e.hash AND refid != e.refid) '
             'ORDER BY hash, refid DESC, filename, bibkey')
         for hash, group in group_first(cursor):
             self.print_group(conn, group)
             # merged fields of the entire hash group
             new = self._merged_entry(self._entrygrp(conn, hash), raw=True)
             # candidate refids paired with their own merged fields
             cand = [(ri,
                      self._merged_entry(self._entrygrp(conn, ri),
                                         raw=True))
                     for ri in unique(ri for hs, ri, fn, bk in group)]
             # refid whose merged entry is most similar to the group's
             old = min(cand, key=lambda p: distance(new, p[1]))[0]
             print('-> %s\n' % old)
Example #6
0
 def show_splits(self):
     """Print entry groups whose refid is spread over different hashes.

     For each such refid group, prints the member rows followed by the
     hash whose merged entry is closest (smallest ``distance``) to the
     merged entry of the whole refid group.
     """
     with self.connect() as conn:
         # rows whose refid also occurs under at least one other hash
         cursor = conn.execute(
             'SELECT refid, hash, filename, bibkey '
             'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
             'WHERE refid = e.refid AND hash != e.hash) '
             'ORDER BY refid, hash, filename, bibkey')
         for refid, group in group_first(cursor):
             self.print_group(conn, group)
             # merged fields of the entire refid group
             old = self._merged_entry(self._entrygrp(conn, refid), raw=True)
             # candidate hashes paired with their own merged fields
             cand = [(hs,
                      self._merged_entry(self._entrygrp(conn, hs),
                                         raw=True))
                     for hs in unique(hs for ri, hs, fn, bk in group)]
             # hash whose merged entry is most similar to the group's
             new = min(cand, key=lambda p: distance(old, p[1]))[0]
             print('-> %s\n' % new)
Example #7
0
def assign_ids(conn, verbose=False):
    """Assign final ``id`` values to every entry in the database.

    Pipeline: reset ids, resolve splits (one refid spread over several
    hashes), resolve merges (one hash covering several refids), keep
    unchanged entries, then hand out fresh ids to new/separated hash
    groups.  Prints a count after each stage; with ``verbose=True`` also
    prints every affected group.  Requires all entries to be hashed.
    """
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp

    # precondition: every entry must already carry a hash
    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
        'WHERE hash IS NULL)').fetchone()
    assert allhash

    # reset: clear ids, stage refid into srefid as the working copy
    print('%d entries' % conn.execute('UPDATE entry SET id = NULL, srefid = refid').rowcount)

    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute('SELECT refid, hash, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE refid = e.refid AND hash != e.hash) '
        'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        # pick the hash whose merged entry is closest to the old refid group
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
            for hs in unique(hs for ri, hs, fn, bk in group)]
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        # all other hashes lose their claim to this refid
        separated = conn.execute('UPDATE entry SET srefid = NULL WHERE refid = ? AND hash != ?',
            (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d splitted' % nsplit)

    # postcondition: no srefid is shared by two different hashes anymore
    nosplits, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE srefid = e.srefid AND hash != e.hash))').fetchone()
    assert nosplits

    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute('SELECT hash, srefid, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND srefid != e.srefid) '
        'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        # pick the srefid whose merged entry is closest to the hash group
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
            for ri in unique(ri for hs, ri, fn, bk in group)]
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute('UPDATE entry SET id = ? WHERE hash = ? AND srefid != ?',
            (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)

    # unchanged entries
    print('%d unchanged' % conn.execute('UPDATE entry SET id = srefid '
        'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)

    # postcondition: every hash group now maps to a single id
    nomerges, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND id != e.id))').fetchone()
    assert nomerges

    # identified: propagate an existing id to id-less rows of the same hash
    print('%d identified (new/separated)' % conn.execute('UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)').rowcount)

    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute('SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute('SELECT hash FROM entry WHERE id IS NULL GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' % conn.executemany('UPDATE entry SET id = ? WHERE hash = ?',
        ((id, hash) for id, (hash,) in enumerate(cursor, nextid))).rowcount)

    assert allid(conn)
    assert onetoone(conn)

    # supersede relation
    superseded, = conn.execute('SELECT count(*) FROM entry WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
Example #8
0
 def _show(self, sql):
     """Execute *sql* and print each group of result rows, one per block."""
     with self.connect() as conn:
         result = conn.execute(sql)
         for _key, rows in group_first(result):
             self.print_group(conn, rows)
             print()
Example #9
0
def assign_ids(conn, verbose=False):
    """Assign final ``id`` values to every entry in the database.

    Pipeline: reset ids, resolve splits (one refid spread over several
    hashes), resolve merges (one hash covering several refids), keep
    unchanged entries, then hand out fresh ids to new/separated hash
    groups.  Prints a count after each stage; with ``verbose=True`` also
    prints every affected group.  Requires all entries to be hashed.
    """
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp

    # precondition: every entry must already carry a hash
    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
                            'WHERE hash IS NULL)').fetchone()
    assert allhash

    # reset: clear ids, stage refid into srefid as the working copy
    print('%d entries' %
          conn.execute('UPDATE entry SET id = NULL, srefid = refid').rowcount)

    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute(
        'SELECT refid, hash, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE refid = e.refid AND hash != e.hash) '
        'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
                for hs in unique(hs for ri, hs, fn, bk in group)]
        # pick the hash whose merged entry is closest to the old refid group
        # (fixed: ``lambda (hs, fields): ...`` is Python 2 tuple-parameter
        # syntax, removed by PEP 3113 -- a SyntaxError on Python 3)
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        # all other hashes lose their claim to this refid
        separated = conn.execute(
            'UPDATE entry SET srefid = NULL WHERE refid = ? AND hash != ?',
            (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d splitted' % nsplit)

    # postcondition: no srefid is shared by two different hashes anymore
    nosplits, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE srefid = e.srefid AND hash != e.hash))'
    ).fetchone()
    assert nosplits

    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute(
        'SELECT hash, srefid, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND srefid != e.srefid) '
        'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
                for ri in unique(ri for hs, ri, fn, bk in group)]
        # pick the srefid whose merged entry is closest to the hash group
        # (same PEP 3113 fix as above)
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute(
            'UPDATE entry SET id = ? WHERE hash = ? AND srefid != ?',
            (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)

    # unchanged entries
    print('%d unchanged' %
          conn.execute('UPDATE entry SET id = srefid '
                       'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)

    # postcondition: every hash group now maps to a single id
    nomerges, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND id != e.id))'
    ).fetchone()
    assert nomerges

    # identified: propagate an existing id to id-less rows of the same hash
    print('%d identified (new/separated)' % conn.execute(
        'UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)'
    ).rowcount)

    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute(
        'SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute(
        'SELECT hash FROM entry WHERE id IS NULL GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' %
          conn.executemany('UPDATE entry SET id = ? WHERE hash = ?', (
              (id, hash)
              for id, (hash, ) in enumerate(cursor, nextid))).rowcount)

    assert allid(conn)
    assert onetoone(conn)

    # supersede relation
    superseded, = conn.execute(
        'SELECT count(*) FROM entry WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
Example #10
0
 def _show(self, sql):
     """Execute *sql* and print each group of result rows, one per block."""
     with self.connect() as conn:
         cursor = conn.execute(sql)
         # group_first() groups rows by their first column
         for hash, group in group_first(cursor):
             self.print_group(conn, group)
             # blank line between groups
             print()
Example #11
0
def test_group_first():
    """group_first() yields (first_element, items) groups."""
    key, items = next(util.group_first([(1, 2), (1, 3)]))
    # Fixed: ``assert key, list(items) == ...`` only asserted ``key`` and
    # used the comparison as the assert *message*, so the grouped items
    # were never actually checked.  Compare the full tuple instead.
    assert (key, list(items)) == (1, [(1, 2), (1, 3)])